Repository: yjh0410/yolov2-yolov3_PyTorch Branch: master Commit: 49dbf4bbcaba Files: 35 Total size: 223.1 KB Directory structure: gitextract_dmpt53ff/ ├── .gitignore ├── LICENSE ├── README.md ├── backbone/ │ ├── __init__.py │ ├── darknet19.py │ ├── darknet53.py │ ├── darknet_tiny.py │ ├── resnet.py │ └── weights/ │ └── README.md ├── data/ │ ├── __init__.py │ ├── coco2017.py │ ├── config.py │ ├── scripts/ │ │ ├── COCO2017.sh │ │ ├── VOC2007.sh │ │ └── VOC2012.sh │ └── voc0712.py ├── demo.py ├── eval.py ├── models/ │ ├── yolov2_d19.py │ ├── yolov2_r50.py │ ├── yolov3.py │ ├── yolov3_spp.py │ └── yolov3_tiny.py ├── test.py ├── tools.py ├── train.py ├── utils/ │ ├── __init__.py │ ├── augmentations.py │ ├── cocoapi_evaluator.py │ ├── com_paras_flops.py │ ├── distributed_utils.py │ ├── kmeans_anchor.py │ ├── modules.py │ └── vocapi_evaluator.py └── weights/ └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pt *.pth *.txt *.pkl __pycache__ .vscode det_results ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Update Recently, I have released a new YOLO project: https://github.com/yjh0410/PyTorch_YOLO_Tutorial In my new YOLO project, you can enjoy: - a new and stronger YOLOv1 - a new and stronger YOLOv2 - YOLOv3 - YOLOv4 - YOLOv5 - YOLOv7 - YOLOX - RTCDet # This project In this project, you can enjoy: - YOLOv2 with DarkNet-19 - YOLOv2 with ResNet-50 - YOLOv2Slim - YOLOv3 - YOLOv3-Spp - YOLOv3-Tiny I just want to provide a good YOLO project for everyone who is interested in Object Detection. # Weights Google Drive: https://drive.google.com/drive/folders/1T5hHyGICbFSdu6u2_vqvxn_puotvPsbd?usp=sharing BaiDuYunDisk: https://pan.baidu.com/s/1tSylvzOVFReUAvaAxKRSwg Password d266 You can download all my models from the above links. 
# YOLOv2

## YOLOv2 with DarkNet-19

### Tricks

Tricks in the official paper:

- [x] batch norm
- [x] hi-res classifier
- [x] convolutional
- [x] anchor boxes
- [x] new network
- [x] dimension priors
- [x] location prediction
- [x] passthrough
- [x] multi-scale
- [x] hi-res detector

A short sketch of how the location-prediction and dimension-priors tricks decode boxes is given after the VOC2007 table below.

## VOC2007

|            | size | Original (darknet) | Ours (pytorch) 160epochs | Ours (pytorch) 250epochs |
|------------|------|--------------------|--------------------------|--------------------------|
| VOC07 test | 416  | 76.8               | 76.0                     | 77.1                     |
| VOC07 test | 544  | 78.6               | 77.0                     | 78.1                     |
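The "location prediction" trick constrains each predicted box center to its grid cell with a sigmoid, and the "dimension priors" trick scales k-means anchor priors (see `anchor_size_voc` / `anchor_size_coco` in `data/config.py`, given in grid units for YOLOv2). The sketch below shows the standard YOLOv2 decoding; it is illustrative only, not a verbatim excerpt from `models/yolov2_d19.py` (whose `decode_xywh` is truncated in this dump):

```python
import torch

def decode_yolov2_boxes(txtytwth, grid_xy, anchor_wh, stride=32):
    # txtytwth:  [B, HW, A, 4] raw offsets predicted by the head
    # grid_xy:   [1, HW, 1, 2] integer cell coordinates (cf. create_grid)
    # anchor_wh: [1, HW, A, 2] anchor priors in grid units
    # center: the sigmoid keeps each offset inside its own grid cell
    bxy = torch.sigmoid(txtytwth[..., :2]) + grid_xy
    # size: exponential scaling of the k-means anchor priors
    bwh = anchor_wh * torch.exp(txtytwth[..., 2:])
    # convert from grid units to input-image pixels (stride = 32 for YOLOv2)
    return torch.cat([bxy, bwh], dim=-1) * stride
```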
## COCO

|                    | data          | AP   | AP50 | AP75 | AP_S | AP_M | AP_L |
|--------------------|---------------|------|------|------|------|------|------|
| Original (darknet) | COCO test-dev | 21.6 | 44.0 | 19.2 | 5.0  | 22.4 | 35.5 |
| Ours (pytorch)     | COCO test-dev | 26.8 | 46.6 | 26.8 | 5.8  | 27.4 | 45.2 |
| Ours (pytorch)     | COCO eval     | 26.6 | 46.0 | 26.7 | 5.9  | 27.8 | 47.1 |
## YOLOv2 with ResNet-50

I replaced DarkNet-19 with ResNet-50 and got better results on COCO-val:

|                | data      | AP   | AP50 | AP75 | AP_S | AP_M | AP_L |
|----------------|-----------|------|------|------|------|------|------|
| Our YOLOv2-320 | COCO eval | 25.8 | 44.6 | 25.9 | 4.6  | 26.8 | 47.9 |
| Our YOLOv2-416 | COCO eval | 29.0 | 48.8 | 29.7 | 7.4  | 31.9 | 48.3 |
| Our YOLOv2-512 | COCO eval | 30.4 | 51.6 | 30.9 | 10.1 | 34.9 | 46.6 |
| Our YOLOv2-544 | COCO eval | 30.4 | 51.9 | 30.9 | 11.1 | 35.8 | 45.5 |
| Our YOLOv2-608 | COCO eval | 29.2 | 51.6 | 29.1 | 13.6 | 36.8 | 40.5 |
# YOLOv3

## VOC2007

|            | size | Original (darknet) | Ours (pytorch) 250epochs |
|------------|------|--------------------|--------------------------|
| VOC07 test | 416  | 80.25              | 81.4                     |
## COCO

Official YOLOv3:

|            | data          | AP   | AP50 | AP75 | AP_S | AP_M | AP_L |
|------------|---------------|------|------|------|------|------|------|
| YOLOv3-320 | COCO test-dev | 28.2 | 51.5 | -    | -    | -    | -    |
| YOLOv3-416 | COCO test-dev | 31.0 | 55.3 | -    | -    | -    | -    |
| YOLOv3-608 | COCO test-dev | 33.0 | 57.0 | 34.4 | 18.3 | 35.4 | 41.9 |
Our YOLOv3:

|            | data          | AP   | AP50 | AP75 | AP_S | AP_M | AP_L |
|------------|---------------|------|------|------|------|------|------|
| YOLOv3-320 | COCO test-dev | 33.1 | 54.1 | 34.5 | 12.1 | 34.5 | 49.6 |
| YOLOv3-416 | COCO test-dev | 36.0 | 57.4 | 37.0 | 16.3 | 37.5 | 51.1 |
| YOLOv3-608 | COCO test-dev | 37.6 | 59.4 | 39.9 | 20.4 | 39.9 | 48.2 |
# YOLOv3SPP

## COCO

|               | data      | AP    | AP50  | AP75 | AP_S | AP_M | AP_L |
|---------------|-----------|-------|-------|------|------|------|------|
| YOLOv3Spp-320 | COCO eval | 32.78 | 53.79 | 33.9 | 12.4 | 35.5 | 50.6 |
| YOLOv3Spp-416 | COCO eval | 35.66 | 57.09 | 37.4 | 16.8 | 38.1 | 50.7 |
| YOLOv3Spp-608 | COCO eval | 37.52 | 59.44 | 39.3 | 21.5 | 40.6 | 49.6 |
# YOLOv3Tiny

|                       | data          | AP   | AP50 | AP75 | AP_S | AP_M | AP_L |
|-----------------------|---------------|------|------|------|------|------|------|
| (official) YOLOv3Tiny | COCO test-dev | -    | 33.1 | -    | -    | -    | -    |
| (Our) YOLOv3Tiny      | COCO val      | 15.9 | 33.8 | 12.8 | 7.6  | 17.7 | 22.4 |
# Installation
- PyTorch-gpu 1.1.0/1.2.0/1.3.0
- Tensorboard 1.14
- opencv-python, Python 3.6/3.7

# Dataset

## VOC Dataset
I copied the download scripts from the following excellent project:
https://github.com/amdegroot/ssd.pytorch

I have uploaded VOC2007 and VOC2012 to BaiDuYunDisk, so researchers in China can download them from there:

Link: https://pan.baidu.com/s/1tYPGCYGyC0wjpC97H-zzMQ
Password: 4la9

You will get a ```VOCdevkit.zip```; just unzip it and put it into ```data/```. After that, the full paths to the VOC datasets are ```data/VOCdevkit/VOC2007``` and ```data/VOCdevkit/VOC2012```.

### Download VOC2007 trainval & test
```Shell
# specify a directory for the dataset to be downloaded into, else the default is ~/data/
sh data/scripts/VOC2007.sh #
```

### Download VOC2012 trainval
```Shell
# specify a directory for the dataset to be downloaded into, else the default is ~/data/
sh data/scripts/VOC2012.sh #
```

## MSCOCO Dataset
I copied the download scripts from the following excellent project:
https://github.com/DeNA/PyTorch_YOLOv3

### Download MSCOCO 2017 dataset
Just run ```sh data/scripts/COCO2017.sh```. You will get COCO train2017, val2017 and test2017.

# Train

## VOC
```Shell
python train.py -d voc --cuda -v [select a model] -hr -ms --ema
```

You can run ```python train.py -h``` to check all optional arguments.

## COCO
If you have only one GPU:
```Shell
python train.py -d coco --cuda -v [select a model] -hr -ms --ema
```

If you have multiple GPUs (e.g. 8) and put 4 images on each GPU:
```Shell
python -m torch.distributed.launch --nproc_per_node=8 train.py -d coco --cuda -v [select a model] -hr -ms --ema \
        -dist \
        --sybn \
        --num_gpu 8 \
        --batch_size 4
```

# Test

## VOC
```Shell
python test.py -d voc --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
```

## COCO
```Shell
python test.py -d coco-val --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
```

# Evaluation

## VOC
```Shell
python eval.py -d voc --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
```

## COCO
To run on COCO-val:
```Shell
python eval.py -d coco-val --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
```

To run on COCO test-dev (make sure you have downloaded test2017):
```Shell
python eval.py -d coco-test --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
```
You will get a .json file which can be evaluated on the COCO test server.
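For a quick sanity check without the full `demo.py` pipeline, here is a minimal single-image inference sketch distilled from `demo.py` and `eval.py`. The checkpoint path `weights/yolov2_d19.pth` and the image `test.jpg` are placeholders, and YOLOv2 with DarkNet-19 plus COCO anchors is just one possible choice:

```python
import cv2
import numpy as np
import torch

from data import config, BaseTransform
from models.yolov2_d19 import YOLOv2D19

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg = config.yolov2_d19_cfg
input_size = 416

# build the detector (80 COCO classes) and load a trained checkpoint
net = YOLOv2D19(device=device,
                input_size=input_size,
                num_classes=80,
                trainable=False,
                anchor_size=cfg['anchor_size_coco'])
net.load_state_dict(torch.load('weights/yolov2_d19.pth', map_location=device))  # placeholder path
net.to(device).eval()

# preprocess: resize + normalize, then BGR -> RGB and HWC -> CHW
img = cv2.imread('test.jpg')  # placeholder image
img_h, img_w = img.shape[:2]
x, _, _ = BaseTransform(input_size)(img)
x = torch.from_numpy(x[:, :, (2, 1, 0)]).permute(2, 0, 1).unsqueeze(0).to(device)

# in eval mode the forward pass returns boxes normalized to [0, 1],
# together with their scores and class indices
with torch.no_grad():
    bboxes, scores, cls_inds = net(x)
bboxes *= np.array([[img_w, img_h, img_w, img_h]])  # rescale to the original image size
```

This mirrors what `demo.py` does per image; for dataset-wide numbers, use `eval.py` as shown above.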
================================================ FILE: backbone/__init__.py ================================================ from .resnet import build_resnet from .darknet19 import build_darknet19 from .darknet53 import build_darknet53 from .darknet_tiny import build_darknet_tiny def build_backbone(model_name='resnet18', pretrained=False): if 'resnet' in model_name: backbone = build_resnet(model_name, pretrained) elif model_name == 'darknet19': backbone = build_darknet19(pretrained) elif model_name == 'darknet53': backbone = build_darknet53(pretrained) elif model_name == 'darknet_tiny': backbone = build_darknet_tiny(pretrained) return backbone ================================================ FILE: backbone/darknet19.py ================================================ import torch import torch.nn as nn import os model_urls = { "darknet19": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet19.pth", } __all__ = ['darknet19'] class Conv_BN_LeakyReLU(nn.Module): def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1): super(Conv_BN_LeakyReLU, self).__init__() self.convs = nn.Sequential( nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation), nn.BatchNorm2d(out_channels), nn.LeakyReLU(0.1, inplace=True) ) def forward(self, x): return self.convs(x) class DarkNet_19(nn.Module): def __init__(self): super(DarkNet_19, self).__init__() # backbone network : DarkNet-19 # output : stride = 2, c = 32 self.conv_1 = nn.Sequential( Conv_BN_LeakyReLU(3, 32, 3, 1), nn.MaxPool2d((2,2), 2), ) # output : stride = 4, c = 64 self.conv_2 = nn.Sequential( Conv_BN_LeakyReLU(32, 64, 3, 1), nn.MaxPool2d((2,2), 2) ) # output : stride = 8, c = 128 self.conv_3 = nn.Sequential( Conv_BN_LeakyReLU(64, 128, 3, 1), Conv_BN_LeakyReLU(128, 64, 1), Conv_BN_LeakyReLU(64, 128, 3, 1), nn.MaxPool2d((2,2), 2) ) # output : stride = 8, c = 256 self.conv_4 = nn.Sequential( Conv_BN_LeakyReLU(128, 256, 3, 1), Conv_BN_LeakyReLU(256, 128, 1), Conv_BN_LeakyReLU(128, 256, 3, 1), ) # output : stride = 16, c = 512 self.maxpool_4 = nn.MaxPool2d((2, 2), 2) self.conv_5 = nn.Sequential( Conv_BN_LeakyReLU(256, 512, 3, 1), Conv_BN_LeakyReLU(512, 256, 1), Conv_BN_LeakyReLU(256, 512, 3, 1), Conv_BN_LeakyReLU(512, 256, 1), Conv_BN_LeakyReLU(256, 512, 3, 1), ) # output : stride = 32, c = 1024 self.maxpool_5 = nn.MaxPool2d((2, 2), 2) self.conv_6 = nn.Sequential( Conv_BN_LeakyReLU(512, 1024, 3, 1), Conv_BN_LeakyReLU(1024, 512, 1), Conv_BN_LeakyReLU(512, 1024, 3, 1), Conv_BN_LeakyReLU(1024, 512, 1), Conv_BN_LeakyReLU(512, 1024, 3, 1) ) def forward(self, x): c1 = self.conv_1(x) c2 = self.conv_2(c1) c3 = self.conv_3(c2) c3 = self.conv_4(c3) c4 = self.conv_5(self.maxpool_4(c3)) c5 = self.conv_6(self.maxpool_5(c4)) output = { 'layer1': c3, 'layer2': c4, 'layer3': c5 } return output def build_darknet19(pretrained=False): # model model = DarkNet_19() # load weight if pretrained: print('Loading pretrained weight ...') url = model_urls['darknet19'] # checkpoint state dict checkpoint_state_dict = torch.hub.load_state_dict_from_url( url=url, map_location="cpu", check_hash=True) # model state dict model_state_dict = model.state_dict() # check for k in list(checkpoint_state_dict.keys()): if k in model_state_dict: shape_model = tuple(model_state_dict[k].shape) shape_checkpoint = tuple(checkpoint_state_dict[k].shape) if shape_model != shape_checkpoint: checkpoint_state_dict.pop(k) else: checkpoint_state_dict.pop(k) print(k) model.load_state_dict(checkpoint_state_dict)
return model if __name__ == '__main__': import time net = build_darknet19(pretrained=True) x = torch.randn(1, 3, 224, 224) t0 = time.time() output = net(x) t1 = time.time() print('Time: ', t1 - t0) for k in output.keys(): print('{} : {}'.format(k, output[k].shape)) ================================================ FILE: backbone/darknet53.py ================================================ import torch import torch.nn as nn model_urls = { "darknet53": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet53.pth", } __all__ = ['darknet53'] class Conv_BN_LeakyReLU(nn.Module): def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1): super(Conv_BN_LeakyReLU, self).__init__() self.convs = nn.Sequential( nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation), nn.BatchNorm2d(out_channels), nn.LeakyReLU(0.1, inplace=True) ) def forward(self, x): return self.convs(x) class ResBlock(nn.Module): def __init__(self, ch, nblocks=1): super().__init__() self.module_list = nn.ModuleList() for _ in range(nblocks): resblock_one = nn.Sequential( Conv_BN_LeakyReLU(ch, ch//2, 1), Conv_BN_LeakyReLU(ch//2, ch, 3, padding=1) ) self.module_list.append(resblock_one) def forward(self, x): for module in self.module_list: x = module(x) + x return x class DarkNet_53(nn.Module): """ DarkNet-53. """ def __init__(self): super(DarkNet_53, self).__init__() # stride = 2 self.layer_1 = nn.Sequential( Conv_BN_LeakyReLU(3, 32, 3, padding=1), Conv_BN_LeakyReLU(32, 64, 3, padding=1, stride=2), ResBlock(64, nblocks=1) ) # stride = 4 self.layer_2 = nn.Sequential( Conv_BN_LeakyReLU(64, 128, 3, padding=1, stride=2), ResBlock(128, nblocks=2) ) # stride = 8 self.layer_3 = nn.Sequential( Conv_BN_LeakyReLU(128, 256, 3, padding=1, stride=2), ResBlock(256, nblocks=8) ) # stride = 16 self.layer_4 = nn.Sequential( Conv_BN_LeakyReLU(256, 512, 3, padding=1, stride=2), ResBlock(512, nblocks=8) ) # stride = 32 self.layer_5 = nn.Sequential( Conv_BN_LeakyReLU(512, 1024, 3, padding=1, stride=2), ResBlock(1024, nblocks=4) ) def forward(self, x, targets=None): c1 = self.layer_1(x) c2 = self.layer_2(c1) c3 = self.layer_3(c2) c4 = self.layer_4(c3) c5 = self.layer_5(c4) output = { 'layer1': c3, 'layer2': c4, 'layer3': c5 } return output def build_darknet53(pretrained=False): # model model = DarkNet_53() # load weight if pretrained: print('Loading pretrained weight ...') url = model_urls['darknet53'] # checkpoint state dict checkpoint_state_dict = torch.hub.load_state_dict_from_url( url=url, map_location="cpu", check_hash=True) # model state dict model_state_dict = model.state_dict() # check for k in list(checkpoint_state_dict.keys()): if k in model_state_dict: shape_model = tuple(model_state_dict[k].shape) shape_checkpoint = tuple(checkpoint_state_dict[k].shape) if shape_model != shape_checkpoint: checkpoint_state_dict.pop(k) else: checkpoint_state_dict.pop(k) print(k) model.load_state_dict(checkpoint_state_dict) return model if __name__ == '__main__': import time net = build_darknet53(pretrained=True) x = torch.randn(1, 3, 224, 224) t0 = time.time() output = net(x) t1 = time.time() print('Time: ', t1 - t0) for k in output.keys(): print('{} : {}'.format(k, output[k].shape)) ================================================ FILE: backbone/darknet_tiny.py ================================================ import torch import torch.nn as nn model_urls = { "darknet_tiny": 
"https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet_tiny.pth", } __all__ = ['darknet_tiny'] class Conv_BN_LeakyReLU(nn.Module): def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1): super(Conv_BN_LeakyReLU, self).__init__() self.convs = nn.Sequential( nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation), nn.BatchNorm2d(out_channels), nn.LeakyReLU(0.1, inplace=True) ) def forward(self, x): return self.convs(x) class DarkNet_Tiny(nn.Module): def __init__(self): super(DarkNet_Tiny, self).__init__() # backbone network : DarkNet_Tiny self.conv_1 = Conv_BN_LeakyReLU(3, 16, 3, 1) self.maxpool_1 = nn.MaxPool2d((2, 2), 2) # stride = 2 self.conv_2 = Conv_BN_LeakyReLU(16, 32, 3, 1) self.maxpool_2 = nn.MaxPool2d((2, 2), 2) # stride = 4 self.conv_3 = Conv_BN_LeakyReLU(32, 64, 3, 1) self.maxpool_3 = nn.MaxPool2d((2, 2), 2) # stride = 8 self.conv_4 = Conv_BN_LeakyReLU(64, 128, 3, 1) self.maxpool_4 = nn.MaxPool2d((2, 2), 2) # stride = 16 self.conv_5 = Conv_BN_LeakyReLU(128, 256, 3, 1) self.maxpool_5 = nn.MaxPool2d((2, 2), 2) # stride = 32 self.conv_6 = Conv_BN_LeakyReLU(256, 512, 3, 1) self.maxpool_6 = nn.Sequential( nn.ZeroPad2d((0, 1, 0, 1)), nn.MaxPool2d((2, 2), 1) # stride = 32 ) self.conv_7 = Conv_BN_LeakyReLU(512, 1024, 3, 1) def forward(self, x): x = self.conv_1(x) c1 = self.maxpool_1(x) c1 = self.conv_2(c1) c2 = self.maxpool_2(c1) c2 = self.conv_3(c2) c3 = self.maxpool_3(c2) c3 = self.conv_4(c3) c4 = self.maxpool_4(c3) c4 = self.conv_5(c4) # stride = 16 c5 = self.maxpool_5(c4) c5 = self.conv_6(c5) c5 = self.maxpool_6(c5) c5 = self.conv_7(c5) # stride = 32 output = { 'layer1': c3, 'layer2': c4, 'layer3': c5 } return output def build_darknet_tiny(pretrained=False): # model model = DarkNet_Tiny() # load weight if pretrained: print('Loading pretrained weight ...') url = model_urls['darknet_tiny'] # checkpoint state dict checkpoint_state_dict = torch.hub.load_state_dict_from_url( url=url, map_location="cpu", check_hash=True) # model state dict model_state_dict = model.state_dict() # check for k in list(checkpoint_state_dict.keys()): if k in model_state_dict: shape_model = tuple(model_state_dict[k].shape) shape_checkpoint = tuple(checkpoint_state_dict[k].shape) if shape_model != shape_checkpoint: checkpoint_state_dict.pop(k) else: checkpoint_state_dict.pop(k) print(k) model.load_state_dict(checkpoint_state_dict) return model if __name__ == '__main__': import time net = build_darknet_tiny(pretrained=True) x = torch.randn(1, 3, 224, 224) t0 = time.time() output = net(x) t1 = time.time() print('Time: ', t1 - t0) for k in output.keys(): print('{} : {}'.format(k, output[k].shape)) ================================================ FILE: backbone/resnet.py ================================================ import torch import torch.nn as nn import torch.utils.model_zoo as model_zoo __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] model_urls = { 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', } def conv3x3(in_planes, out_planes, stride=1): """3x3 convolution with padding""" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, 
bias=False) def conv1x1(in_planes, out_planes, stride=1): """1x1 convolution""" return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None): super(BasicBlock, self).__init__() self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = nn.BatchNorm2d(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.BatchNorm2d(planes) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None): super(Bottleneck, self).__init__() self.conv1 = conv1x1(inplanes, planes) self.bn1 = nn.BatchNorm2d(planes) self.conv2 = conv3x3(planes, planes, stride) self.bn2 = nn.BatchNorm2d(planes) self.conv3 = conv1x1(planes, planes * self.expansion) self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class ResNet(nn.Module): def __init__(self, block, layers, zero_init_residual=False): super(ResNet, self).__init__() self.inplanes = 64 self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) # Zero-initialize the last BN in each residual branch, # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 if zero_init_residual: for m in self.modules(): if isinstance(m, Bottleneck): nn.init.constant_(m.bn3.weight, 0) elif isinstance(m, BasicBlock): nn.init.constant_(m.bn2.weight, 0) def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( conv1x1(self.inplanes, planes * block.expansion, stride), nn.BatchNorm2d(planes * block.expansion), ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample)) self.inplanes = planes * block.expansion for _ in range(1, blocks): layers.append(block(self.inplanes, planes)) return nn.Sequential(*layers) def forward(self, x): c1 = self.conv1(x) c1 = self.bn1(c1) c1 = self.relu(c1) c1 = self.maxpool(c1) c2 = self.layer1(c1) c3 = self.layer2(c2) c4 = self.layer3(c3) c5 = self.layer4(c4) output = { 'layer1': c3, 'layer2': c4, 'layer3': c5 } return output def resnet18(pretrained=False, **kwargs): """Constructs a ResNet-18 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) if pretrained: # strict = False as we don't need fc layer params. model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False) return model def resnet34(pretrained=False, **kwargs): """Constructs a ResNet-34 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False) return model def resnet50(pretrained=False, **kwargs): """Constructs a ResNet-50 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False) return model def resnet101(pretrained=False, **kwargs): """Constructs a ResNet-101 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False) return model def resnet152(pretrained=False, **kwargs): """Constructs a ResNet-152 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['resnet152']), strict=False) return model def build_resnet(model_name='resnet18', pretrained=False): if model_name == 'resnet18': model = resnet18(pretrained=pretrained) elif model_name == 'resnet34': model = resnet34(pretrained=pretrained) elif model_name == 'resnet50': model = resnet50(pretrained=pretrained) elif model_name == 'resnet101': model = resnet101(pretrained=pretrained) elif model_name == 'resnet152': model = resnet152(pretrained=pretrained) return model if __name__ == "__main__": import time model = build_resnet(model_name='resnet18', pretrained=True) x = torch.randn(1, 3, 224, 224) t0 = time.time() output = model(x) t1 = time.time() print('Time: ', t1 - t0) for k in output.keys(): print('{} : {}'.format(k, output[k].shape)) ================================================ FILE: backbone/weights/README.md ================================================ # darknet19, darknet53, darknet-tiny, darknet-light darknet-tiny is designed by myself.
It is a very simple and lightweight backbone. darknet-light is same to the backbone used in official TinyYOLOv3. For researchers in China, you can download them from BaiduYunDisk: link:https://pan.baidu.com/s/1Rm87Fcj1RXZFmeTUrDWANA password:qgzn Also, you can download them from Google Drive: link: https://drive.google.com/drive/folders/15saMtvYiz3yfFNu5EnC7GSltEAvTImMB?usp=sharing ================================================ FILE: data/__init__.py ================================================ from .voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES from .coco2017 import COCODataset, coco_class_labels, coco_class_index from .config import * import torch import cv2 import numpy as np def detection_collate(batch): """Custom collate fn for dealing with batches of images that have a different number of associated object annotations (bounding boxes). Arguments: batch: (tuple) A tuple of tensor images and lists of annotations Return: A tuple containing: 1) (tensor) batch of images stacked on their 0 dim 2) (list of tensors) annotations for a given image are stacked on 0 dim """ targets = [] imgs = [] for sample in batch: imgs.append(sample[0]) targets.append(torch.FloatTensor(sample[1])) return torch.stack(imgs, 0), targets def base_transform(image, size, mean, std): x = cv2.resize(image, (size, size)).astype(np.float32) x /= 255. x -= mean x /= std return x class BaseTransform: def __init__(self, size, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): self.size = size self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) def __call__(self, image, boxes=None, labels=None): return base_transform(image, self.size, self.mean, self.std), boxes, labels ================================================ FILE: data/coco2017.py ================================================ import os import numpy as np import random import torch from torch.utils.data import Dataset import cv2 from pycocotools.coco import COCO coco_class_labels = ('background', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') coco_class_index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] class COCODataset(Dataset): """ COCO dataset class. """ def __init__(self, data_dir=None, transform=None, json_file='instances_train2017.json', name='train2017'): """ COCO dataset initialization. 
Annotation data are read into memory by COCO API. Args: data_dir (str): dataset root directory json_file (str): COCO json file name name (str): COCO data name (e.g. 'train2017' or 'val2017') img_size (int): target image size after pre-processing min_size (int): bounding boxes smaller than this are ignored debug (bool): if True, only one data id is selected from the dataset """ self.data_dir = data_dir self.json_file = json_file self.coco = COCO(os.path.join(self.data_dir, 'annotations', self.json_file)) self.ids = self.coco.getImgIds() self.class_ids = sorted(self.coco.getCatIds()) self.name = name self.transform = transform def __len__(self): return len(self.ids) def pull_image(self, index): id_ = self.ids[index] img_file = os.path.join(self.data_dir, self.name, '{:012}'.format(id_) + '.jpg') img = cv2.imread(img_file) if self.json_file == 'instances_val5k.json' and img is None: img_file = os.path.join(self.data_dir, 'train2017', '{:012}'.format(id_) + '.jpg') img = cv2.imread(img_file) return img, id_ def pull_anno(self, index): id_ = self.ids[index] anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) annotations = self.coco.loadAnns(anno_ids) target = [] for anno in annotations: if 'bbox' in anno: xmin = np.max((0, anno['bbox'][0])) ymin = np.max((0, anno['bbox'][1])) xmax = xmin + anno['bbox'][2] ymax = ymin + anno['bbox'][3] if anno['area'] > 0 and xmax >= xmin and ymax >= ymin: label_ind = anno['category_id'] cls_id = self.class_ids.index(label_ind) target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] else: print('No bbox !!') return target def __getitem__(self, index): img, gt, h, w = self.pull_item(index) return img, gt def pull_item(self, index): id_ = self.ids[index] anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) annotations = self.coco.loadAnns(anno_ids) # load an image img_file = os.path.join(self.data_dir, self.name, '{:012}'.format(id_) + '.jpg') img = cv2.imread(img_file) if self.json_file == 'instances_val5k.json' and img is None: img_file = os.path.join(self.data_dir, 'train2017', '{:012}'.format(id_) + '.jpg') img = cv2.imread(img_file) assert img is not None height, width, channels = img.shape # load a target target = [] for anno in annotations: if 'bbox' in anno and anno['area'] > 0: xmin = np.max((0, anno['bbox'][0])) ymin = np.max((0, anno['bbox'][1])) xmax = np.min((width - 1, xmin + np.max((0, anno['bbox'][2] - 1)))) ymax = np.min((height - 1, ymin + np.max((0, anno['bbox'][3] - 1)))) if xmax > xmin and ymax > ymin: label_ind = anno['category_id'] cls_id = self.class_ids.index(label_ind) xmin /= width ymin /= height xmax /= width ymax /= height target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] else: print('No bbox !!!') # check target if len(target) == 0: target = np.zeros([1, 5]) else: target = np.array(target) # transform if self.transform is not None: img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) # to rgb img = img[:, :, (2, 1, 0)] # to tensor img = torch.from_numpy(img).permute(2, 0, 1).float() target = np.hstack((boxes, np.expand_dims(labels, axis=1))) return img, target, height, width if __name__ == "__main__": def base_transform(image, size, mean): x = cv2.resize(image, (size, size)).astype(np.float32) x -= mean x = x.astype(np.float32) return x class BaseTransform: def __init__(self, size, mean): self.size = size self.mean = np.array(mean, dtype=np.float32) def __call__(self, image, boxes=None, labels=None): return base_transform(image, 
self.size, self.mean), boxes, labels img_size = 640 dataset = COCODataset( data_dir='/mnt/share/ssd2/dataset/COCO/', transform=BaseTransform(img_size, (0, 0, 0))) for i in range(1000): im, gt, h, w = dataset.pull_item(i) img = im.permute(1,2,0).numpy()[:, :, (2, 1, 0)].astype(np.uint8) img = img.copy() for box in gt: xmin, ymin, xmax, ymax, _ = box xmin *= img_size ymin *= img_size xmax *= img_size ymax *= img_size img = cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0,0,255), 2) cv2.imshow('gt', img) # cv2.imwrite(str(i)+'.jpg', img) cv2.waitKey(0) ================================================ FILE: data/config.py ================================================ # config.py # YOLOv2 with darknet-19 yolov2_d19_cfg = { # network 'backbone': 'd19', # for multi-scale trick 'train_size': 640, 'val_size': 416, 'random_size_range': [10, 19], # anchor size 'anchor_size_voc': [[1.19, 1.98], [2.79, 4.59], [4.53, 8.92], [8.06, 5.29], [10.32, 10.65]], 'anchor_size_coco': [[0.53, 0.79], [1.71, 2.36], [2.89, 6.44], [6.33, 3.79], [9.03, 9.74]], # train 'lr_epoch': (150, 200), 'max_epoch': 250, 'ignore_thresh': 0.5 } # YOLOv2 with resnet-50 yolov2_r50_cfg = { # network 'backbone': 'r50', # for multi-scale trick 'train_size': 640, 'val_size': 416, 'random_size_range': [10, 19], # anchor size 'anchor_size_voc': [[1.19, 1.98], [2.79, 4.59], [4.53, 8.92], [8.06, 5.29], [10.32, 10.65]], 'anchor_size_coco': [[0.53, 0.79], [1.71, 2.36], [2.89, 6.44], [6.33, 3.79], [9.03, 9.74]], # train 'lr_epoch': (150, 200), 'max_epoch': 250, 'ignore_thresh': 0.5 } # YOLOv3 / YOLOv3Spp yolov3_d53_cfg = { # network 'backbone': 'd53', # for multi-scale trick 'train_size': 640, 'val_size': 416, 'random_size_range': [10, 19], # anchor size 'anchor_size_voc': [[32.64, 47.68], [50.24, 108.16], [126.72, 96.32], [78.4, 201.92], [178.24, 178.56], [129.6, 294.72], [331.84, 194.56], [227.84, 325.76], [365.44, 358.72]], 'anchor_size_coco': [[12.48, 19.2], [31.36, 46.4],[46.4, 113.92], [97.28, 55.04], [133.12, 127.36], [79.04, 224.], [301.12, 150.4 ], [172.16, 285.76], [348.16, 341.12]], # train 'lr_epoch': (150, 200), 'max_epoch': 250, 'ignore_thresh': 0.5 } # YOLOv3Tiny yolov3_tiny_cfg = { # network 'backbone': 'd-light', # for multi-scale trick 'train_size': 640, 'val_size': 416, 'random_size_range':[10, 19], # anchor size 'anchor_size_voc': [[34.01, 61.79], [86.94, 109.68], [93.49, 227.46], [246.38, 163.33], [178.68, 306.55], [344.89, 337.14]], 'anchor_size_coco': [[15.09, 23.25], [46.36, 61.47], [68.41, 161.84], [168.88, 93.59], [154.96, 257.45], [334.74, 302.47]], # train 'lr_epoch': (150, 200), 'max_epoch': 250, 'ignore_thresh': 0.5 } ================================================ FILE: data/scripts/COCO2017.sh ================================================ mkdir COCO cd COCO wget http://images.cocodataset.org/zips/train2017.zip wget http://images.cocodataset.org/zips/val2017.zip wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip wget http://images.cocodataset.org/zips/test2017.zip wget http://images.cocodataset.org/annotations/image_info_test2017.zip  unzip train2017.zip unzip val2017.zip unzip annotations_trainval2017.zip unzip test2017.zip unzip image_info_test2017.zip # rm -f train2017.zip # rm -f val2017.zip # rm -f annotations_trainval2017.zip # rm -f test2017.zip # rm -f image_info_test2017.zip ================================================ FILE: data/scripts/VOC2007.sh ================================================ #!/bin/bash # Ellis Brown start=`date +%s` # handle 
optional download dir if [ -z "$1" ] then # navigate to ~/data echo "navigating to ~/data/ ..." mkdir -p ~/data cd ~/data/ else # check if is valid directory if [ ! -d $1 ]; then echo $1 "is not a valid directory" exit 0 fi echo "navigating to" $1 "..." cd $1 fi echo "Downloading VOC2007 trainval ..." # Download the data. curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar echo "Downloading VOC2007 test data ..." curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar echo "Done downloading." # Extract data echo "Extracting trainval ..." tar -xvf VOCtrainval_06-Nov-2007.tar echo "Extracting test ..." tar -xvf VOCtest_06-Nov-2007.tar echo "removing tars ..." rm VOCtrainval_06-Nov-2007.tar rm VOCtest_06-Nov-2007.tar end=`date +%s` runtime=$((end-start)) echo "Completed in" $runtime "seconds" ================================================ FILE: data/scripts/VOC2012.sh ================================================ #!/bin/bash # Ellis Brown start=`date +%s` # handle optional download dir if [ -z "$1" ] then # navigate to ~/data echo "navigating to ~/data/ ..." mkdir -p ~/data cd ~/data/ else # check if is valid directory if [ ! -d $1 ]; then echo $1 "is not a valid directory" exit 0 fi echo "navigating to" $1 "..." cd $1 fi echo "Downloading VOC2012 trainval ..." # Download the data. curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar echo "Done downloading." # Extract data echo "Extracting trainval ..." tar -xvf VOCtrainval_11-May-2012.tar echo "removing tar ..." rm VOCtrainval_11-May-2012.tar end=`date +%s` runtime=$((end-start)) echo "Completed in" $runtime "seconds" ================================================ FILE: data/voc0712.py ================================================ """VOC Dataset Classes Original author: Francisco Massa https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py Updated by: Ellis Brown, Max deGroot """ import os.path as osp import sys import torch import torch.utils.data as data import cv2 import numpy as np import random import xml.etree.ElementTree as ET VOC_CLASSES = ( # always index 0 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor') class VOCAnnotationTransform(object): """Transforms a VOC annotation into a Tensor of bbox coords and label index Initilized with a dictionary lookup of classnames to indexes Arguments: class_to_ind (dict, optional): dictionary lookup of classnames -> indexes (default: alphabetic indexing of VOC's 20 classes) keep_difficult (bool, optional): keep difficult instances or not (default: False) height (int): height width (int): width """ def __init__(self, class_to_ind=None, keep_difficult=False): self.class_to_ind = class_to_ind or dict( zip(VOC_CLASSES, range(len(VOC_CLASSES)))) self.keep_difficult = keep_difficult def __call__(self, target, width, height): """ Arguments: target (annotation) : the target annotation to be made usable will be an ET.Element Returns: a list containing lists of bounding boxes [bbox coords, class name] """ res = [] for obj in target.iter('object'): difficult = int(obj.find('difficult').text) == 1 if not self.keep_difficult and difficult: continue name = obj.find('name').text.lower().strip() bbox = obj.find('bndbox') pts = ['xmin', 'ymin', 'xmax', 'ymax'] bndbox = [] for i, pt in enumerate(pts): cur_pt = int(bbox.find(pt).text) - 1 # scale height 
or width cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height bndbox.append(cur_pt) label_idx = self.class_to_ind[name] bndbox.append(label_idx) res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] # img_id = target.find('filename').text[:-4] return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] class VOCDetection(data.Dataset): """VOC Detection Dataset Object input is image, target is annotation Arguments: root (string): filepath to VOCdevkit folder. image_set (string): imageset to use (eg. 'train', 'val', 'test') transform (callable, optional): transformation to perform on the input image target_transform (callable, optional): transformation to perform on the target `annotation` (eg: take in caption string, return tensor of word indices) dataset_name (string, optional): which dataset to load (default: 'VOC2007') """ def __init__(self, data_dir=None, image_sets=[('2007', 'trainval'), ('2012', 'trainval')], transform=None, target_transform=VOCAnnotationTransform(), dataset_name='VOC0712'): self.root = data_dir self.image_set = image_sets self.transform = transform self.target_transform = target_transform self.name = dataset_name self._annopath = osp.join('%s', 'Annotations', '%s.xml') self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') self.ids = list() for (year, name) in image_sets: rootpath = osp.join(self.root, 'VOC' + year) for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): self.ids.append((rootpath, line.strip())) def __getitem__(self, index): im, gt, h, w = self.pull_item(index) return im, gt def __len__(self): return len(self.ids) def pull_item(self, index): # load an image img_id = self.ids[index] img = cv2.imread(self._imgpath % img_id) height, width, channels = img.shape # load a target target = ET.parse(self._annopath % img_id).getroot() if self.target_transform is not None: target = self.target_transform(target, width, height) # check target if len(target) == 0: target = np.zeros([1, 5]) else: target = np.array(target) # transform if self.transform is not None: img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) # to rgb img = img[:, :, (2, 1, 0)] # to tensor img = torch.from_numpy(img).permute(2, 0, 1).float() # target target = np.hstack((boxes, np.expand_dims(labels, axis=1))) return img, target, height, width def pull_image(self, index): '''Returns the original image object at index in PIL form Note: not using self.__getitem__(), as any transformations passed in could mess up this functionality. Argument: index (int): index of img to show Return: PIL img ''' img_id = self.ids[index] return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id def pull_anno(self, index): '''Returns the original annotation of image at index Note: not using self.__getitem__(), as any transformations passed in could mess up this functionality. Argument: index (int): index of img to get annotation of Return: list: [img_id, [(label, bbox coords),...]] eg: ('001718', [('dog', (96, 13, 438, 332))]) ''' img_id = self.ids[index] anno = ET.parse(self._annopath % img_id).getroot() gt = self.target_transform(anno, 1, 1) return img_id[1], gt def pull_tensor(self, index): '''Returns the original image at an index in tensor form Note: not using self.__getitem__(), as any transformations passed in could mess up this functionality. 
Argument: index (int): index of img to show Return: tensorized version of img, squeezed ''' return torch.Tensor(self.pull_image(index)).unsqueeze_(0) if __name__ == "__main__": def base_transform(image, size, mean): x = cv2.resize(image, (size, size)).astype(np.float32) x -= mean x = x.astype(np.float32) return x class BaseTransform: def __init__(self, size, mean): self.size = size self.mean = np.array(mean, dtype=np.float32) def __call__(self, image, boxes=None, labels=None): return base_transform(image, self.size, self.mean), boxes, labels img_size = 640 # dataset dataset = VOCDetection(data_dir='/mnt/share/ssd2/dataset/VOCdevkit/', image_sets=[('2007', 'trainval')], transform=BaseTransform(img_size, (0, 0, 0))) for i in range(1000): im, gt, h, w = dataset.pull_item(i) img = im.permute(1,2,0).numpy()[:, :, (2, 1, 0)].astype(np.uint8) img = img.copy() for box in gt: xmin, ymin, xmax, ymax, _ = box xmin *= img_size ymin *= img_size xmax *= img_size ymax *= img_size img = cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0,0,255), 2) cv2.imshow('gt', img) cv2.waitKey(0) ================================================ FILE: demo.py ================================================ import argparse import os import numpy as np import cv2 import time import torch from data.coco2017 import coco_class_index, coco_class_labels from data import config, BaseTransform def parse_args(): parser = argparse.ArgumentParser(description='YOLO Demo Detection') # basic parser.add_argument('--mode', default='image', type=str, help='Use the data from image, video or camera') parser.add_argument('-size', '--input_size', default=416, type=int, help='input_size') parser.add_argument('--cuda', action='store_true', default=False, help='Use cuda') parser.add_argument('--path_to_img', default='data/demo/images/', type=str, help='The path to image files') parser.add_argument('--path_to_vid', default='data/demo/videos/', type=str, help='The path to video files') parser.add_argument('--path_to_save', default='det_results/', type=str, help='The path to save the detection results') parser.add_argument('-vs', '--visual_threshold', default=0.3, type=float, help='visual threshold') # model parser.add_argument('-v', '--version', default='yolo_v2', help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') parser.add_argument('--conf_thresh', default=0.1, type=float, help='NMS threshold') parser.add_argument('--nms_thresh', default=0.45, type=float, help='NMS threshold') parser.add_argument('--trained_model', default='weights/', type=str, help='Trained state_dict file path to open') return parser.parse_args() def plot_bbox_labels(img, bbox, label, cls_color, test_scale=0.4): x1, y1, x2, y2 = bbox x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] # plot bbox cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) # plot title bbox cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * test_scale), y1), cls_color, -1) # put the test on the title bbox cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, test_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) return img def visualize(img, bboxes, scores, cls_inds, class_colors, vis_thresh=0.3): ts = 0.4 for i, bbox in enumerate(bboxes): if scores[i] > vis_thresh: cls_color = class_colors[int(cls_inds[i])] cls_id = coco_class_index[int(cls_inds[i])] mess = '%s: %.2f' % (coco_class_labels[cls_id], scores[i]) img = plot_bbox_labels(img, bbox, mess, cls_color, test_scale=ts) return img def 
detect(net, device, transform, vis_thresh, mode='image', path_to_img=None, path_to_vid=None, path_to_save=None): # class color class_colors = [(np.random.randint(255), np.random.randint(255), np.random.randint(255)) for _ in range(80)] save_path = os.path.join(path_to_save, mode) os.makedirs(save_path, exist_ok=True) # ------------------------- Camera ---------------------------- if mode == 'camera': print('use camera !!!') cap = cv2.VideoCapture(0, cv2.CAP_DSHOW) while True: ret, frame = cap.read() if ret: if cv2.waitKey(1) == ord('q'): break img_h, img_w = frame.shape[:2] scale = np.array([[img_w, img_h, img_w, img_h]]) # prepare x = torch.from_numpy(transform(frame)[0][:, :, ::-1]).permute(2, 0, 1) x = x.unsqueeze(0).to(device) # inference t0 = time.time() bboxes, scores, cls_inds = net(x) t1 = time.time() print("detection time used ", t1-t0, "s") # rescale bboxes *= scale frame_processed = visualize(img=frame, bboxes=bboxes, scores=scores, cls_inds=cls_inds, class_colors=class_colors, vis_thresh=vis_thresh) cv2.imshow('detection result', frame_processed) cv2.waitKey(1) else: break cap.release() cv2.destroyAllWindows() # ------------------------- Image ---------------------------- elif mode == 'image': for i, img_id in enumerate(os.listdir(path_to_img)): img = cv2.imread(path_to_img + '/' + img_id, cv2.IMREAD_COLOR) img_h, img_w = img.shape[:2] scale = np.array([[img_w, img_h, img_w, img_h]]) # prepare x = torch.from_numpy(transform(img)[0][:, :, ::-1]).permute(2, 0, 1) x = x.unsqueeze(0).to(device) # inference t0 = time.time() bboxes, scores, cls_inds = net(x) t1 = time.time() print("detection time used ", t1-t0, "s") # rescale bboxes *= scale img_processed = visualize(img=img, bboxes=bboxes, scores=scores, cls_inds=cls_inds, class_colors=class_colors, vis_thresh=vis_thresh) cv2.imshow('detection', img_processed) cv2.imwrite(os.path.join(save_path, str(i).zfill(6)+'.jpg'), img_processed) cv2.waitKey(0) # ------------------------- Video --------------------------- elif mode == 'video': video = cv2.VideoCapture(path_to_vid) fourcc = cv2.VideoWriter_fourcc(*'XVID') save_size = (640, 480) save_path = os.path.join(save_path, 'det.avi') fps = 15.0 out = cv2.VideoWriter(save_path, fourcc, fps, save_size) while(True): ret, frame = video.read() if ret: # ------------------------- Detection --------------------------- img_h, img_w = frame.shape[:2] scale = np.array([[img_w, img_h, img_w, img_h]]) # prepare x = torch.from_numpy(transform(frame)[0][:, :, ::-1]).permute(2, 0, 1) x = x.unsqueeze(0).to(device) # inference t0 = time.time() bboxes, scores, cls_inds = net(x) t1 = time.time() print("detection time used ", t1-t0, "s") # rescale bboxes *= scale frame_processed = visualize(img=frame, bboxes=bboxes, scores=scores, cls_inds=cls_inds, class_colors=class_colors, vis_thresh=vis_thresh) frame_processed_resize = cv2.resize(frame_processed, save_size) out.write(frame_processed_resize) cv2.imshow('detection', frame_processed) cv2.waitKey(1) else: break video.release() out.release() cv2.destroyAllWindows() def run(): args = parse_args() # use cuda if args.cuda: device = torch.device("cuda") else: device = torch.device("cpu") # model model_name = args.version print('Model: ', model_name) # load model and config file if model_name == 'yolov2_d19': from models.yolov2_d19 import YOLOv2D19 as yolo_net cfg = config.yolov2_d19_cfg elif model_name == 'yolov2_r50': from models.yolov2_r50 import YOLOv2R50 as yolo_net cfg = config.yolov2_r50_cfg elif model_name == 'yolov2_slim': from models.yolov2_slim import 
YOLOv2Slim as yolo_net cfg = config.yolov2_slim_cfg elif model_name == 'yolov3': from models.yolov3 import YOLOv3 as yolo_net cfg = config.yolov3_d53_cfg elif model_name == 'yolov3_spp': from models.yolov3_spp import YOLOv3Spp as yolo_net cfg = config.yolov3_d53_cfg elif model_name == 'yolov3_tiny': from models.yolov3_tiny import YOLOv3tiny as yolo_net cfg = config.yolov3_tiny_cfg else: print('Unknown model name...') exit(0) # the models expect an int input size, like eval.py and test.py input_size = args.input_size # build model anchor_size = cfg['anchor_size_coco'] net = yolo_net(device=device, input_size=input_size, num_classes=80, trainable=False, conf_thresh=args.conf_thresh, nms_thresh=args.nms_thresh, anchor_size=anchor_size) # load weight net.load_state_dict(torch.load(args.trained_model, map_location=device)) net.to(device).eval() print('Finished loading model!') # run detect(net=net, device=device, transform=BaseTransform(input_size), mode=args.mode, path_to_img=args.path_to_img, path_to_vid=args.path_to_vid, path_to_save=args.path_to_save, vis_thresh=args.visual_threshold ) if __name__ == '__main__': run() ================================================ FILE: eval.py ================================================ import argparse import os import torch from utils.vocapi_evaluator import VOCAPIEvaluator from utils.cocoapi_evaluator import COCOAPIEvaluator from data import BaseTransform, config parser = argparse.ArgumentParser(description='YOLO Detector Evaluation') parser.add_argument('-v', '--version', default='yolov2_d19', help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') parser.add_argument('--trained_model', type=str, default='weights/', help='Trained state_dict file path to open') parser.add_argument('-size', '--input_size', default=416, type=int, help='input_size') parser.add_argument('--cuda', action='store_true', default=False, help='Use cuda') # dataset parser.add_argument('--root', default='/mnt/share/ssd2/dataset', help='data root') parser.add_argument('-d', '--dataset', default='coco-val', help='voc, coco-val, coco-test.') args = parser.parse_args() def voc_test(model, data_dir, device, input_size): evaluator = VOCAPIEvaluator(data_root=data_dir, img_size=input_size, device=device, transform=BaseTransform(input_size), display=True) # VOC evaluation evaluator.evaluate(model) def coco_test(model, data_dir, device, input_size, test=False): if test: # test-dev print('test on test-dev 2017') evaluator = COCOAPIEvaluator( data_dir=data_dir, img_size=input_size, device=device, testset=True, transform=BaseTransform(input_size) ) else: # eval evaluator = COCOAPIEvaluator( data_dir=data_dir, img_size=input_size, device=device, testset=False, transform=BaseTransform(input_size) ) # COCO evaluation evaluator.evaluate(model) if __name__ == '__main__': # dataset if args.dataset == 'voc': print('eval on voc ...') num_classes = 20 data_dir = os.path.join(args.root, 'VOCdevkit') elif args.dataset == 'coco-val': print('eval on coco-val ...') num_classes = 80 data_dir = os.path.join(args.root, 'COCO') elif args.dataset == 'coco-test': print('eval on coco-test-dev ...') num_classes = 80 data_dir = os.path.join(args.root, 'COCO') else: print('unknown dataset !!
we only support voc, coco-val, coco-test !!!') exit(0) # cuda if args.cuda: print('use cuda') torch.backends.cudnn.benchmark = True device = torch.device("cuda") else: device = torch.device("cpu") # model model_name = args.version print('Model: ', model_name) # load model and config file if model_name == 'yolov2_d19': from models.yolov2_d19 import YOLOv2D19 as yolo_net cfg = config.yolov2_d19_cfg elif model_name == 'yolov2_r50': from models.yolov2_r50 import YOLOv2R50 as yolo_net cfg = config.yolov2_r50_cfg elif model_name == 'yolov2_slim': from models.yolov2_slim import YOLOv2Slim as yolo_net cfg = config.yolov2_slim_cfg elif model_name == 'yolov3': from models.yolov3 import YOLOv3 as yolo_net cfg = config.yolov3_d53_cfg elif model_name == 'yolov3_spp': from models.yolov3_spp import YOLOv3Spp as yolo_net cfg = config.yolov3_d53_cfg elif model_name == 'yolov3_tiny': from models.yolov3_tiny import YOLOv3tiny as yolo_net cfg = config.yolov3_tiny_cfg else: print('Unknown model name...') exit(0) # input size input_size = args.input_size # build model anchor_size = cfg['anchor_size_voc'] if args.dataset == 'voc' else cfg['anchor_size_coco'] net = yolo_net(device=device, input_size=input_size, num_classes=num_classes, trainable=False, anchor_size=anchor_size) # load net (map to the selected device so CPU-only evaluation also works) net.load_state_dict(torch.load(args.trained_model, map_location=device)) net.eval() print('Finished loading model!') net = net.to(device) # evaluation with torch.no_grad(): if args.dataset == 'voc': voc_test(net, data_dir, device, input_size) elif args.dataset == 'coco-val': coco_test(net, data_dir, device, input_size, test=False) elif args.dataset == 'coco-test': coco_test(net, data_dir, device, input_size, test=True) ================================================ FILE: models/yolov2_d19.py ================================================ import numpy as np import torch import torch.nn as nn from utils.modules import Conv, reorg_layer from backbone import build_backbone import tools class YOLOv2D19(nn.Module): def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.5, anchor_size=None): super(YOLOv2D19, self).__init__() self.device = device self.input_size = input_size self.num_classes = num_classes self.trainable = trainable self.conf_thresh = conf_thresh self.nms_thresh = nms_thresh self.anchor_size = torch.tensor(anchor_size) self.num_anchors = len(anchor_size) self.stride = 32 self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) # backbone darknet-19 self.backbone = build_backbone(model_name='darknet19', pretrained=trainable) # detection head self.convsets_1 = nn.Sequential( Conv(1024, 1024, k=3, p=1), Conv(1024, 1024, k=3, p=1) ) self.route_layer = Conv(512, 64, k=1) self.reorg = reorg_layer(stride=2) self.convsets_2 = Conv(1280, 1024, k=3, p=1) # prediction layer self.pred = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) def create_grid(self, input_size): w, h = input_size, input_size # generate grid cells ws, hs = w // self.stride, h // self.stride grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() grid_xy = grid_xy.view(1, hs*ws, 1, 2).to(self.device) # generate anchor_wh tensor anchor_wh = self.anchor_size.repeat(hs*ws, 1, 1).unsqueeze(0).to(self.device) return grid_xy, anchor_wh def set_grid(self, input_size): self.input_size = input_size self.grid_cell, self.all_anchor_wh = self.create_grid(input_size)
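# Box decoding below follows YOLOv2. For a cell offset (grid_x, grid_y) and an anchor
# (p_w, p_h), both expressed in stride units:
#   b_x = (sigmoid(tx) + grid_x) * stride,  b_y = (sigmoid(ty) + grid_y) * stride
#   b_w = p_w * exp(tw) * stride,           b_h = p_h * exp(th) * stride
# e.g. with stride 32, tx = ty = tw = th = 0, grid cell (3, 2) and anchor (1.0, 2.0),
# the decoded box has center (3.5 * 32, 2.5 * 32) = (112, 80) and size (32, 64).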
def decode_xywh(self, txtytwth_pred): """ Input: txtytwth_pred : [B, H*W, anchor_n, 4] Output: xywh_pred : [B, H*W*anchor_n, 4] """ B, HW, ab_n, _ = txtytwth_pred.size() # b_x = sigmoid(tx) + grid_x # b_y = sigmoid(ty) + grid_y xy_pred = torch.sigmoid(txtytwth_pred[..., :2]) + self.grid_cell # b_w = anchor_w * exp(tw) # b_h = anchor_h * exp(th) wh_pred = torch.exp(txtytwth_pred[..., 2:]) * self.all_anchor_wh # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] xywh_pred = torch.cat([xy_pred, wh_pred], -1).view(B, -1, 4) * self.stride return xywh_pred def decode_boxes(self, txtytwth_pred): """ Input: txtytwth_pred : [B, H*W, anchor_n, 4] Output: x1y1x2y2_pred : [B, H*W*anchor_n, 4] """ # txtytwth -> cxcywh xywh_pred = self.decode_xywh(txtytwth_pred) # cxcywh -> x1y1x2y2 x1y1x2y2_pred = torch.zeros_like(xywh_pred) x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) return x1y1x2y2_pred def nms(self, dets, scores): """Pure Python NMS baseline.""" x1 = dets[:, 0] #xmin y1 = dets[:, 1] #ymin x2 = dets[:, 2] #xmax y2 = dets[:, 3] #ymax areas = (x2 - x1) * (y2 - y1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(1e-10, xx2 - xx1) h = np.maximum(1e-10, yy2 - yy1) inter = w * h # IoU = intersection / (area_i + area_j - intersection) ovr = inter / (areas[i] + areas[order[1:]] - inter) # keep the boxes whose overlap with box i is below the NMS threshold inds = np.where(ovr <= self.nms_thresh)[0] order = order[inds + 1] return keep def postprocess(self, bboxes, scores): """ bboxes: (H*W*num_anchors, 4), bsize = 1 scores: (H*W*num_anchors, num_classes), bsize = 1 """ cls_inds = np.argmax(scores, axis=1) scores = scores[(np.arange(scores.shape[0]), cls_inds)] # threshold keep = np.where(scores >= self.conf_thresh) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] # NMS (np.int was removed in recent NumPy, so use the builtin int) keep = np.zeros(len(bboxes), dtype=int) for i in range(self.num_classes): inds = np.where(cls_inds == i)[0] if len(inds) == 0: continue c_bboxes = bboxes[inds] c_scores = scores[inds] c_keep = self.nms(c_bboxes, c_scores) keep[inds[c_keep]] = 1 keep = np.where(keep > 0) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] return bboxes, scores, cls_inds @torch.no_grad() def inference(self, x): # backbone feats = self.backbone(x) # reorg layer p5 = self.convsets_1(feats['layer3']) p4 = self.reorg(self.route_layer(feats['layer2'])) p5 = torch.cat([p4, p5], dim=1) # head p5 = self.convsets_2(p5) # pred pred = self.pred(p5) B, abC, H, W = pred.size() # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) # [B, H*W*num_anchor, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) # [B, H*W, num_anchor, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) # [B, H*W, num_anchor, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() # decode box reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) box_pred = self.decode_boxes(reg_pred) # batch size = 1 conf_pred = conf_pred[0] cls_pred = cls_pred[0] box_pred = box_pred[0] # score scores = torch.sigmoid(conf_pred) *
torch.softmax(cls_pred, dim=-1) # normalize bbox bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) # to cpu scores = scores.to('cpu').numpy() bboxes = bboxes.to('cpu').numpy() # post-process bboxes, scores, cls_inds = self.postprocess(bboxes, scores) return bboxes, scores, cls_inds def forward(self, x, target=None): if not self.trainable: return self.inference(x) else: # backbone feats = self.backbone(x) # reorg layer p5 = self.convsets_1(feats['layer3']) p4 = self.reorg(self.route_layer(feats['layer2'])) p5 = torch.cat([p4, p5], dim=1) # head p5 = self.convsets_2(p5) # pred pred = self.pred(p5) B, abC, H, W = pred.size() # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) # [B, H*W*num_anchor, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) # [B, H*W, num_anchor, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) # [B, H*W, num_anchor, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) # decode bbox x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) reg_pred = reg_pred.view(B, H*W*self.num_anchors, 4) # set conf target iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) gt_conf = iou_pred.clone().detach() # [obj, cls, txtytwth, x1y1x2y2] -> [conf, obj, cls, txtytwth] target = torch.cat([gt_conf, target[:, :, :7]], dim=2) # loss ( conf_loss, cls_loss, bbox_loss, iou_loss ) = tools.loss(pred_conf=conf_pred, pred_cls=cls_pred, pred_txtytwth=reg_pred, pred_iou=iou_pred, label=target ) return conf_loss, cls_loss, bbox_loss, iou_loss ================================================ FILE: models/yolov2_r50.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from utils.modules import Conv, reorg_layer from backbone import build_backbone import numpy as np import tools class YOLOv2R50(nn.Module): def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.6, anchor_size=None, hr=False): super(YOLOv2R50, self).__init__() self.device = device self.input_size = input_size self.num_classes = num_classes self.trainable = trainable self.conf_thresh = conf_thresh self.nms_thresh = nms_thresh self.anchor_size = torch.tensor(anchor_size) self.num_anchors = len(anchor_size) self.stride = 32 self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) # backbone self.backbone = build_backbone(model_name='resnet50', pretrained=trainable) # head self.convsets_1 = nn.Sequential( Conv(2048, 1024, k=1), Conv(1024, 1024, k=3, p=1), Conv(1024, 1024, k=3, p=1) ) # reorg self.route_layer = Conv(1024, 128, k=1) self.reorg = reorg_layer(stride=2) # head self.convsets_2 = Conv(1024+128*4, 1024, k=3, p=1) # pred self.pred = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), 1) if self.trainable: # init bias self.init_bias() def init_bias(self): # init bias init_prob = 0.01 bias_value = -torch.log(torch.tensor((1. 
- init_prob) / init_prob)) nn.init.constant_(self.pred.bias[..., :self.num_anchors], bias_value) def create_grid(self, input_size): w, h = input_size, input_size # generate grid cells ws, hs = w // self.stride, h // self.stride grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() grid_xy = grid_xy.view(1, hs*ws, 1, 2).to(self.device) # generate anchor_wh tensor anchor_wh = self.anchor_size.repeat(hs*ws, 1, 1).unsqueeze(0).to(self.device) return grid_xy, anchor_wh def set_grid(self, input_size): self.input_size = input_size self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) def decode_xywh(self, txtytwth_pred): """ Input: txtytwth_pred : [B, H*W, anchor_n, 4] Output: xywh_pred : [B, H*W*anchor_n, 4] """ B, HW, ab_n, _ = txtytwth_pred.size() # b_x = sigmoid(tx) + grid_x # b_y = sigmoid(ty) + grid_y xy_pred = torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell # b_w = anchor_w * exp(tw) # b_h = anchor_h * exp(th) wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchor_wh # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] xywh_pred = torch.cat([xy_pred, wh_pred], -1).view(B, -1, 4) * self.stride return xywh_pred def decode_boxes(self, txtytwth_pred): """ Input: txtytwth_pred : [B, H*W, anchor_n, 4] Output: x1y1x2y2_pred : [B, H*W*anchor_n, 4] """ # txtytwth -> cxcywh xywh_pred = self.decode_xywh(txtytwth_pred) # cxcywh -> x1y1x2y2 x1y1x2y2_pred = torch.zeros_like(xywh_pred) x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) return x1y1x2y2_pred def nms(self, dets, scores): """Pure Python NMS baseline.""" x1 = dets[:, 0] #xmin y1 = dets[:, 1] #ymin x2 = dets[:, 2] #xmax y2 = dets[:, 3] #ymax areas = (x2 - x1) * (y2 - y1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(1e-10, xx2 - xx1) h = np.maximum(1e-10, yy2 - yy1) inter = w * h # IoU = intersection / (area_i + area_j - intersection) ovr = inter / (areas[i] + areas[order[1:]] - inter) # keep the boxes whose overlap with box i is below the NMS threshold inds = np.where(ovr <= self.nms_thresh)[0] order = order[inds + 1] return keep def postprocess(self, bboxes, scores): """ bboxes: (H*W*num_anchors, 4), bsize = 1 scores: (H*W*num_anchors, num_classes), bsize = 1 """ cls_inds = np.argmax(scores, axis=1) scores = scores[(np.arange(scores.shape[0]), cls_inds)] # threshold keep = np.where(scores >= self.conf_thresh) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] # NMS (np.int was removed in recent NumPy, so use the builtin int) keep = np.zeros(len(bboxes), dtype=int) for i in range(self.num_classes): inds = np.where(cls_inds == i)[0] if len(inds) == 0: continue c_bboxes = bboxes[inds] c_scores = scores[inds] c_keep = self.nms(c_bboxes, c_scores) keep[inds[c_keep]] = 1 keep = np.where(keep > 0) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] return bboxes, scores, cls_inds @torch.no_grad() def inference(self, x): # backbone feats = self.backbone(x) # reorg layer p5 = self.convsets_1(feats['layer3']) p4 = self.reorg(self.route_layer(feats['layer2'])) p5 = torch.cat([p4, p5], dim=1) # head p5 = self.convsets_2(p5) # pred pred = self.pred(p5) B, abC, H, W = pred.size() # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C]
pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) # [B, H*W*num_anchor, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) # [B, H*W, num_anchor, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) # [B, H*W, num_anchor, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() # decode box reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) box_pred = self.decode_boxes(reg_pred) # batch size = 1 conf_pred = conf_pred[0] cls_pred = cls_pred[0] box_pred = box_pred[0] # score scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) # normalize bbox bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) # to cpu scores = scores.to('cpu').numpy() bboxes = bboxes.to('cpu').numpy() # post-process bboxes, scores, cls_inds = self.postprocess(bboxes, scores) return bboxes, scores, cls_inds def forward(self, x, target=None): if not self.trainable: return self.inference(x) else: # backbone feats = self.backbone(x) # reorg layer p5 = self.convsets_1(feats['layer3']) p4 = self.reorg(self.route_layer(feats['layer2'])) p5 = torch.cat([p4, p5], dim=1) # head p5 = self.convsets_2(p5) # pred pred = self.pred(p5) B, abC, H, W = pred.size() # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) # [B, H*W*num_anchor, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) # [B, H*W, num_anchor, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) # [B, H*W, num_anchor, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) # decode bbox x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) reg_pred = reg_pred.view(B, H*W*self.num_anchors, 4) # set conf target iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) gt_conf = iou_pred.clone().detach() # [obj, cls, txtytwth, x1y1x2y2] -> [conf, obj, cls, txtytwth] target = torch.cat([gt_conf, target[:, :, :7]], dim=2) # loss ( conf_loss, cls_loss, bbox_loss, iou_loss ) = tools.loss(pred_conf=conf_pred, pred_cls=cls_pred, pred_txtytwth=reg_pred, pred_iou=iou_pred, label=target ) return conf_loss, cls_loss, bbox_loss, iou_loss ================================================ FILE: models/yolov3.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from utils.modules import Conv from backbone import build_backbone import numpy as np import tools class YOLOv3(nn.Module): def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.50, anchor_size=None): super(YOLOv3, self).__init__() self.device = device self.input_size = input_size self.num_classes = num_classes self.trainable = trainable self.conf_thresh = conf_thresh self.nms_thresh = nms_thresh self.topk = 3000 self.stride = [8, 16, 32] self.anchor_size = torch.tensor(anchor_size).view(3, len(anchor_size) // 3, 2) self.num_anchors = self.anchor_size.size(1) self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) # backbone self.backbone = build_backbone(model_name='darknet53', 
pretrained=trainable) # s = 32 self.conv_set_3 = nn.Sequential( Conv(1024, 512, k=1), Conv(512, 1024, k=3, p=1), Conv(1024, 512, k=1), Conv(512, 1024, k=3, p=1), Conv(1024, 512, k=1) ) self.conv_1x1_3 = Conv(512, 256, k=1) self.extra_conv_3 = Conv(512, 1024, k=3, p=1) self.pred_3 = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) # s = 16 self.conv_set_2 = nn.Sequential( Conv(768, 256, k=1), Conv(256, 512, k=3, p=1), Conv(512, 256, k=1), Conv(256, 512, k=3, p=1), Conv(512, 256, k=1) ) self.conv_1x1_2 = Conv(256, 128, k=1) self.extra_conv_2 = Conv(256, 512, k=3, p=1) self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) # s = 8 self.conv_set_1 = nn.Sequential( Conv(384, 128, k=1), Conv(128, 256, k=3, p=1), Conv(256, 128, k=1), Conv(128, 256, k=3, p=1), Conv(256, 128, k=1) ) self.extra_conv_1 = Conv(128, 256, k=3, p=1) self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) self.init_yolo() def init_yolo(self): # Init head init_prob = 0.01 bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) # init obj&cls pred for pred in [self.pred_1, self.pred_2, self.pred_3]: nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) def create_grid(self, input_size): total_grid_xy = [] total_stride = [] total_anchor_wh = [] w, h = input_size, input_size for ind, s in enumerate(self.stride): # generate grid cells ws, hs = w // s, h // s grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() grid_xy = grid_xy.view(1, hs*ws, 1, 2) # generate stride tensor stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s # generate anchor_wh tensor anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) total_grid_xy.append(grid_xy) total_stride.append(stride_tensor) total_anchor_wh.append(anchor_wh) total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) total_stride = torch.cat(total_stride, dim=1).to(self.device) total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) return total_grid_xy, total_stride, total_anchor_wh def set_grid(self, input_size): self.input_size = input_size self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) def decode_xywh(self, txtytwth_pred): """ Input: txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] Output: xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] """ # b_x = sigmoid(tx) + gride_x, b_y = sigmoid(ty) + gride_y B, HW, ab_n, _ = txtytwth_pred.size() c_xy_pred = (torch.sigmoid(txtytwth_pred[..., :2]) + self.grid_cell) * self.stride_tensor # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) b_wh_pred = torch.exp(txtytwth_pred[..., 2:]) * self.all_anchors_wh # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) return xywh_pred def decode_boxes(self, txtytwth_pred): """ Input: \n txtytwth_pred : [B, H*W, anchor_n, 4] \n Output: \n x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n """ # txtytwth -> cxcywh xywh_pred = self.decode_xywh(txtytwth_pred) # cxcywh -> x1y1x2y2 x1y1x2y2_pred = torch.zeros_like(xywh_pred) x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) return x1y1x2y2_pred def nms(self, dets, scores): """"Pure 
Python NMS baseline.""" x1 = dets[:, 0] #xmin y1 = dets[:, 1] #ymin x2 = dets[:, 2] #xmax y2 = dets[:, 3] #ymax areas = (x2 - x1) * (y2 - y1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(1e-10, xx2 - xx1) h = np.maximum(1e-10, yy2 - yy1) inter = w * h # IoU = intersection / (area_i + area_j - intersection) ovr = inter / (areas[i] + areas[order[1:]] - inter) # keep the boxes whose overlap with box i is below the NMS threshold inds = np.where(ovr <= self.nms_thresh)[0] order = order[inds + 1] return keep def postprocess(self, bboxes, scores): """ bboxes: (H*W*num_anchors, 4), bsize = 1 scores: (H*W*num_anchors, num_classes), bsize = 1 """ cls_inds = np.argmax(scores, axis=1) scores = scores[(np.arange(scores.shape[0]), cls_inds)] # threshold keep = np.where(scores >= self.conf_thresh) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] # NMS (np.int was removed in recent NumPy, so use the builtin int) keep = np.zeros(len(bboxes), dtype=int) for i in range(self.num_classes): inds = np.where(cls_inds == i)[0] if len(inds) == 0: continue c_bboxes = bboxes[inds] c_scores = scores[inds] c_keep = self.nms(c_bboxes, c_scores) keep[inds[c_keep]] = 1 keep = np.where(keep > 0) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] # topk: sort the scores in descending order so the highest-scoring boxes are kept topk_scores_inds = np.argsort(scores)[::-1][:self.topk] topk_scores = scores[topk_scores_inds] topk_bboxes = bboxes[topk_scores_inds] topk_cls_inds = cls_inds[topk_scores_inds] return topk_bboxes, topk_scores, topk_cls_inds @torch.no_grad() def inference(self, x): B = x.size(0) # backbone feats = self.backbone(x) c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] # FPN p5 = self.conv_set_3(c5) p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) p4 = torch.cat([c4, p5_up], 1) p4 = self.conv_set_2(p4) p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) p3 = torch.cat([c3, p4_up], 1) p3 = self.conv_set_1(p3) # head # s = 32 p5 = self.extra_conv_3(p5) pred_3 = self.pred_3(p5) # s = 16 p4 = self.extra_conv_2(p4) pred_2 = self.pred_2(p4) # s = 8 p3 = self.extra_conv_1(p3) pred_1 = self.pred_1(p3) preds = [pred_1, pred_2, pred_3] total_conf_pred = [] total_cls_pred = [] total_reg_pred = [] for pred in preds: C = pred.size(1) # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) # [B, H*W*anchor_n, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) # [B, H*W*anchor_n, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) # [B, H*W*anchor_n, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() total_conf_pred.append(conf_pred) total_cls_pred.append(cls_pred) total_reg_pred.append(reg_pred) conf_pred = torch.cat(total_conf_pred, dim=1) cls_pred = torch.cat(total_cls_pred, dim=1) reg_pred = torch.cat(total_reg_pred, dim=1) # decode bbox reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) box_pred = self.decode_boxes(reg_pred) # batch size = 1 conf_pred = conf_pred[0] cls_pred = cls_pred[0] box_pred = box_pred[0] # score scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) # normalize bbox bboxes = torch.clamp(box_pred /
self.input_size, 0., 1.) # to cpu scores = scores.to('cpu').numpy() bboxes = bboxes.to('cpu').numpy() # post-process bboxes, scores, cls_inds = self.postprocess(bboxes, scores) return bboxes, scores, cls_inds def forward(self, x, target=None): if not self.trainable: return self.inference(x) else: # backbone B = x.size(0) # backbone feats = self.backbone(x) c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] # FPN p5 = self.conv_set_3(c5) p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) p4 = torch.cat([c4, p5_up], 1) p4 = self.conv_set_2(p4) p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) p3 = torch.cat([c3, p4_up], 1) p3 = self.conv_set_1(p3) # head # s = 32 p5 = self.extra_conv_3(p5) pred_3 = self.pred_3(p5) # s = 16 p4 = self.extra_conv_2(p4) pred_2 = self.pred_2(p4) # s = 8 p3 = self.extra_conv_1(p3) pred_1 = self.pred_1(p3) preds = [pred_1, pred_2, pred_3] total_conf_pred = [] total_cls_pred = [] total_reg_pred = [] for pred in preds: C = pred.size(1) # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) # [B, H*W*anchor_n, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) # [B, H*W*anchor_n, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) # [B, H*W*anchor_n, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() total_conf_pred.append(conf_pred) total_cls_pred.append(cls_pred) total_reg_pred.append(reg_pred) conf_pred = torch.cat(total_conf_pred, dim=1) cls_pred = torch.cat(total_cls_pred, dim=1) reg_pred = torch.cat(total_reg_pred, dim=1) # decode bbox reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) reg_pred = reg_pred.view(B, -1, 4) x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) # set conf target iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) gt_conf = iou_pred.clone().detach() # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] target = torch.cat([gt_conf, target[:, :, :7]], dim=2) # loss ( conf_loss, cls_loss, bbox_loss, iou_loss ) = tools.loss(pred_conf=conf_pred, pred_cls=cls_pred, pred_txtytwth=reg_pred, pred_iou=iou_pred, label=target ) return conf_loss, cls_loss, bbox_loss, iou_loss ================================================ FILE: models/yolov3_spp.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np from utils.modules import Conv, SPP from backbone import build_backbone import tools # YOLOv3 SPP class YOLOv3Spp(nn.Module): def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.50, anchor_size=None): super(YOLOv3Spp, self).__init__() self.device = device self.input_size = input_size self.num_classes = num_classes self.trainable = trainable self.conf_thresh = conf_thresh self.nms_thresh = nms_thresh self.stride = [8, 16, 32] self.anchor_size = torch.tensor(anchor_size).view(3, len(anchor_size) // 3, 2) self.num_anchors = self.anchor_size.size(1) self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) # backbone self.backbone = build_backbone(model_name='darknet53', pretrained=trainable) # s = 32 self.conv_set_3 = nn.Sequential( SPP(), 
Conv(1024*4, 512, k=1), Conv(512, 1024, k=3, p=1), Conv(1024, 512, k=1), Conv(512, 1024, k=3, p=1), Conv(1024, 512, k=1) ) self.conv_1x1_3 = Conv(512, 256, k=1) self.extra_conv_3 = Conv(512, 1024, k=3, p=1) self.pred_3 = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) # s = 16 self.conv_set_2 = nn.Sequential( Conv(768, 256, k=1), Conv(256, 512, k=3, p=1), Conv(512, 256, k=1), Conv(256, 512, k=3, p=1), Conv(512, 256, k=1) ) self.conv_1x1_2 = Conv(256, 128, k=1) self.extra_conv_2 = Conv(256, 512, k=3, p=1) self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) # s = 8 self.conv_set_1 = nn.Sequential( Conv(384, 128, k=1), Conv(128, 256, k=3, p=1), Conv(256, 128, k=1), Conv(128, 256, k=3, p=1), Conv(256, 128, k=1) ) self.extra_conv_1 = Conv(128, 256, k=3, p=1) self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) self.init_yolo() def init_yolo(self): # Init head init_prob = 0.01 bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) # init obj&cls pred for pred in [self.pred_1, self.pred_2, self.pred_3]: nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) def create_grid(self, input_size): total_grid_xy = [] total_stride = [] total_anchor_wh = [] w, h = input_size, input_size for ind, s in enumerate(self.stride): # generate grid cells ws, hs = w // s, h // s grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() grid_xy = grid_xy.view(1, hs*ws, 1, 2) # generate stride tensor stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s # generate anchor_wh tensor anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) total_grid_xy.append(grid_xy) total_stride.append(stride_tensor) total_anchor_wh.append(anchor_wh) total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) total_stride = torch.cat(total_stride, dim=1).to(self.device) total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) return total_grid_xy, total_stride, total_anchor_wh def set_grid(self, input_size): self.input_size = input_size self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) def decode_xywh(self, txtytwth_pred): """ Input: txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] Output: xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] """ # b_x = sigmoid(tx) + gride_x, b_y = sigmoid(ty) + gride_y B, HW, ab_n, _ = txtytwth_pred.size() c_xy_pred = (torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell) * self.stride_tensor # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) b_wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchors_wh # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) return xywh_pred def decode_boxes(self, txtytwth_pred): """ Input: \n txtytwth_pred : [B, H*W, anchor_n, 4] \n Output: \n x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n """ # txtytwth -> cxcywh xywh_pred = self.decode_xywh(txtytwth_pred) # cxcywh -> x1y1x2y2 x1y1x2y2_pred = torch.zeros_like(xywh_pred) x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) return x1y1x2y2_pred def nms(self, dets, scores): """"Pure Python NMS baseline.""" x1 = dets[:, 0] #xmin y1 = dets[:, 
1] #ymin x2 = dets[:, 2] #xmax y2 = dets[:, 3] #ymax areas = (x2 - x1) * (y2 - y1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(1e-10, xx2 - xx1) h = np.maximum(1e-10, yy2 - yy1) inter = w * h # IoU = intersection / (area_i + area_j - intersection) ovr = inter / (areas[i] + areas[order[1:]] - inter) # keep the boxes whose overlap with box i is below the NMS threshold inds = np.where(ovr <= self.nms_thresh)[0] order = order[inds + 1] return keep def postprocess(self, bboxes, scores): """ bboxes: (H*W*num_anchors, 4), bsize = 1 scores: (H*W*num_anchors, num_classes), bsize = 1 """ cls_inds = np.argmax(scores, axis=1) scores = scores[(np.arange(scores.shape[0]), cls_inds)] # threshold keep = np.where(scores >= self.conf_thresh) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] # NMS (np.int was removed in recent NumPy, so use the builtin int) keep = np.zeros(len(bboxes), dtype=int) for i in range(self.num_classes): inds = np.where(cls_inds == i)[0] if len(inds) == 0: continue c_bboxes = bboxes[inds] c_scores = scores[inds] c_keep = self.nms(c_bboxes, c_scores) keep[inds[c_keep]] = 1 keep = np.where(keep > 0) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] return bboxes, scores, cls_inds @torch.no_grad() def inference(self, x): B = x.size(0) # backbone feats = self.backbone(x) c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] # FPN p5 = self.conv_set_3(c5) p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) p4 = torch.cat([c4, p5_up], 1) p4 = self.conv_set_2(p4) p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) p3 = torch.cat([c3, p4_up], 1) p3 = self.conv_set_1(p3) # head # s = 32 p5 = self.extra_conv_3(p5) pred_3 = self.pred_3(p5) # s = 16 p4 = self.extra_conv_2(p4) pred_2 = self.pred_2(p4) # s = 8 p3 = self.extra_conv_1(p3) pred_1 = self.pred_1(p3) preds = [pred_1, pred_2, pred_3] total_conf_pred = [] total_cls_pred = [] total_reg_pred = [] for pred in preds: C = pred.size(1) # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) # [B, H*W*anchor_n, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) # [B, H*W*anchor_n, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) # [B, H*W*anchor_n, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() total_conf_pred.append(conf_pred) total_cls_pred.append(cls_pred) total_reg_pred.append(reg_pred) conf_pred = torch.cat(total_conf_pred, dim=1) cls_pred = torch.cat(total_cls_pred, dim=1) reg_pred = torch.cat(total_reg_pred, dim=1) # decode bbox reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) box_pred = self.decode_boxes(reg_pred) # batch size = 1 conf_pred = conf_pred[0] cls_pred = cls_pred[0] box_pred = box_pred[0] # score scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) # normalize bbox bboxes = torch.clamp(box_pred / self.input_size, 0., 1.)
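# Note: the final per-class score fuses objectness and classification:
#   score[cls] = sigmoid(conf_logit) * softmax(cls_logits)[cls]
# so a box survives the confidence threshold in postprocess() only if it both
# contains an object and is classified confidently.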
# to cpu scores = scores.to('cpu').numpy() bboxes = bboxes.to('cpu').numpy() # post-process bboxes, scores, cls_inds = self.postprocess(bboxes, scores) return bboxes, scores, cls_inds def forward(self, x, target=None): if not self.trainable: return self.inference(x) else: # backbone B = x.size(0) # backbone feats = self.backbone(x) c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] # FPN p5 = self.conv_set_3(c5) p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) p4 = torch.cat([c4, p5_up], 1) p4 = self.conv_set_2(p4) p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) p3 = torch.cat([c3, p4_up], 1) p3 = self.conv_set_1(p3) # head # s = 32 p5 = self.extra_conv_3(p5) pred_3 = self.pred_3(p5) # s = 16 p4 = self.extra_conv_2(p4) pred_2 = self.pred_2(p4) # s = 8 p3 = self.extra_conv_1(p3) pred_1 = self.pred_1(p3) preds = [pred_1, pred_2, pred_3] total_conf_pred = [] total_cls_pred = [] total_reg_pred = [] for pred in preds: C = pred.size(1) # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) # [B, H*W*anchor_n, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) # [B, H*W*anchor_n, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) # [B, H*W*anchor_n, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() total_conf_pred.append(conf_pred) total_cls_pred.append(cls_pred) total_reg_pred.append(reg_pred) conf_pred = torch.cat(total_conf_pred, dim=1) cls_pred = torch.cat(total_cls_pred, dim=1) reg_pred = torch.cat(total_reg_pred, dim=1) # decode bbox reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) reg_pred = reg_pred.view(B, -1, 4) x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) # set conf target iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) gt_conf = iou_pred.clone().detach() # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] target = torch.cat([gt_conf, target[:, :, :7]], dim=2) # loss ( conf_loss, cls_loss, bbox_loss, iou_loss ) = tools.loss(pred_conf=conf_pred, pred_cls=cls_pred, pred_txtytwth=reg_pred, pred_iou=iou_pred, label=target ) return conf_loss, cls_loss, bbox_loss, iou_loss ================================================ FILE: models/yolov3_tiny.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np from utils.modules import Conv from backbone import build_backbone import tools # YOLOv3 Tiny class YOLOv3tiny(nn.Module): def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.01, nms_thresh=0.50, anchor_size=None, hr=False): super(YOLOv3tiny, self).__init__() self.device = device self.input_size = input_size self.num_classes = num_classes self.trainable = trainable self.conf_thresh = conf_thresh self.nms_thresh = nms_thresh self.stride = [16, 32] self.anchor_size = torch.tensor(anchor_size).view(2, len(anchor_size) // 2, 2) self.num_anchors = self.anchor_size.size(1) self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) # backbone self.backbone = build_backbone(model_name='darknet_tiny', pretrained=trainable) # s = 32 self.conv_set_2 = Conv(1024, 256, k=3, p=1) 
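# The 1x1 conv below halves the channels (256 -> 128) before the 2x bilinear
# upsampling in the FPN-style fusion, while extra_conv_2 expands the features
# (256 -> 512) for the stride-32 prediction head.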
self.conv_1x1_2 = Conv(256, 128, k=1) self.extra_conv_2 = Conv(256, 512, k=3, p=1) self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) # s = 16 self.conv_set_1 = Conv(384, 256, k=3, p=1) self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) self.init_yolo() def init_yolo(self): # Init head init_prob = 0.01 bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) # init obj&cls pred (the tiny model only has two prediction heads) for pred in [self.pred_1, self.pred_2]: nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) def create_grid(self, input_size): total_grid_xy = [] total_stride = [] total_anchor_wh = [] w, h = input_size, input_size for ind, s in enumerate(self.stride): # generate grid cells ws, hs = w // s, h // s grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() grid_xy = grid_xy.view(1, hs*ws, 1, 2) # generate stride tensor stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s # generate anchor_wh tensor anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) total_grid_xy.append(grid_xy) total_stride.append(stride_tensor) total_anchor_wh.append(anchor_wh) total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) total_stride = torch.cat(total_stride, dim=1).to(self.device) total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) return total_grid_xy, total_stride, total_anchor_wh def set_grid(self, input_size): self.input_size = input_size self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) def decode_xywh(self, txtytwth_pred): """ Input: txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] Output: xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] """ # b_x = sigmoid(tx) + grid_x, b_y = sigmoid(ty) + grid_y B, HW, ab_n, _ = txtytwth_pred.size() c_xy_pred = (torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell) * self.stride_tensor # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) b_wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchors_wh # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) return xywh_pred def decode_boxes(self, txtytwth_pred): """ Input: txtytwth_pred : [B, H*W, anchor_n, 4] Output: x1y1x2y2_pred : [B, H*W*anchor_n, 4] """ # txtytwth -> cxcywh xywh_pred = self.decode_xywh(txtytwth_pred) # cxcywh -> x1y1x2y2 x1y1x2y2_pred = torch.zeros_like(xywh_pred) x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) return x1y1x2y2_pred def nms(self, dets, scores): """Pure Python NMS baseline.""" x1 = dets[:, 0] #xmin y1 = dets[:, 1] #ymin x2 = dets[:, 2] #xmax y2 = dets[:, 3] #ymax areas = (x2 - x1) * (y2 - y1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(1e-10, xx2 - xx1) h = np.maximum(1e-10, yy2 - yy1) inter = w * h # IoU = intersection / (area_i + area_j - intersection) ovr = inter / (areas[i] + areas[order[1:]] - inter) # keep the boxes whose overlap with box i is below the NMS threshold inds = np.where(ovr <= self.nms_thresh)[0] order = order[inds + 1] return keep
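# nms() above implements the classic greedy scheme: repeatedly keep the
# highest-scoring box, then discard every remaining box whose IoU with it
# exceeds nms_thresh; postprocess() below applies it independently per class.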
def postprocess(self, bboxes, scores): """ bboxes: (H*W*num_anchors, 4), bsize = 1 scores: (H*W*num_anchors, num_classes), bsize = 1 """ cls_inds = np.argmax(scores, axis=1) scores = scores[(np.arange(scores.shape[0]), cls_inds)] # threshold keep = np.where(scores >= self.conf_thresh) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] # NMS (np.int was removed in recent NumPy, so use the builtin int) keep = np.zeros(len(bboxes), dtype=int) for i in range(self.num_classes): inds = np.where(cls_inds == i)[0] if len(inds) == 0: continue c_bboxes = bboxes[inds] c_scores = scores[inds] c_keep = self.nms(c_bboxes, c_scores) keep[inds[c_keep]] = 1 keep = np.where(keep > 0) bboxes = bboxes[keep] scores = scores[keep] cls_inds = cls_inds[keep] return bboxes, scores, cls_inds @torch.no_grad() def inference(self, x): B = x.size(0) # backbone feats = self.backbone(x) c4, c5 = feats['layer2'], feats['layer3'] # FPN p5 = self.conv_set_2(c5) p5_up = F.interpolate(self.conv_1x1_2(p5), scale_factor=2.0, mode='bilinear', align_corners=True) p4 = torch.cat([c4, p5_up], dim=1) p4 = self.conv_set_1(p4) # head # s = 32 p5 = self.extra_conv_2(p5) pred_2 = self.pred_2(p5) # s = 16 pred_1 = self.pred_1(p4) preds = [pred_1, pred_2] total_conf_pred = [] total_cls_pred = [] total_reg_pred = [] for pred in preds: C = pred.size(1) # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) # Divide the prediction into conf_pred, cls_pred and reg_pred # [B, H*W*anchor_n, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) # [B, H*W*anchor_n, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) # [B, H*W*anchor_n, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() total_conf_pred.append(conf_pred) total_cls_pred.append(cls_pred) total_reg_pred.append(reg_pred) conf_pred = torch.cat(total_conf_pred, dim=1) cls_pred = torch.cat(total_cls_pred, dim=1) reg_pred = torch.cat(total_reg_pred, dim=1) # decode bbox reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) box_pred = self.decode_boxes(reg_pred) # batch size = 1 conf_pred = conf_pred[0] cls_pred = cls_pred[0] box_pred = box_pred[0] # score scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) # normalize bbox bboxes = torch.clamp(box_pred / self.input_size, 0., 1.)
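# The boxes are normalized to [0, 1] by the network input size here; callers
# such as demo.py and test.py rescale them by the original image size
# ([img_w, img_h, img_w, img_h]) before drawing.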
# to cpu scores = scores.to('cpu').numpy() bboxes = bboxes.to('cpu').numpy() # post-process bboxes, scores, cls_inds = self.postprocess(bboxes, scores) return bboxes, scores, cls_inds def forward(self, x, target=None): if not self.trainable: return self.inference(x) else: # backbone B = x.size(0) feats = self.backbone(x) c4, c5 = feats['layer2'], feats['layer3'] # FPN p5 = self.conv_set_2(c5) p5_up = F.interpolate(self.conv_1x1_2(p5), scale_factor=2.0, mode='bilinear', align_corners=True) p4 = torch.cat([c4, p5_up], dim=1) p4 = self.conv_set_1(p4) # head # s = 32 p5 = self.extra_conv_2(p5) pred_2 = self.pred_2(p5) # s = 16 pred_1 = self.pred_1(p4) preds = [pred_1, pred_2] total_conf_pred = [] total_cls_pred = [] total_reg_pred = [] for pred in preds: C = pred.size(1) # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) # Divide the prediction into conf_pred, cls_pred and reg_pred # [B, H*W*anchor_n, 1] conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) # [B, H*W*anchor_n, num_cls] cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) # [B, H*W*anchor_n, 4] reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() total_conf_pred.append(conf_pred) total_cls_pred.append(cls_pred) total_reg_pred.append(reg_pred) conf_pred = torch.cat(total_conf_pred, dim=1) cls_pred = torch.cat(total_cls_pred, dim=1) reg_pred = torch.cat(total_reg_pred, dim=1) # decode bbox reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) reg_pred = reg_pred.view(B, -1, 4) x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) # set conf target iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) gt_conf = iou_pred.clone().detach() # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] target = torch.cat([gt_conf, target[:, :, :7]], dim=2) # loss ( conf_loss, cls_loss, bbox_loss, iou_loss ) = tools.loss(pred_conf=conf_pred, pred_cls=cls_pred, pred_txtytwth=reg_pred, pred_iou=iou_pred, label=target ) return conf_loss, cls_loss, bbox_loss, iou_loss ================================================ FILE: test.py ================================================ import os import argparse import torch import torch.backends.cudnn as cudnn from data.voc0712 import VOC_CLASSES, VOCDetection from data.coco2017 import COCODataset, coco_class_index, coco_class_labels from data import config, BaseTransform import numpy as np import cv2 import time parser = argparse.ArgumentParser(description='YOLO Detection') # basic parser.add_argument('-size', '--input_size', default=416, type=int, help='input_size') parser.add_argument('--cuda', action='store_true', default=False, help='use cuda.') # model parser.add_argument('-v', '--version', default='yolov2_d19', help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') parser.add_argument('--trained_model', default='weights/', type=str, help='Trained state_dict file path to open') parser.add_argument('--conf_thresh', default=0.1, type=float, help='Confidence threshold') parser.add_argument('--nms_thresh', default=0.50, type=float, help='NMS threshold') # dataset parser.add_argument('-root', '--data_root', default='/mnt/share/ssd2/dataset', help='dataset root') parser.add_argument('-d', '--dataset', default='voc', help='voc or coco') # visualize
parser.add_argument('-vs', '--visual_threshold', default=0.25, type=float, help='Final confidence threshold') parser.add_argument('--show', action='store_true', default=False, help='show the visualization results.') args = parser.parse_args() def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4): x1, y1, x2, y2 = bbox x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] # plot bbox cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) if label is not None: # plot title bbox cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1) # put the text on the title bbox cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) return img def visualize(img, bboxes, scores, cls_inds, vis_thresh, class_colors, class_names, class_indexs=None, dataset_name='voc'): ts = 0.4 for i, bbox in enumerate(bboxes): if scores[i] > vis_thresh: cls_id = int(cls_inds[i]) if dataset_name == 'coco': cls_color = class_colors[cls_id] cls_id = class_indexs[cls_id] else: cls_color = class_colors[cls_id] if len(class_names) > 1: mess = '%s: %.2f' % (class_names[cls_id], scores[i]) else: cls_color = [255, 0, 0] mess = None img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts) return img def test(net, device, dataset, transform, vis_thresh, class_colors=None, class_names=None, class_indexs=None, dataset_name='voc'): num_images = len(dataset) save_path = os.path.join('det_results/', args.dataset, args.version) os.makedirs(save_path, exist_ok=True) for index in range(num_images): print('Testing image {:d}/{:d}....'.format(index+1, num_images)) image, _ = dataset.pull_image(index) h, w, _ = image.shape scale = np.array([[w, h, w, h]]) # to tensor x = torch.from_numpy(transform(image)[0][:, :, (2, 1, 0)]).permute(2, 0, 1) x = x.unsqueeze(0).to(device) t0 = time.time() # forward bboxes, scores, cls_inds = net(x) print("detection time used ", time.time() - t0, "s") # rescale bboxes *= scale # vis detection img_processed = visualize( img=image, bboxes=bboxes, scores=scores, cls_inds=cls_inds, vis_thresh=vis_thresh, class_colors=class_colors, class_names=class_names, class_indexs=class_indexs, dataset_name=dataset_name ) if args.show: cv2.imshow('detection', img_processed) cv2.waitKey(0) # save result cv2.imwrite(os.path.join(save_path, str(index).zfill(6) +'.jpg'), img_processed) if __name__ == '__main__': # cuda if args.cuda: print('use cuda') cudnn.benchmark = True device = torch.device("cuda") else: device = torch.device("cpu") # input size input_size = args.input_size # dataset if args.dataset == 'voc': print('test on voc ...') data_dir = os.path.join(args.data_root, 'VOCdevkit') class_names = VOC_CLASSES class_indexs = None num_classes = 20 dataset = VOCDetection(root=data_dir, image_sets=[('2007', 'test')]) elif args.dataset == 'coco': print('test on coco-val ...') data_dir = os.path.join(args.data_root, 'COCO') class_names = coco_class_labels class_indexs = coco_class_index num_classes = 80 dataset = COCODataset( data_dir=data_dir, json_file='instances_val2017.json', name='val2017') class_colors = [(np.random.randint(255), np.random.randint(255), np.random.randint(255)) for _ in range(num_classes)] # model model_name = args.version print('Model: ', model_name) # load model and config file if model_name == 'yolov2_d19': from models.yolov2_d19 import YOLOv2D19 as yolo_net cfg = config.yolov2_d19_cfg elif model_name == 'yolov2_r50': from models.yolov2_r50
import YOLOv2R50 as yolo_net cfg = config.yolov2_r50_cfg elif model_name == 'yolov3': from models.yolov3 import YOLOv3 as yolo_net cfg = config.yolov3_d53_cfg elif model_name == 'yolov3_spp': from models.yolov3_spp import YOLOv3Spp as yolo_net cfg = config.yolov3_d53_cfg elif model_name == 'yolov3_tiny': from models.yolov3_tiny import YOLOv3tiny as yolo_net cfg = config.yolov3_tiny_cfg else: print('Unknown model name...') exit(0) # build model anchor_size = cfg['anchor_size_voc'] if args.dataset == 'voc' else cfg['anchor_size_coco'] net = yolo_net(device=device, input_size=input_size, num_classes=num_classes, trainable=False, conf_thresh=args.conf_thresh, nms_thresh=args.nms_thresh, anchor_size=anchor_size) # load weight net.load_state_dict(torch.load(args.trained_model, map_location=device)) net.to(device).eval() print('Finished loading model!') # evaluation test(net=net, device=device, dataset=dataset, transform=BaseTransform(input_size), vis_thresh=args.visual_threshold, class_colors=class_colors, class_names=class_names, class_indexs=class_indexs, dataset_name=args.dataset ) ================================================ FILE: tools.py ================================================ import numpy as np from data import * import torch.nn as nn import torch.nn.functional as F # We use the ignore thresh to decide which anchor boxes will be kept. ignore_thresh = 0.5 class MSEWithLogitsLoss(nn.Module): def __init__(self, reduction='mean'): super(MSEWithLogitsLoss, self).__init__() self.reduction = reduction def forward(self, logits, targets, mask): inputs = torch.sigmoid(logits) # We ignore those whose targets == -1.0. pos_id = (mask==1.0).float() neg_id = (mask==0.0).float() pos_loss = pos_id * (inputs - targets)**2 neg_loss = neg_id * (inputs)**2 loss = 5.0*pos_loss + 1.0*neg_loss if self.reduction == 'mean': batch_size = logits.size(0) loss = torch.sum(loss) / batch_size return loss else: return loss
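# A minimal usage sketch for MSEWithLogitsLoss (the tensor names here are illustrative):
#   criterion = MSEWithLogitsLoss(reduction='mean')
#   conf_logits = torch.randn(B, N, 1)   # raw objectness logits
#   gt_conf     = torch.rand(B, N, 1)    # IoU-based targets in [0, 1]
#   mask        = torch.ones(B, N, 1)    # 1 = positive, 0 = negative, -1 = ignored
#   loss = criterion(conf_logits, gt_conf, mask)
# Positives are pulled toward their IoU target (weight 5.0), negatives toward 0
# (weight 1.0), and mask == -1 entries contribute nothing to the loss.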
gt_box_expand = np.repeat(gt_box, len(anchor_boxes), axis=0) gb_x1y1_x2y2 = np.zeros([len(anchor_boxes), 4]) gb_x1y1_x2y2[:, 0] = gt_box_expand[:, 0] - gt_box_expand[:, 2] / 2 # xmin gb_x1y1_x2y2[:, 1] = gt_box_expand[:, 1] - gt_box_expand[:, 3] / 2 # ymin gb_x1y1_x2y2[:, 2] = gt_box_expand[:, 0] + gt_box_expand[:, 2] / 2 # xmax gb_x1y1_x2y2[:, 3] = gt_box_expand[:, 1] + gt_box_expand[:, 3] / 2 # ymax w_gt, h_gt = gt_box_expand[:, 2], gt_box_expand[:, 3] # Then we compute IoU between anchor_box and gt_box S_gt = w_gt * h_gt S_ab = w_ab * h_ab I_w = np.minimum(gb_x1y1_x2y2[:, 2], ab_x1y1_x2y2[:, 2]) - np.maximum(gb_x1y1_x2y2[:, 0], ab_x1y1_x2y2[:, 0]) I_h = np.minimum(gb_x1y1_x2y2[:, 3], ab_x1y1_x2y2[:, 3]) - np.maximum(gb_x1y1_x2y2[:, 1], ab_x1y1_x2y2[:, 1]) S_I = I_h * I_w U = S_gt + S_ab - S_I + 1e-20 IoU = S_I / U return IoU def set_anchors(anchor_size): """ Input: anchor_size : list -> [[h_1, w_1], [h_2, w_2], ..., [h_n, w_n]]. Output: anchor_boxes : ndarray -> [[0, 0, anchor_w, anchor_h], [0, 0, anchor_w, anchor_h], ... [0, 0, anchor_w, anchor_h]]. """ anchor_number = len(anchor_size) anchor_boxes = np.zeros([anchor_number, 4]) for index, size in enumerate(anchor_size): anchor_w, anchor_h = size anchor_boxes[index] = np.array([0, 0, anchor_w, anchor_h]) return anchor_boxes def generate_txtytwth(gt_label, w, h, s, all_anchor_size): xmin, ymin, xmax, ymax = gt_label[:-1] # compute the center, width and height c_x = (xmax + xmin) / 2 * w c_y = (ymax + ymin) / 2 * h box_w = (xmax - xmin) * w box_h = (ymax - ymin) * h if box_w < 1. or box_h < 1.: # print('A dirty data !!!') return False # map the center, width and height to the feature map size c_x_s = c_x / s c_y_s = c_y / s box_ws = box_w / s box_hs = box_h / s # the grid cell location grid_x = int(c_x_s) grid_y = int(c_y_s) # generate anchor boxes anchor_boxes = set_anchors(all_anchor_size) gt_box = np.array([[0, 0, box_ws, box_hs]]) # compute the IoU iou = compute_iou(anchor_boxes, gt_box) # We only keep anchor boxes whose IoU with the gt box is above the ignore threshold. iou_mask = (iou > ignore_thresh) result = [] if iou_mask.sum() == 0: # No anchor box clears the threshold, so we assign the one with the highest IoU score. index = np.argmax(iou) p_w, p_h = all_anchor_size[index] tx = c_x_s - grid_x ty = c_y_s - grid_y tw = np.log(box_ws / p_w) th = np.log(box_hs / p_h) weight = 2.0 - (box_w / w) * (box_h / h) result.append([index, grid_x, grid_y, tx, ty, tw, th, weight, xmin, ymin, xmax, ymax]) return result else: # More than one anchor box has an IoU above the ignore threshold. We assign only the anchor box # with the best IoU (its objectness target is 1) and ignore the others (their objectness target is # set to -1, which excludes them from the objectness loss). # iou_ = iou * iou_mask # We get the index of the best IoU best_index = np.argmax(iou) for index, iou_m in enumerate(iou_mask): if iou_m: if index == best_index: p_w, p_h = all_anchor_size[index] tx = c_x_s - grid_x ty = c_y_s - grid_y tw = np.log(box_ws / p_w) th = np.log(box_hs / p_h) weight = 2.0 - (box_w / w) * (box_h / h) result.append([index, grid_x, grid_y, tx, ty, tw, th, weight, xmin, ymin, xmax, ymax]) else: # we ignore the other anchor boxes even though their IoU scores are above the ignore threshold result.append([index, grid_x, grid_y, 0., 0., 0., 0., -1.0, 0., 0., 0., 0.]) return result def gt_creator(input_size, stride, label_lists, anchor_size): """ Input: input_size : int -> the size of the image in the training stage.
stride : int -> the downsample ratio of the CNN, e.g. 32. label_list : list -> [[[xmin, ymin, xmax, ymax, cls_ind], ... ], [[xmin, ymin, xmax, ymax, cls_ind], ... ]], and len(label_list) = batch_size; len(label_list[i]) = the number of class instances in an image; (xmin, ymin, xmax, ymax) : the coords of a bbox whose values are between 0 and 1; cls_ind : the corresponding class label. Output: gt_tensor : ndarray -> shape = [batch_size, hs * ws * anchor_number, 1+1+4+1+4] """ # prepare all the empty gt data batch_size = len(label_lists) h = w = input_size # the size of the output feature map ws = w // stride hs = h // stride s = stride # We use anchor boxes to build the training target. all_anchor_size = anchor_size anchor_number = len(all_anchor_size) gt_tensor = np.zeros([batch_size, hs, ws, anchor_number, 1+1+4+1+4]) for batch_index in range(batch_size): for gt_label in label_lists[batch_index]: # get a bbox coords gt_class = int(gt_label[-1]) results = generate_txtytwth(gt_label, w, h, s, all_anchor_size) if results: for result in results: index, grid_x, grid_y, tx, ty, tw, th, weight, xmin, ymin, xmax, ymax = result if weight > 0.: if grid_y < gt_tensor.shape[1] and grid_x < gt_tensor.shape[2]: gt_tensor[batch_index, grid_y, grid_x, index, 0] = 1.0 gt_tensor[batch_index, grid_y, grid_x, index, 1] = gt_class gt_tensor[batch_index, grid_y, grid_x, index, 2:6] = np.array([tx, ty, tw, th]) gt_tensor[batch_index, grid_y, grid_x, index, 6] = weight gt_tensor[batch_index, grid_y, grid_x, index, 7:] = np.array([xmin, ymin, xmax, ymax]) else: gt_tensor[batch_index, grid_y, grid_x, index, 0] = -1.0 gt_tensor[batch_index, grid_y, grid_x, index, 6] = -1.0 gt_tensor = gt_tensor.reshape(batch_size, hs * ws * anchor_number, 1+1+4+1+4) return gt_tensor def multi_gt_creator(input_size, strides, label_lists, anchor_size): """create multi-scale gt""" # prepare all the empty gt data batch_size = len(label_lists) h = w = input_size num_scale = len(strides) gt_tensor = [] all_anchor_size = anchor_size anchor_number = len(all_anchor_size) // num_scale for s in strides: gt_tensor.append(np.zeros([batch_size, h//s, w//s, anchor_number, 1+1+4+1+4])) # generate gt datas for batch_index in range(batch_size): for gt_label in label_lists[batch_index]: # get a bbox coords gt_class = int(gt_label[-1]) xmin, ymin, xmax, ymax = gt_label[:-1] # compute the center, width and height c_x = (xmax + xmin) / 2 * w c_y = (ymax + ymin) / 2 * h box_w = (xmax - xmin) * w box_h = (ymax - ymin) * h if box_w < 1. or box_h < 1.: # print('A dirty data !!!') continue # compute the IoU anchor_boxes = set_anchors(all_anchor_size) gt_box = np.array([[0, 0, box_w, box_h]]) iou = compute_iou(anchor_boxes, gt_box) # We only consider anchor boxes whose IoU is above the ignore threshold. iou_mask = (iou > ignore_thresh) if iou_mask.sum() == 0: # No anchor box clears the threshold, so we assign the one with the highest IoU score.
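# A quick worked example of the flat-index decomposition used just below
# (illustrative numbers, not taken from the repo configs): with 3 scales of
# strides [8, 16, 32] and 9 anchor boxes in total, anchor_number = 9 // 3 = 3
# anchors per scale, so a flat index of 7 maps to s_indx = 7 // 3 = 2 (the
# stride-32 scale) and ab_ind = 7 - 2 * 3 = 1 (the second anchor box of that
# scale).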
index = np.argmax(iou) # s_indx, ab_ind = index // num_scale, index % num_scale s_indx = index // anchor_number ab_ind = index - s_indx * anchor_number # get the corresponding stride s = strides[s_indx] # get the corresponding anchor box p_w, p_h = anchor_boxes[index, 2], anchor_boxes[index, 3] # compute the gride cell location c_x_s = c_x / s c_y_s = c_y / s grid_x = int(c_x_s) grid_y = int(c_y_s) # compute gt labels tx = c_x_s - grid_x ty = c_y_s - grid_y tw = np.log(box_w / p_w) th = np.log(box_h / p_h) weight = 2.0 - (box_w / w) * (box_h / h) if grid_y < gt_tensor[s_indx].shape[1] and grid_x < gt_tensor[s_indx].shape[2]: gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 0] = 1.0 gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 1] = gt_class gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 2:6] = np.array([tx, ty, tw, th]) gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 6] = weight gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 7:] = np.array([xmin, ymin, xmax, ymax]) else: # There are more than one anchor boxes whose IoU are higher than ignore thresh. # But we only assign only one anchor box whose IoU is the best(objectness target is 1) and ignore other # anchor boxes whose(we set their objectness as -1 which means we will ignore them during computing obj loss ) # iou_ = iou * iou_mask # We get the index of the best IoU best_index = np.argmax(iou) for index, iou_m in enumerate(iou_mask): if iou_m: if index == best_index: # s_indx, ab_ind = index // num_scale, index % num_scale s_indx = index // anchor_number ab_ind = index - s_indx * anchor_number # get the corresponding stride s = strides[s_indx] # get the corresponding anchor box p_w, p_h = anchor_boxes[index, 2], anchor_boxes[index, 3] # compute the gride cell location c_x_s = c_x / s c_y_s = c_y / s grid_x = int(c_x_s) grid_y = int(c_y_s) # compute gt labels tx = c_x_s - grid_x ty = c_y_s - grid_y tw = np.log(box_w / p_w) th = np.log(box_h / p_h) weight = 2.0 - (box_w / w) * (box_h / h) if grid_y < gt_tensor[s_indx].shape[1] and grid_x < gt_tensor[s_indx].shape[2]: gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 0] = 1.0 gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 1] = gt_class gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 2:6] = np.array([tx, ty, tw, th]) gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 6] = weight gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 7:] = np.array([xmin, ymin, xmax, ymax]) else: # we ignore other anchor boxes even if their iou scores are higher than ignore thresh # s_indx, ab_ind = index // num_scale, index % num_scale s_indx = index // anchor_number ab_ind = index - s_indx * anchor_number s = strides[s_indx] c_x_s = c_x / s c_y_s = c_y / s grid_x = int(c_x_s) grid_y = int(c_y_s) gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 0] = -1.0 gt_tensor[s_indx][batch_index, grid_y, grid_x, ab_ind, 6] = -1.0 gt_tensor = [gt.reshape(batch_size, -1, 1+1+4+1+4) for gt in gt_tensor] gt_tensor = np.concatenate(gt_tensor, 1) return gt_tensor def iou_score(bboxes_a, bboxes_b): """ bbox_1 : [B*N, 4] = [x1, y1, x2, y2] bbox_2 : [B*N, 4] = [x1, y1, x2, y2] """ tl = torch.max(bboxes_a[:, :2], bboxes_b[:, :2]) br = torch.min(bboxes_a[:, 2:], bboxes_b[:, 2:]) area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) en = (tl < br).type(tl.type()).prod(dim=1) area_i = torch.prod(br - tl, 1) * en # * ((tl < br).all()) return area_i / (area_a + area_b - area_i + 1e-14) def loss(pred_conf, 
pred_cls, pred_txtytwth, pred_iou, label): # loss func conf_loss_function = MSEWithLogitsLoss(reduction='mean') cls_loss_function = nn.CrossEntropyLoss(reduction='none') txty_loss_function = nn.BCEWithLogitsLoss(reduction='none') twth_loss_function = nn.MSELoss(reduction='none') iou_loss_function = nn.SmoothL1Loss(reduction='none') # pred pred_conf = pred_conf[:, :, 0] pred_cls = pred_cls.permute(0, 2, 1) pred_txty = pred_txtytwth[:, :, :2] pred_twth = pred_txtytwth[:, :, 2:] pred_iou = pred_iou[:, :, 0] # gt gt_conf = label[:, :, 0].float() gt_obj = label[:, :, 1].float() gt_cls = label[:, :, 2].long() gt_txty = label[:, :, 3:5].float() gt_twth = label[:, :, 5:7].float() gt_box_scale_weight = label[:, :, 7].float() gt_iou = (gt_box_scale_weight > 0.).float() gt_mask = (gt_box_scale_weight > 0.).float() batch_size = pred_conf.size(0) # objectness loss conf_loss = conf_loss_function(pred_conf, gt_conf, gt_obj) # class loss cls_loss = torch.sum(cls_loss_function(pred_cls, gt_cls) * gt_mask) / batch_size # box loss txty_loss = torch.sum(torch.sum(txty_loss_function(pred_txty, gt_txty), dim=-1) * gt_box_scale_weight * gt_mask) / batch_size twth_loss = torch.sum(torch.sum(twth_loss_function(pred_twth, gt_twth), dim=-1) * gt_box_scale_weight * gt_mask) / batch_size bbox_loss = txty_loss + twth_loss # iou loss iou_loss = torch.sum(iou_loss_function(pred_iou, gt_iou) * gt_mask) / batch_size return conf_loss, cls_loss, bbox_loss, iou_loss if __name__ == "__main__": gt_box = np.array([[0.0, 0.0, 10, 10]]) anchor_boxes = np.array([[0.0, 0.0, 10, 10], [0.0, 0.0, 4, 4], [0.0, 0.0, 8, 8], [0.0, 0.0, 16, 16] ]) iou = compute_iou(anchor_boxes, gt_box) print(iou) ================================================ FILE: train.py ================================================ from __future__ import division import os import random import argparse import time import cv2 import numpy as np from copy import deepcopy import torch import torch.optim as optim import torch.backends.cudnn as cudnn import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from data.voc0712 import VOCDetection from data.coco2017 import COCODataset from data import config from data import BaseTransform, detection_collate import tools from utils import distributed_utils from utils.com_paras_flops import FLOPs_and_Params from utils.augmentations import SSDAugmentation, ColorAugmentation from utils.cocoapi_evaluator import COCOAPIEvaluator from utils.vocapi_evaluator import VOCAPIEvaluator from utils.modules import ModelEMA def parse_args(): parser = argparse.ArgumentParser(description='YOLO Detection') # basic parser.add_argument('--cuda', action='store_true', default=False, help='use cuda.') parser.add_argument('-bs', '--batch_size', default=16, type=int, help='Batch size for training') parser.add_argument('--lr', default=1e-3, type=float, help='initial learning rate') parser.add_argument('--wp_epoch', type=int, default=2, help='The upper bound of warm-up') parser.add_argument('--start_epoch', type=int, default=0, help='start epoch to train') parser.add_argument('-r', '--resume', default=None, type=str, help='keep training') parser.add_argument('--momentum', default=0.9, type=float, help='Momentum value for optim') parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD') parser.add_argument('--num_workers', default=8, type=int, help='Number of workers used in dataloading') parser.add_argument('--num_gpu', default=1, type=int, help='Number of GPUs to train') 
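# Typical single-GPU invocations of this script (illustrative paths; every
# flag used here is defined in this parser):
#   python train.py --cuda -v yolov3 -d voc -root /path/to/dataset -ms --ema
#   python train.py --cuda -v yolov2_d19 -d coco -root /path/to/dataset -bs 32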
parser.add_argument('--eval_epoch', type=int, default=10, help='interval between evaluations') parser.add_argument('--tfboard', action='store_true', default=False, help='use tensorboard') parser.add_argument('--save_folder', default='weights/', type=str, help='path to save the trained weights') parser.add_argument('--vis', action='store_true', default=False, help='visualize target.') # model parser.add_argument('-v', '--version', default='yolov2_d19', help='yolov2_d19, yolov2_r50, yolov3, yolov3_spp, yolov3_tiny') # dataset parser.add_argument('-root', '--data_root', default='/mnt/share/ssd2/dataset', help='dataset root') parser.add_argument('-d', '--dataset', default='voc', help='voc or coco') # train trick parser.add_argument('--no_warmup', action='store_true', default=False, help='do not use warmup') parser.add_argument('-ms', '--multi_scale', action='store_true', default=False, help='use multi-scale trick') parser.add_argument('--mosaic', action='store_true', default=False, help='use mosaic augmentation') parser.add_argument('--ema', action='store_true', default=False, help='use ema training trick') # DDP train parser.add_argument('-dist', '--distributed', action='store_true', default=False, help='distributed training') parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') parser.add_argument('--sybn', action='store_true', default=False, help='use SyncBatchNorm.') return parser.parse_args() def train(): args = parse_args() print("Setting Arguments.. : ", args) print("----------------------------------------------------------") # set distributed print('World size: {}'.format(distributed_utils.get_world_size())) if args.distributed: distributed_utils.init_distributed_mode(args) print("git:\n {}\n".format(distributed_utils.get_sha())) # cuda if args.cuda: print('use cuda') # cudnn.benchmark = True device = torch.device("cuda") else: device = torch.device("cpu") model_name = args.version print('Model: ', model_name) # load model and config file if model_name == 'yolov2_d19': from models.yolov2_d19 import YOLOv2D19 as yolo_net cfg = config.yolov2_d19_cfg elif model_name == 'yolov2_r50': from models.yolov2_r50 import YOLOv2R50 as yolo_net cfg = config.yolov2_r50_cfg elif model_name == 'yolov3': from models.yolov3 import YOLOv3 as yolo_net cfg = config.yolov3_d53_cfg elif model_name == 'yolov3_spp': from models.yolov3_spp import YOLOv3Spp as yolo_net cfg = config.yolov3_d53_cfg elif model_name == 'yolov3_tiny': from models.yolov3_tiny import YOLOv3tiny as yolo_net cfg = config.yolov3_tiny_cfg else: print('Unknown model name...') exit(0) # path to save model path_to_save = os.path.join(args.save_folder, args.dataset, args.version) os.makedirs(path_to_save, exist_ok=True) # multi-scale if args.multi_scale: print('use the multi-scale trick ...') train_size = cfg['train_size'] val_size = cfg['val_size'] else: train_size = val_size = cfg['train_size'] # Model EMA if args.ema: print('use EMA trick ...') # dataset and evaluator if args.dataset == 'voc': data_dir = os.path.join(args.data_root, 'VOCdevkit') num_classes = 20 dataset = VOCDetection(data_dir=data_dir, transform=SSDAugmentation(train_size)) evaluator = VOCAPIEvaluator(data_root=data_dir, img_size=val_size, device=device, transform=BaseTransform(val_size)) elif args.dataset == 'coco': data_dir = os.path.join(args.data_root, 'COCO') num_classes = 80 dataset = COCODataset( data_dir=data_dir,
transform=SSDAugmentation(train_size)) evaluator = COCOAPIEvaluator( data_dir=data_dir, img_size=val_size, device=device, transform=BaseTransform(val_size)) else: print('unknown dataset !! Only voc and coco are supported !!') exit(0) print('Training model on:', dataset.name) print('The dataset size:', len(dataset)) print("----------------------------------------------------------") # build model anchor_size = cfg['anchor_size_voc'] if args.dataset == 'voc' else cfg['anchor_size_coco'] net = yolo_net(device=device, input_size=train_size, num_classes=num_classes, trainable=True, anchor_size=anchor_size) model = net model = model.to(device).train() # SyncBatchNorm if args.sybn and args.distributed: print('use SyncBatchNorm ...') model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) # DDP model_without_ddp = model if args.distributed: model = DDP(model, device_ids=[args.gpu]) model_without_ddp = model.module # compute FLOPs and Params (note: is_main_process is a function, so it must be called) if distributed_utils.is_main_process(): model_copy = deepcopy(model_without_ddp) model_copy.trainable = False model_copy.eval() FLOPs_and_Params(model=model_copy, size=train_size, device=device) model_copy.trainable = True model_copy.train() if args.distributed: # wait for all processes to synchronize dist.barrier() # dataloader batch_size = args.batch_size * distributed_utils.get_world_size() if args.distributed and args.num_gpu > 1: dataloader = torch.utils.data.DataLoader( dataset=dataset, batch_size=batch_size, collate_fn=detection_collate, num_workers=args.num_workers, pin_memory=True, drop_last=True, sampler=torch.utils.data.distributed.DistributedSampler(dataset) ) else: # dataloader dataloader = torch.utils.data.DataLoader( dataset=dataset, shuffle=True, batch_size=batch_size, collate_fn=detection_collate, num_workers=args.num_workers, pin_memory=True, drop_last=True ) # keep training if args.resume is not None: print('keep training model: %s' % (args.resume)) model.load_state_dict(torch.load(args.resume, map_location=device)) # EMA ema = ModelEMA(model) if args.ema else None # use tfboard if args.tfboard: print('use tensorboard') from torch.utils.tensorboard import SummaryWriter c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) log_path = os.path.join('log/', args.dataset, c_time) os.makedirs(log_path, exist_ok=True) tblogger = SummaryWriter(log_path) # optimizer setup base_lr = (args.lr / 16) * batch_size tmp_lr = base_lr optimizer = optim.SGD(model.parameters(), lr=base_lr, momentum=args.momentum, weight_decay=args.weight_decay ) max_epoch = cfg['max_epoch'] epoch_size = len(dataloader) best_map = -1.
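# The base learning rate above follows a linear scaling rule: the configured
# lr is treated as the rate for a reference total batch size of 16. Two
# sanity-check values with the defaults from parse_args():
#   args.lr = 1e-3, total batch size 16 -> base_lr = (1e-3 / 16) * 16 = 1e-3
#   args.lr = 1e-3, total batch size 64 -> base_lr = (1e-3 / 16) * 64 = 4e-3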
warmup = not args.no_warmup t0 = time.time() # start training loop for epoch in range(args.start_epoch, max_epoch): if args.distributed: dataloader.sampler.set_epoch(epoch) # use step lr if epoch in cfg['lr_epoch']: tmp_lr = tmp_lr * 0.1 set_lr(optimizer, tmp_lr) for iter_i, (images, targets) in enumerate(dataloader): # WarmUp strategy for learning rate ni = iter_i + epoch * epoch_size # warmup if epoch < args.wp_epoch and warmup: nw = args.wp_epoch * epoch_size tmp_lr = base_lr * pow(ni / nw, 4) set_lr(optimizer, tmp_lr) elif epoch == args.wp_epoch and iter_i == 0 and warmup: # warmup is over warmup = False tmp_lr = base_lr set_lr(optimizer, tmp_lr) # multi-scale trick if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale: # randomly choose a new size r = cfg['random_size_range'] train_size = random.randint(r[0], r[1]) * 32 model.set_grid(train_size) if args.multi_scale: # interpolate images = torch.nn.functional.interpolate(images, size=train_size, mode='bilinear', align_corners=False) targets = [label.tolist() for label in targets] # visualize labels if args.vis: vis_data(images, targets, train_size) continue # label assignment if model_name in ['yolov2_d19', 'yolov2_r50']: targets = tools.gt_creator(input_size=train_size, stride=net.stride, label_lists=targets, anchor_size=anchor_size ) else: targets = tools.multi_gt_creator(input_size=train_size, strides=net.stride, label_lists=targets, anchor_size=anchor_size ) # to device images = images.float().to(device) targets = torch.tensor(targets).float().to(device) # forward conf_loss, cls_loss, box_loss, iou_loss = model(images, target=targets) # compute loss total_loss = conf_loss + cls_loss + box_loss + iou_loss loss_dict = dict(conf_loss=conf_loss, cls_loss=cls_loss, box_loss=box_loss, iou_loss=iou_loss, total_loss=total_loss ) loss_dict_reduced = distributed_utils.reduce_dict(loss_dict) # check NAN for loss if torch.isnan(total_loss): print('loss is nan !!') continue # backprop total_loss.backward() optimizer.step() optimizer.zero_grad() # ema if args.ema: ema.update(model) # display if distributed_utils.is_main_process() and iter_i % 10 == 0: if args.tfboard: # viz loss tblogger.add_scalar('conf loss', loss_dict_reduced['conf_loss'].item(), iter_i + epoch * epoch_size) tblogger.add_scalar('cls loss', loss_dict_reduced['cls_loss'].item(), iter_i + epoch * epoch_size) tblogger.add_scalar('box loss', loss_dict_reduced['box_loss'].item(), iter_i + epoch * epoch_size) tblogger.add_scalar('iou loss', loss_dict_reduced['iou_loss'].item(), iter_i + epoch * epoch_size) t1 = time.time() cur_lr = [param_group['lr'] for param_group in optimizer.param_groups] # basic infor log = '[Epoch: {}/{}]'.format(epoch+1, max_epoch) log += '[Iter: {}/{}]'.format(iter_i, epoch_size) log += '[lr: {:.6f}]'.format(cur_lr[0]) # loss infor for k in loss_dict_reduced.keys(): log += '[{}: {:.2f}]'.format(k, loss_dict[k]) # other infor log += '[time: {:.2f}]'.format(t1 - t0) log += '[size: {}]'.format(train_size) # print log infor print(log, flush=True) t0 = time.time() if distributed_utils.is_main_process(): # evaluation if (epoch % args.eval_epoch) == 0 or (epoch == max_epoch - 1): if args.ema: model_eval = ema.ema else: model_eval = model_without_ddp # check evaluator if evaluator is None: print('No evaluator ... 
save model and go on training.') print('Saving state, epoch: {}'.format(epoch + 1)) weight_name = '{}_epoch_{}.pth'.format(args.version, epoch + 1) checkpoint_path = os.path.join(path_to_save, weight_name) torch.save(model_eval.state_dict(), checkpoint_path) else: print('eval ...') # set eval mode model_eval.trainable = False model_eval.set_grid(val_size) model_eval.eval() # evaluate evaluator.evaluate(model_eval) cur_map = evaluator.map if cur_map > best_map: # update best-map best_map = cur_map # save model print('Saving state, epoch:', epoch + 1) weight_name = '{}_epoch_{}_{:.2f}.pth'.format(args.version, epoch + 1, best_map*100) checkpoint_path = os.path.join(path_to_save, weight_name) torch.save(model_eval.state_dict(), checkpoint_path) if args.tfboard: if args.dataset == 'voc': tblogger.add_scalar('07test/mAP', evaluator.map, epoch) elif args.dataset == 'coco': tblogger.add_scalar('val/AP50_95', evaluator.ap50_95, epoch) tblogger.add_scalar('val/AP50', evaluator.ap50, epoch) # set train mode. model_eval.trainable = True model_eval.set_grid(train_size) model_eval.train() # wait for all processes to synchronize if args.distributed: dist.barrier() if args.tfboard: tblogger.close() def set_lr(optimizer, lr): for param_group in optimizer.param_groups: param_group['lr'] = lr def vis_data(images, targets, input_size): # vis data mean=(0.406, 0.456, 0.485) std=(0.225, 0.224, 0.229) mean = np.array(mean, dtype=np.float32) std = np.array(std, dtype=np.float32) img = images[0].permute(1, 2, 0).cpu().numpy()[:, :, ::-1] img = ((img * std + mean)*255).astype(np.uint8) img = img.copy() for box in targets[0]: xmin, ymin, xmax, ymax = box[:-1] # print(xmin, ymin, xmax, ymax) xmin *= input_size ymin *= input_size xmax *= input_size ymax *= input_size cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255), 2) cv2.imshow('img', img) cv2.waitKey(0) if __name__ == '__main__': train() ================================================ FILE: utils/__init__.py ================================================ ================================================ FILE: utils/augmentations.py ================================================ import cv2 import numpy as np from numpy import random def intersect(box_a, box_b): max_xy = np.minimum(box_a[:, 2:], box_b[2:]) min_xy = np.maximum(box_a[:, :2], box_b[:2]) inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) return inter[:, 0] * inter[:, 1] def jaccard_numpy(box_a, box_b): """Compute the jaccard overlap of two sets of boxes. The jaccard overlap is simply the intersection over union of two boxes. E.g.: A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) Args: box_a: Multiple bounding boxes, Shape: [num_boxes,4] box_b: Single bounding box, Shape: [4] Return: jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] """ inter = intersect(box_a, box_b) area_a = ((box_a[:, 2]-box_a[:, 0]) * (box_a[:, 3]-box_a[:, 1])) # [A,B] area_b = ((box_b[2]-box_b[0]) * (box_b[3]-box_b[1])) # [A,B] union = area_a + area_b - inter return inter / union # [A,B] class Compose(object): """Composes several augmentations together. Args: transforms (List[Transform]): list of transforms to compose. 
Example: >>> augmentations.Compose([ >>> transforms.CenterCrop(10), >>> transforms.ToTensor(), >>> ]) """ def __init__(self, transforms): self.transforms = transforms def __call__(self, img, boxes=None, labels=None): for t in self.transforms: img, boxes, labels = t(img, boxes, labels) return img, boxes, labels class ConvertFromInts(object): def __call__(self, image, boxes=None, labels=None): return image.astype(np.float32), boxes, labels class Normalize(object): def __init__(self, mean=None, std=None): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) def __call__(self, image, boxes=None, labels=None): image = image.astype(np.float32) image /= 255. image -= self.mean image /= self.std return image, boxes, labels class ToAbsoluteCoords(object): def __call__(self, image, boxes=None, labels=None): height, width, channels = image.shape boxes[:, 0] *= width boxes[:, 2] *= width boxes[:, 1] *= height boxes[:, 3] *= height return image, boxes, labels class ToPercentCoords(object): def __call__(self, image, boxes=None, labels=None): height, width, channels = image.shape boxes[:, 0] /= width boxes[:, 2] /= width boxes[:, 1] /= height boxes[:, 3] /= height return image, boxes, labels class Resize(object): def __init__(self, size=416): self.size = size def __call__(self, image, boxes=None, labels=None): image = cv2.resize(image, (self.size, self.size)) return image, boxes, labels class RandomSaturation(object): def __init__(self, lower=0.5, upper=1.5): self.lower = lower self.upper = upper assert self.upper >= self.lower, "contrast upper must be >= lower." assert self.lower >= 0, "contrast lower must be non-negative." def __call__(self, image, boxes=None, labels=None): if random.randint(2): image[:, :, 1] *= random.uniform(self.lower, self.upper) return image, boxes, labels class RandomHue(object): def __init__(self, delta=18.0): assert delta >= 0.0 and delta <= 360.0 self.delta = delta def __call__(self, image, boxes=None, labels=None): if random.randint(2): image[:, :, 0] += random.uniform(-self.delta, self.delta) image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 return image, boxes, labels class RandomLightingNoise(object): def __init__(self): self.perms = ((0, 1, 2), (0, 2, 1), (1, 0, 2), (1, 2, 0), (2, 0, 1), (2, 1, 0)) def __call__(self, image, boxes=None, labels=None): if random.randint(2): swap = self.perms[random.randint(len(self.perms))] shuffle = SwapChannels(swap) # shuffle channels image = shuffle(image) return image, boxes, labels class ConvertColor(object): def __init__(self, current='BGR', transform='HSV'): self.transform = transform self.current = current def __call__(self, image, boxes=None, labels=None): if self.current == 'BGR' and self.transform == 'HSV': image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) elif self.current == 'HSV' and self.transform == 'BGR': image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) else: raise NotImplementedError return image, boxes, labels class RandomContrast(object): def __init__(self, lower=0.5, upper=1.5): self.lower = lower self.upper = upper assert self.upper >= self.lower, "contrast upper must be >= lower." assert self.lower >= 0, "contrast lower must be non-negative." 
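# A concrete example of what this transform does (values illustrative): if
# alpha is drawn as 1.5, a float pixel value of 100.0 becomes 150.0, so the
# whole image is rescaled and the spread between dark and bright regions grows.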
# expects float image def __call__(self, image, boxes=None, labels=None): if random.randint(2): alpha = random.uniform(self.lower, self.upper) image *= alpha return image, boxes, labels class RandomBrightness(object): def __init__(self, delta=32): assert delta >= 0.0 assert delta <= 255.0 self.delta = delta def __call__(self, image, boxes=None, labels=None): if random.randint(2): delta = random.uniform(-self.delta, self.delta) image += delta return image, boxes, labels class RandomSampleCrop(object): """Crop Arguments: img (Image): the image being input during training boxes (Tensor): the original bounding boxes in pt form labels (Tensor): the class labels for each bbox mode (float tuple): the min and max jaccard overlaps Return: (img, boxes, classes) img (Image): the cropped image boxes (Tensor): the adjusted bounding boxes in pt form labels (Tensor): the class labels for each bbox """ def __init__(self): self.sample_options = ( # using entire original input image None, # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 (0.1, None), (0.3, None), (0.7, None), (0.9, None), # randomly sample a patch (None, None), ) def __call__(self, image, boxes=None, labels=None): height, width, _ = image.shape while True: # randomly choose a mode sample_id = np.random.randint(len(self.sample_options)) mode = self.sample_options[sample_id] if mode is None: return image, boxes, labels min_iou, max_iou = mode if min_iou is None: min_iou = float('-inf') if max_iou is None: max_iou = float('inf') # max trails (50) for _ in range(50): current_image = image w = random.uniform(0.3 * width, width) h = random.uniform(0.3 * height, height) # aspect ratio constraint b/t .5 & 2 if h / w < 0.5 or h / w > 2: continue left = random.uniform(width - w) top = random.uniform(height - h) # convert to integer rect x1,y1,x2,y2 rect = np.array([int(left), int(top), int(left+w), int(top+h)]) # calculate IoU (jaccard overlap) b/t the cropped and gt boxes overlap = jaccard_numpy(boxes, rect) # is min and max overlap constraint satisfied? if not try again if overlap.min() < min_iou and max_iou < overlap.max(): continue # cut the crop from the image current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], :] # keep overlap with gt box IF center in sampled patch centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 # mask in all gt boxes that above and to the left of centers m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) # mask in all gt boxes that under and to the right of centers m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) # mask in that both m1 and m2 are true mask = m1 * m2 # have any valid boxes? 
try again if not if not mask.any(): continue # take only matching gt boxes current_boxes = boxes[mask, :].copy() # take only matching gt labels current_labels = labels[mask] # should we use the box left and top corner or the crop's current_boxes[:, :2] = np.maximum(current_boxes[:, :2], rect[:2]) # adjust to crop (by substracting crop's left,top) current_boxes[:, :2] -= rect[:2] current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], rect[2:]) # adjust to crop (by substracting crop's left,top) current_boxes[:, 2:] -= rect[:2] return current_image, current_boxes, current_labels class RandomMirror(object): def __call__(self, image, boxes, classes): _, width, _ = image.shape if random.randint(2): image = image[:, ::-1] boxes = boxes.copy() boxes[:, 0::2] = width - boxes[:, 2::-2] return image, boxes, classes class SwapChannels(object): """Transforms a tensorized image by swapping the channels in the order specified in the swap tuple. Args: swaps (int triple): final order of channels eg: (2, 1, 0) """ def __init__(self, swaps): self.swaps = swaps def __call__(self, image): """ Args: image (Tensor): image tensor to be transformed Return: a tensor with channels swapped according to swap """ # if torch.is_tensor(image): # image = image.data.cpu().numpy() # else: # image = np.array(image) image = image[:, :, self.swaps] return image class PhotometricDistort(object): def __init__(self): self.pd = [ RandomContrast(), ConvertColor(transform='HSV'), RandomSaturation(), RandomHue(), ConvertColor(current='HSV', transform='BGR'), RandomContrast() ] self.rand_brightness = RandomBrightness() # self.rand_light_noise = RandomLightingNoise() def __call__(self, image, boxes, labels): im = image.copy() im, boxes, labels = self.rand_brightness(im, boxes, labels) if random.randint(2): distort = Compose(self.pd[:-1]) else: distort = Compose(self.pd[1:]) im, boxes, labels = distort(im, boxes, labels) return im, boxes, labels # return self.rand_light_noise(im, boxes, labels) class SSDAugmentation(object): def __init__(self, size=416, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): self.mean = mean self.size = size self.std = std self.augment = Compose([ ConvertFromInts(), ToAbsoluteCoords(), PhotometricDistort(), RandomSampleCrop(), RandomMirror(), ToPercentCoords(), Resize(self.size), Normalize(self.mean, self.std) ]) def __call__(self, img, boxes, labels): return self.augment(img, boxes, labels) class ColorAugmentation(object): def __init__(self, size=416, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): self.mean = mean self.size = size self.std = std self.augment = Compose([ ConvertFromInts(), ToAbsoluteCoords(), PhotometricDistort(), RandomMirror(), ToPercentCoords(), Resize(self.size), Normalize(self.mean, self.std) ]) def __call__(self, img, boxes, labels): return self.augment(img, boxes, labels) ================================================ FILE: utils/cocoapi_evaluator.py ================================================ import json import tempfile from pycocotools.cocoeval import COCOeval from torch.autograd import Variable from data.coco2017 import * from data import * class COCOAPIEvaluator(): """ COCO AP Evaluation class. All the data in the val2017 dataset are processed \ and evaluated by COCO API. """ def __init__(self, data_dir, img_size, device, testset=False, transform=None): """ Args: data_dir (str): dataset root directory img_size (int): image size after preprocess. images are resized \ to squares whose shape is (img_size, img_size). 
confthre (float): confidence threshold ranging from 0 to 1, \ which is defined in the config file. nmsthre (float): IoU threshold of non-max supression ranging from 0 to 1. """ self.testset = testset if self.testset: json_file='image_info_test-dev2017.json' name = 'test2017' else: json_file='instances_val2017.json' name='val2017' self.dataset = COCODataset(data_dir=data_dir, json_file=json_file, name=name) self.img_size = img_size self.transform = transform self.device = device self.map = 0. self.ap50_95 = 0. self.ap50 = 0. def evaluate(self, model): """ COCO average precision (AP) Evaluation. Iterate inference on the test dataset and the results are evaluated by COCO API. Args: model : model object Returns: ap50_95 (float) : calculated COCO AP for IoU=50:95 ap50 (float) : calculated COCO AP for IoU=50 """ model.eval() ids = [] data_dict = [] num_images = len(self.dataset) print('total number of images: %d' % (num_images)) # start testing for index in range(num_images): # all the data in val2017 if index % 500 == 0: print('[Eval: %d / %d]'%(index, num_images)) img, id_ = self.dataset.pull_image(index) # load a batch if self.transform is not None: x = torch.from_numpy(self.transform(img)[0][:, :, (2, 1, 0)]).permute(2, 0, 1) x = x.unsqueeze(0).to(self.device) scale = np.array([[img.shape[1], img.shape[0], img.shape[1], img.shape[0]]]) id_ = int(id_) ids.append(id_) with torch.no_grad(): outputs = model(x) bboxes, scores, cls_inds = outputs bboxes *= scale for i, box in enumerate(bboxes): x1 = float(box[0]) y1 = float(box[1]) x2 = float(box[2]) y2 = float(box[3]) label = self.dataset.class_ids[int(cls_inds[i])] bbox = [x1, y1, x2 - x1, y2 - y1] score = float(scores[i]) # object score * class score A = {"image_id": id_, "category_id": label, "bbox": bbox, "score": score} # COCO json format data_dict.append(A) annType = ['segm', 'bbox', 'keypoints'] # Evaluate the Dt (detection) json comparing with the ground truth if len(data_dict) > 0: print('evaluating ......') cocoGt = self.dataset.coco # For test if self.testset: json.dump(data_dict, open('yolov2_2017.json', 'w')) cocoDt = cocoGt.loadRes('yolov2_2017.json') print('inference on test-dev is done !!') return -1, -1 # For val else: _, tmp = tempfile.mkstemp() json.dump(data_dict, open(tmp, 'w')) cocoDt = cocoGt.loadRes(tmp) cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1]) cocoEval.params.imgIds = ids cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() ap50_95, ap50 = cocoEval.stats[0], cocoEval.stats[1] print('ap50_95 : ', ap50_95) print('ap50 : ', ap50) self.map = ap50_95 self.ap50_95 = ap50_95 self.ap50 = ap50 return ap50, ap50_95 else: return 0, 0 ================================================ FILE: utils/com_paras_flops.py ================================================ import torch from thop import profile def FLOPs_and_Params(model, size, device): x = torch.randn(1, 3, size, size).to(device) model.trainable = False model.eval() flops, params = profile(model, inputs=(x, )) print('FLOPs : ', flops / 1e9, ' B') print('Params : ', params / 1e6, ' M') model.trainable = True model.train() if __name__ == "__main__": pass ================================================ FILE: utils/distributed_utils.py ================================================ # from github: https://github.com/ruinmessi/ASFF/blob/master/utils/distributed_util.py import torch import torch.distributed as dist import os import subprocess import pickle def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: 
data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.tensor([tensor.numel()], device="cuda") size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) if local_size != max_size: padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list def reduce_dict(input_dict, average=True): """ Args: input_dict (dict): all the values will be reduced average (bool): whether to do average or sum Reduce the values in the dictionary from all processes so that all processes have the averaged results. Returns a dict with the same fields as input_dict, after reduction. """ world_size = get_world_size() if world_size < 2: return input_dict with torch.no_grad(): names = [] values = [] # sort the keys so that they are consistent across processes for k in sorted(input_dict.keys()): names.append(k) values.append(input_dict[k]) values = torch.stack(values, dim=0) dist.all_reduce(values) if average: values /= world_size reduced_dict = {k: v for k, v in zip(names, values)} return reduced_dict def get_sha(): cwd = os.path.dirname(os.path.abspath(__file__)) def _run(command): return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() sha = 'N/A' diff = "clean" branch = 'N/A' try: sha = _run(['git', 'rev-parse', 'HEAD']) subprocess.check_output(['git', 'diff'], cwd=cwd) diff = _run(['git', 'diff-index', 'HEAD']) diff = "has uncommited changes" if diff else "clean" branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) except Exception: pass message = f"sha: {sha}, status: {diff}, branch: {branch}" return message def setup_for_distributed(is_master): """ This function disables printing when not in master process """ import builtins as __builtin__ builtin_print = __builtin__.print def print(*args, **kwargs): force = kwargs.pop('force', False) if is_master or force: builtin_print(*args, **kwargs) __builtin__.print = print def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True def get_world_size(): if not is_dist_avail_and_initialized(): return 1 return dist.get_world_size() def get_rank(): if not is_dist_avail_and_initialized(): return 0 return dist.get_rank() def is_main_process(): return get_rank() == 0 def save_on_master(*args, **kwargs): if is_main_process(): torch.save(*args, **kwargs) def init_distributed_mode(args): if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ['WORLD_SIZE']) args.gpu = int(os.environ['LOCAL_RANK']) elif 'SLURM_PROCID' in os.environ: args.rank = int(os.environ['SLURM_PROCID']) 
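# The branch above covers launchers that export RANK / WORLD_SIZE / LOCAL_RANK,
# e.g. (illustrative command, assuming 2 GPUs on one node):
#   torchrun --nproc_per_node=2 train.py -dist --cuda
# while this SLURM branch derives the rank from SLURM_PROCID instead.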
args.gpu = args.rank % torch.cuda.device_count() else: print('Not using distributed mode') args.distributed = False return args.distributed = True torch.cuda.set_device(args.gpu) args.dist_backend = 'nccl' print('| distributed init (rank {}): {}'.format( args.rank, args.dist_url), flush=True) torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) torch.distributed.barrier() setup_for_distributed(args.rank == 0) ================================================ FILE: utils/kmeans_anchor.py ================================================ import numpy as np import random import argparse import os import sys sys.path.append('..') from data.voc0712 import VOCDetection from data.coco2017 import COCODataset def parse_args(): parser = argparse.ArgumentParser(description='kmeans for anchor box') parser.add_argument('-root', '--data_root', default='/mnt/share/ssd2/dataset', help='dataset root') parser.add_argument('-d', '--dataset', default='coco', help='coco, voc.') parser.add_argument('-na', '--num_anchorbox', default=9, type=int, help='number of anchor box.') parser.add_argument('-size', '--input_size', default=416, type=int, help='input size.') parser.add_argument('--scale', action='store_true', default=False, help='divide the sizes of anchor boxes by 32 .') return parser.parse_args() args = parse_args() class Box(): def __init__(self, x, y, w, h): self.x = x self.y = y self.w = w self.h = h def iou(box1, box2): x1, y1, w1, h1 = box1.x, box1.y, box1.w, box1.h x2, y2, w2, h2 = box2.x, box2.y, box2.w, box2.h S_1 = w1 * h1 S_2 = w2 * h2 xmin_1, ymin_1 = x1 - w1 / 2, y1 - h1 / 2 xmax_1, ymax_1 = x1 + w1 / 2, y1 + h1 / 2 xmin_2, ymin_2 = x2 - w2 / 2, y2 - h2 / 2 xmax_2, ymax_2 = x2 + w2 / 2, y2 + h2 / 2 I_w = min(xmax_1, xmax_2) - max(xmin_1, xmin_2) I_h = min(ymax_1, ymax_2) - max(ymin_1, ymin_2) if I_w < 0 or I_h < 0: return 0 I = I_w * I_h IoU = I / (S_1 + S_2 - I) return IoU def init_centroids(boxes, n_anchors): """ We use kmeans++ to initialize centroids. 
""" centroids = [] boxes_num = len(boxes) centroid_index = int(np.random.choice(boxes_num, 1)[0]) centroids.append(boxes[centroid_index]) print(centroids[0].w,centroids[0].h) for centroid_index in range(0, n_anchors-1): sum_distance = 0 distance_thresh = 0 distance_list = [] cur_sum = 0 for box in boxes: min_distance = 1 for centroid_i, centroid in enumerate(centroids): distance = (1 - iou(box, centroid)) if distance < min_distance: min_distance = distance sum_distance += min_distance distance_list.append(min_distance) distance_thresh = sum_distance * np.random.random() for i in range(0, boxes_num): cur_sum += distance_list[i] if cur_sum > distance_thresh: centroids.append(boxes[i]) print(boxes[i].w, boxes[i].h) break return centroids def do_kmeans(n_anchors, boxes, centroids): loss = 0 groups = [] new_centroids = [] # for box in centroids: # print('box: ', box.x, box.y, box.w, box.h) # exit() for i in range(n_anchors): groups.append([]) new_centroids.append(Box(0, 0, 0, 0)) for box in boxes: min_distance = 1 group_index = 0 for centroid_index, centroid in enumerate(centroids): distance = (1 - iou(box, centroid)) if distance < min_distance: min_distance = distance group_index = centroid_index groups[group_index].append(box) loss += min_distance new_centroids[group_index].w += box.w new_centroids[group_index].h += box.h for i in range(n_anchors): new_centroids[i].w /= max(len(groups[i]), 1) new_centroids[i].h /= max(len(groups[i]), 1) return new_centroids, groups, loss# / len(boxes) def anchor_box_kmeans(total_gt_boxes, n_anchors, loss_convergence, iters, plus=True): """ This function will use k-means to get appropriate anchor boxes for train dataset. Input: total_gt_boxes: n_anchor : int -> the number of anchor boxes. loss_convergence : float -> threshold of iterating convergence. iters: int -> the number of iterations for training kmeans. Output: anchor_boxes : list -> [[w1, h1], [w2, h2], ..., [wn, hn]]. 
""" boxes = total_gt_boxes centroids = [] if plus: centroids = init_centroids(boxes, n_anchors) else: total_indexs = range(len(boxes)) sample_indexs = random.sample(total_indexs, n_anchors) for i in sample_indexs: centroids.append(boxes[i]) # iterate k-means centroids, groups, old_loss = do_kmeans(n_anchors, boxes, centroids) iterations = 1 while(True): centroids, groups, loss = do_kmeans(n_anchors, boxes, centroids) iterations += 1 print("Loss = %f" % loss) if abs(old_loss - loss) < loss_convergence or iterations > iters: break old_loss = loss for centroid in centroids: print(centroid.w, centroid.h) print("k-means result : ") for centroid in centroids: if args.scale: print("w, h: ", round(centroid.w / 32., 2), round(centroid.h / 32., 2), "area: ", round(centroid.w / 32., 2) * round(centroid.h / 32., 2)) else: print("w, h: ", round(centroid.w, 2), round(centroid.h, 2), "area: ", round(centroid.w, 2) * round(centroid.h, 2)) return centroids if __name__ == "__main__": n_anchors = args.num_anchorbox img_size = args.img_size dataset = args.dataset loss_convergence = 1e-6 iters_n = 1000 dataset_voc = VOCDetection(data_dir=os.path.join(args.root, 'VOCdevkit'), img_size=img_size) dataset_coco = COCODataset(data_dir=os.path.join(args.root, 'COCO'), img_size=img_size) boxes = [] print("The dataset size: ", len(dataset)) print("Loading the dataset ...") # VOC for i in range(len(dataset_voc)): if i % 5000 == 0: print('Loading voc data [%d / %d]' % (i+1, len(dataset_voc))) # For VOC img, _ = dataset_voc.pull_image(i) w, h = img.shape[1], img.shape[0] _, annotation = dataset_voc.pull_anno(i) # prepare bbox datas for box_and_label in annotation: box = box_and_label[:-1] xmin, ymin, xmax, ymax = box bw = (xmax - xmin) / w * img_size bh = (ymax - ymin) / h * img_size # check bbox if bw < 1.0 or bh < 1.0: continue boxes.append(Box(0, 0, bw, bh)) # COCO for i in range(len(dataset_coco)): if i % 5000 == 0: print('Loading coco datat [%d / %d]' % (i+1, len(dataset_coco))) # For COCO img, _ = dataset_coco.pull_image(i) w, h = img.shape[1], img.shape[0] annotation = dataset_coco.pull_anno(i) # prepare bbox datas for box_and_label in annotation: box = box_and_label[:-1] xmin, ymin, xmax, ymax = box bw = (xmax - xmin) / w * img_size bh = (ymax - ymin) / h * img_size # check bbox if bw < 1.0 or bh < 1.0: continue boxes.append(Box(0, 0, bw, bh)) print("Number of all bboxes: ", len(boxes)) print("Start k-means !") centroids = anchor_box_kmeans(boxes, n_anchors, loss_convergence, iters_n, plus=True) ================================================ FILE: utils/modules.py ================================================ import math import torch import torch.nn as nn from copy import deepcopy class Conv(nn.Module): def __init__(self, in_ch, out_ch, k=1, p=0, s=1, d=1, g=1, act=True): super(Conv, self).__init__() if act: self.convs = nn.Sequential( nn.Conv2d(in_ch, out_ch, k, stride=s, padding=p, dilation=d, groups=g), nn.BatchNorm2d(out_ch), nn.LeakyReLU(0.1, inplace=True) ) else: self.convs = nn.Sequential( nn.Conv2d(in_ch, out_ch, k, stride=s, padding=p, dilation=d, groups=g), nn.BatchNorm2d(out_ch) ) def forward(self, x): return self.convs(x) class UpSample(nn.Module): def __init__(self, size=None, scale_factor=None, mode='nearest', align_corner=None): super(UpSample, self).__init__() self.size = size self.scale_factor = scale_factor self.mode = mode self.align_corner = align_corner def forward(self, x): return torch.nn.functional.interpolate(x, size=self.size, scale_factor=self.scale_factor, mode=self.mode, 
align_corners=self.align_corner) class reorg_layer(nn.Module): def __init__(self, stride): super(reorg_layer, self).__init__() self.stride = stride def forward(self, x): batch_size, channels, height, width = x.size() _height, _width = height // self.stride, width // self.stride x = x.view(batch_size, channels, _height, self.stride, _width, self.stride).transpose(3, 4).contiguous() x = x.view(batch_size, channels, _height * _width, self.stride * self.stride).transpose(2, 3).contiguous() x = x.view(batch_size, channels, self.stride * self.stride, _height, _width).transpose(1, 2).contiguous() x = x.view(batch_size, -1, _height, _width) return x class SPP(nn.Module): """ Spatial Pyramid Pooling """ def __init__(self): super(SPP, self).__init__() def forward(self, x): x_1 = torch.nn.functional.max_pool2d(x, 5, stride=1, padding=2) x_2 = torch.nn.functional.max_pool2d(x, 9, stride=1, padding=4) x_3 = torch.nn.functional.max_pool2d(x, 13, stride=1, padding=6) x = torch.cat([x, x_1, x_2, x_3], dim=1) return x class ModelEMA(object): def __init__(self, model, decay=0.9999, updates=0): # create EMA self.ema = deepcopy(model).eval() self.updates = updates self.decay = lambda x: decay * (1 - math.exp(-x / 2000.)) for p in self.ema.parameters(): p.requires_grad_(False) def update(self, model): # Update EMA parameters with torch.no_grad(): self.updates += 1 d = self.decay(self.updates) msd = model.state_dict() for k, v in self.ema.state_dict().items(): if v.dtype.is_floating_point: v *= d v += (1. - d) * msd[k].detach() ================================================ FILE: utils/vocapi_evaluator.py ================================================ """Adapted from: @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn Licensed under The MIT License [see LICENSE for details] """ from torch.autograd import Variable from data.voc0712 import VOCDetection, VOC_CLASSES import sys import os import time import numpy as np import pickle import xml.etree.ElementTree as ET class VOCAPIEvaluator(): """ VOC AP Evaluation class """ def __init__(self, data_root, img_size, device, transform, set_type='test', year='2007', display=False): self.data_root = data_root self.img_size = img_size self.device = device self.transform = transform self.labelmap = VOC_CLASSES self.set_type = set_type self.year = year self.display = display # path self.devkit_path = data_root + 'VOC' + year self.annopath = os.path.join(data_root, 'VOC2007', 'Annotations', '%s.xml') self.imgpath = os.path.join(data_root, 'VOC2007', 'JPEGImages', '%s.jpg') self.imgsetpath = os.path.join(data_root, 'VOC2007', 'ImageSets', 'Main', set_type+'.txt') self.output_dir = self.get_output_dir('voc_eval/', self.set_type) # dataset self.dataset = VOCDetection(data_dir=data_root, image_sets=[('2007', set_type)], transform=transform ) def evaluate(self, net): net.eval() num_images = len(self.dataset) # all detections are collected into: # all_boxes[cls][image] = N x 5 array of detections in # (x1, y1, x2, y2, score) self.all_boxes = [[[] for _ in range(num_images)] for _ in range(len(self.labelmap))] # timers det_file = os.path.join(self.output_dir, 'detections.pkl') for i in range(num_images): im, gt, h, w = self.dataset.pull_item(i) x = Variable(im.unsqueeze(0)).to(self.device) t0 = time.time() # forward bboxes, scores, cls_inds = net(x) detect_time = time.time() - t0 scale = np.array([[w, h, w, h]]) bboxes *= scale for j in range(len(self.labelmap)): inds = 
np.where(cls_inds == j)[0] if len(inds) == 0: self.all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) continue c_bboxes = bboxes[inds] c_scores = scores[inds] c_dets = np.hstack((c_bboxes, c_scores[:, np.newaxis])).astype(np.float32, copy=False) self.all_boxes[j][i] = c_dets if i % 500 == 0: print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time)) with open(det_file, 'wb') as f: pickle.dump(self.all_boxes, f, pickle.HIGHEST_PROTOCOL) print('Evaluating detections') self.evaluate_detections(self.all_boxes) print('Mean AP: ', self.map) def parse_rec(self, filename): """ Parse a PASCAL VOC xml file """ tree = ET.parse(filename) objects = [] for obj in tree.findall('object'): obj_struct = {} obj_struct['name'] = obj.find('name').text obj_struct['pose'] = obj.find('pose').text obj_struct['truncated'] = int(obj.find('truncated').text) obj_struct['difficult'] = int(obj.find('difficult').text) bbox = obj.find('bndbox') obj_struct['bbox'] = [int(bbox.find('xmin').text), int(bbox.find('ymin').text), int(bbox.find('xmax').text), int(bbox.find('ymax').text)] objects.append(obj_struct) return objects def get_output_dir(self, name, phase): """Return the directory where experimental artifacts are placed. If the directory does not exist, it is created. A canonical path is built using the name from an imdb and a network (if not None). """ filedir = os.path.join(name, phase) if not os.path.exists(filedir): os.makedirs(filedir) return filedir def get_voc_results_file_template(self, cls): # VOCdevkit/VOC2007/results/det_test_aeroplane.txt filename = 'det_' + self.set_type + '_%s.txt' % (cls) filedir = os.path.join(self.devkit_path, 'results') if not os.path.exists(filedir): os.makedirs(filedir) path = os.path.join(filedir, filename) return path def write_voc_results_file(self, all_boxes): for cls_ind, cls in enumerate(self.labelmap): if self.display: print('Writing {:s} VOC results file'.format(cls)) filename = self.get_voc_results_file_template(cls) with open(filename, 'wt') as f: for im_ind, index in enumerate(self.dataset.ids): dets = all_boxes[cls_ind][im_ind] if dets == []: continue # the VOCdevkit expects 1-based indices for k in range(dets.shape[0]): f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. format(index[1], dets[k, -1], dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1)) def do_python_eval(self, use_07=True): cachedir = os.path.join(self.devkit_path, 'annotations_cache') aps = [] # The PASCAL VOC metric changed in 2010 use_07_metric = use_07 print('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) if not os.path.isdir(self.output_dir): os.mkdir(self.output_dir) for i, cls in enumerate(self.labelmap): filename = self.get_voc_results_file_template(cls) rec, prec, ap = self.voc_eval(detpath=filename, classname=cls, cachedir=cachedir, ovthresh=0.5, use_07_metric=use_07_metric ) aps += [ap] print('AP for {} = {:.4f}'.format(cls, ap)) with open(os.path.join(self.output_dir, cls + '_pr.pkl'), 'wb') as f: pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) if self.display: self.map = np.mean(aps) print('Mean AP = {:.4f}'.format(np.mean(aps))) print('~~~~~~~~') print('Results:') for ap in aps: print('{:.3f}'.format(ap)) print('{:.3f}'.format(np.mean(aps))) print('~~~~~~~~') print('') print('--------------------------------------------------------------') print('Results computed with the **unofficial** Python eval code.') print('Results should be very close to the official MATLAB eval code.') print('--------------------------------------------------------------') else: self.map = np.mean(aps) print('Mean AP = {:.4f}'.format(np.mean(aps))) def voc_ap(self, rec, prec, use_07_metric=True): """ ap = voc_ap(rec, prec, [use_07_metric]) Compute VOC AP given precision and recall. If use_07_metric is true, uses the VOC 07 11 point method (default:True). """ if use_07_metric: # 11 point metric ap = 0. for t in np.arange(0., 1.1, 0.1): if np.sum(rec >= t) == 0: p = 0 else: p = np.max(prec[rec >= t]) ap = ap + p / 11. else: # correct AP calculation # first append sentinel values at the end mrec = np.concatenate(([0.], rec, [1.])) mpre = np.concatenate(([0.], prec, [0.])) # compute the precision envelope for i in range(mpre.size - 1, 0, -1): mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap def voc_eval(self, detpath, classname, cachedir, ovthresh=0.5, use_07_metric=True): if not os.path.isdir(cachedir): os.mkdir(cachedir) cachefile = os.path.join(cachedir, 'annots.pkl') # read list of images with open(self.imgsetpath, 'r') as f: lines = f.readlines() imagenames = [x.strip() for x in lines] if not os.path.isfile(cachefile): # load annots recs = {} for i, imagename in enumerate(imagenames): recs[imagename] = self.parse_rec(self.annopath % (imagename)) if i % 100 == 0 and self.display: print('Reading annotation for {:d}/{:d}'.format( i + 1, len(imagenames))) # save if self.display: print('Saving cached annotations to {:s}'.format(cachefile)) with open(cachefile, 'wb') as f: pickle.dump(recs, f) else: # load with open(cachefile, 'rb') as f: recs = pickle.load(f) # extract gt objects for this class class_recs = {} npos = 0 for imagename in imagenames: R = [obj for obj in recs[imagename] if obj['name'] == classname] bbox = np.array([x['bbox'] for x in R]) difficult = np.array([x['difficult'] for x in R]).astype(bool) det = [False] * len(R) npos = npos + sum(~difficult) class_recs[imagename] = {'bbox': bbox, 'difficult': difficult, 'det': det} # read dets detfile = detpath.format(classname) with open(detfile, 'r') as f: lines = f.readlines() if any(lines): splitlines = [x.strip().split(' ') for x in lines] image_ids = [x[0] for x in splitlines] confidence = np.array([float(x[1]) for x in splitlines]) BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) # sort by confidence sorted_ind = np.argsort(-confidence) sorted_scores =
np.sort(-confidence) BB = BB[sorted_ind, :] image_ids = [image_ids[x] for x in sorted_ind] # go down dets and mark TPs and FPs nd = len(image_ids) tp = np.zeros(nd) fp = np.zeros(nd) for d in range(nd): R = class_recs[image_ids[d]] bb = BB[d, :].astype(float) ovmax = -np.inf BBGT = R['bbox'].astype(float) if BBGT.size > 0: # compute overlaps # intersection ixmin = np.maximum(BBGT[:, 0], bb[0]) iymin = np.maximum(BBGT[:, 1], bb[1]) ixmax = np.minimum(BBGT[:, 2], bb[2]) iymax = np.minimum(BBGT[:, 3], bb[3]) iw = np.maximum(ixmax - ixmin, 0.) ih = np.maximum(iymax - iymin, 0.) inters = iw * ih uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + (BBGT[:, 2] - BBGT[:, 0]) * (BBGT[:, 3] - BBGT[:, 1]) - inters) overlaps = inters / uni ovmax = np.max(overlaps) jmax = np.argmax(overlaps) if ovmax > ovthresh: if not R['difficult'][jmax]: if not R['det'][jmax]: tp[d] = 1. R['det'][jmax] = 1 else: fp[d] = 1. else: fp[d] = 1. # compute precision recall fp = np.cumsum(fp) tp = np.cumsum(tp) rec = tp / float(npos) # avoid divide by zero in case the first detection matches a difficult # ground truth prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) ap = self.voc_ap(rec, prec, use_07_metric) else: rec = -1. prec = -1. ap = -1. return rec, prec, ap def evaluate_detections(self, box_list): self.write_voc_results_file(box_list) self.do_python_eval() if __name__ == '__main__': pass ================================================ FILE: weights/README.md ================================================ # yolo-v2-v3 and tiny model Hi, guys! For researchers in China, you can download the models from BaiduYunDisk. There are 5 models: yolo-v2, yolo-v3, yolo_v3_spp, slim-yolo-v2 and tiny-yolo-v3. The link is as follows: link: https://pan.baidu.com/s/1rnmM8HGFzE2NTv6AkljJdg password: 5c8h I will also upload all models to Google Drive.
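A minimal sketch of how a downloaded checkpoint is meant to be used, mirroring the way test.py builds and loads a detector (the .pth path and the two thresholds below are placeholders; substitute whichever file you downloaded):

```python
import torch

from data import config
from models.yolov2_d19 import YOLOv2D19

device = torch.device('cpu')
cfg = config.yolov2_d19_cfg

# build the detector the same way test.py does (VOC: 20 classes, VOC anchors)
net = YOLOv2D19(device=device,
                input_size=416,
                num_classes=20,
                trainable=False,
                conf_thresh=0.1,   # illustrative value
                nms_thresh=0.5,    # illustrative value
                anchor_size=cfg['anchor_size_voc'])

# load the downloaded weights (placeholder path)
net.load_state_dict(torch.load('weights/voc/yolov2_d19/yolov2_d19.pth',
                               map_location=device))
net.eval()
```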