Repository: Epiphqny/CondInst Branch: master Commit: 7b666c32c04f Files: 46 Total size: 123.8 KB Directory structure: gitextract_p4h8yn0h/ ├── README.md ├── configs/ │ └── CondInst/ │ ├── Base-FCOS.yaml │ ├── MS_R_101_3x.yaml │ ├── MS_R_50_2x.yaml │ ├── MS_X_101_2x.yaml │ ├── R_50_1x.yaml │ └── vovnet/ │ ├── MS_V_39_3x.yaml │ ├── MS_V_57_3x.yaml │ ├── MS_V_99_3x.yaml │ └── README.md ├── demo/ │ ├── demo.py │ └── predictor.py ├── fcos/ │ ├── __init__.py │ ├── checkpoint/ │ │ ├── __init__.py │ │ └── adet_checkpoint.py │ ├── config/ │ │ ├── __init__.py │ │ ├── config.py │ │ └── defaults.py │ ├── data/ │ │ ├── __init__.py │ │ └── builtin.py │ ├── layers/ │ │ ├── __init__.py │ │ ├── conv_with_kaiming_uniform.py │ │ ├── csrc/ │ │ │ ├── cuda_version.cu │ │ │ ├── ml_nms/ │ │ │ │ ├── ml_nms.cu │ │ │ │ └── ml_nms.h │ │ │ └── vision.cpp │ │ ├── deform_conv.py │ │ ├── iou_loss.py │ │ └── ml_nms.py │ ├── modeling/ │ │ ├── __init__.py │ │ ├── backbone/ │ │ │ ├── __init__.py │ │ │ ├── fpn.py │ │ │ ├── mobilenet.py │ │ │ └── vovnet.py │ │ ├── fcos/ │ │ │ ├── __init__.py │ │ │ ├── fcos.py │ │ │ └── fcos_outputs.py │ │ ├── one_stage_detector.py │ │ └── poolers.py │ └── utils/ │ ├── comm.py │ └── measures.py ├── postprocessing.py ├── tools/ │ ├── compute_flops.py │ ├── convert_fcos_weight.py │ └── remove_optim_from_ckpt.py └── train_net.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # CondInst This repository is an unofficial pytorch implementation of [Conditional Convolutions for Instance Segmentation](https://arxiv.org/abs/2003.05664). The model with ResNet-101 backbone achieves 37.1 mAP on COCO val2017 set. ## Install The code is based on [detectron2](https://github.com/facebookresearch/detectron2). 
Please check [Install.md](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md) for installation instructions. ## Training Training follows the same procedure as detectron2. Single GPU: ``` python train_net.py --config-file configs/CondInst/MS_R_101_3x.yaml ``` Multi GPU (for example, 8): ``` python train_net.py --num-gpus 8 --config-file configs/CondInst/MS_R_101_3x.yaml ``` Please adjust IMS_PER_BATCH in the config file according to the available GPU memory. ## Notes I have replaced the original upsample with the aligned upsample according to the [author's issue](https://github.com/Epiphqny/CondInst/issues/1), and the loss is now computed on the upsampled mask. This brings additional gains but may cost more GPU memory; if you do not have much memory, use the original non-upsampled version to calculate the loss. ## Inference First replace the postprocessing.py installed by detectron2 with the [file](https://github.com/Epiphqny/CondInst/blob/master/postprocessing.py) in this repository, as the original file is only suitable for ROI-obtained masks.
The path should look like /miniconda3/envs/py37/lib/python3.7/site-packages/detectron2/modeling/postprocessing.py Single GPU: ``` python train_net.py --config-file configs/CondInst/MS_R_101_3x.yaml --eval-only MODEL.WEIGHTS /path/to/checkpoint_file ``` Multi GPU (for example, 8): ``` python train_net.py --num-gpus 8 --config-file configs/CondInst/MS_R_101_3x.yaml --eval-only MODEL.WEIGHTS /path/to/checkpoint_file ``` ## Weights The trained model can be downloaded from [Google Drive](https://drive.google.com/file/d/17-g91zwJzt99G8APza0IaleWYLC3kTMK/view?usp=sharing) ## Results After training for 36 epochs on the COCO dataset with the ResNet-101 backbone, the mAP is 0.371 on the COCO val2017 set: ## Visualization ================================================ FILE: configs/CondInst/Base-FCOS.yaml ================================================ MODEL: META_ARCHITECTURE: "OneStageDetector" BACKBONE: NAME: "build_fcos_resnet_fpn_backbone" RESNETS: OUT_FEATURES: ["res3", "res4", "res5"] FPN: IN_FEATURES: ["res3", "res4", "res5"] PROPOSAL_GENERATOR: NAME: "FCOS" # PIXEL_MEAN: [102.9801, 115.9465, 122.7717] MASK_ON: True DATASETS: TRAIN: ("coco_2017_train",) TEST: ("coco_2017_val",) SOLVER: IMS_PER_BATCH: 4 BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate STEPS: (60000, 80000) MAX_ITER: 90000 INPUT: MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) MASK_FORMAT: "bitmask" ================================================ FILE: configs/CondInst/MS_R_101_3x.yaml ================================================ _BASE_: "Base-FCOS.yaml" MODEL: WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" RESNETS: DEPTH: 101 SOLVER: STEPS: (60000, 80000) MAX_ITER: 90000 OUTPUT_DIR: "output/fcos/R_101_3x" ================================================ FILE: configs/CondInst/MS_R_50_2x.yaml ================================================ _BASE_: "Base-FCOS.yaml" MODEL: WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" RESNETS: DEPTH: 50 SOLVER: STEPS: (120000, 
160000) MAX_ITER: 180000 OUTPUT_DIR: "output/fcos/R_50_2x" ================================================ FILE: configs/CondInst/MS_X_101_2x.yaml ================================================ _BASE_: "Base-FCOS.yaml" MODEL: WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" PIXEL_STD: [57.375, 57.120, 58.395] RESNETS: STRIDE_IN_1X1: False # this is a C2 model NUM_GROUPS: 32 WIDTH_PER_GROUP: 8 DEPTH: 101 SOLVER: STEPS: (120000, 160000) MAX_ITER: 180000 OUTPUT_DIR: "output/fcos/X_101_2x" ================================================ FILE: configs/CondInst/R_50_1x.yaml ================================================ _BASE_: "Base-FCOS.yaml" MODEL: WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" RESNETS: DEPTH: 50 INPUT: MIN_SIZE_TRAIN: (800,) SOLVER: WARMUP_METHOD: "constant" WARMUP_FACTOR: 0.3333 WARMUP_ITERS: 500 OUTPUT_DIR: "output/fcos/R_50_1x" ================================================ FILE: configs/CondInst/vovnet/MS_V_39_3x.yaml ================================================ _BASE_: "../Base-FCOS.yaml" MODEL: WEIGHTS: "https://www.dropbox.com/s/q98pypf96rhtd8y/vovnet39_ese_detectron2.pth?dl=1" BACKBONE: NAME: "build_fcos_vovnet_fpn_backbone" FREEZE_AT: 0 VOVNET: CONV_BODY : "V-39-eSE" OUT_FEATURES: ["stage3", "stage4", "stage5"] FPN: IN_FEATURES: ["stage3", "stage4", "stage5"] SOLVER: STEPS: (210000, 250000) MAX_ITER: 270000 OUTPUT_DIR: "output/fcos/V_39_ms_3x" ================================================ FILE: configs/CondInst/vovnet/MS_V_57_3x.yaml ================================================ _BASE_: "../Base-FCOS.yaml" MODEL: WEIGHTS: "https://www.dropbox.com/s/8xl0cb3jj51f45a/vovnet57_ese_detectron2.pth?dl=1" BACKBONE: NAME: "build_fcos_vovnet_fpn_backbone" FREEZE_AT: 0 VOVNET: CONV_BODY : "V-57-eSE" OUT_FEATURES: ["stage3", "stage4", "stage5"] FPN: IN_FEATURES: ["stage3", "stage4", "stage5"] SOLVER: STEPS: (210000, 250000) MAX_ITER: 270000 OUTPUT_DIR: "output/fcos/V_57_ms_3x" 
================================================ FILE: configs/CondInst/vovnet/MS_V_99_3x.yaml ================================================ _BASE_: "../Base-FCOS.yaml" MODEL: WEIGHTS: "https://www.dropbox.com/s/1mlv31coewx8trd/vovnet99_ese_detectron2.pth?dl=1" BACKBONE: NAME: "build_fcos_vovnet_fpn_backbone" FREEZE_AT: 0 VOVNET: CONV_BODY : "V-99-eSE" OUT_FEATURES: ["stage3", "stage4", "stage5"] FPN: IN_FEATURES: ["stage3", "stage4", "stage5"] SOLVER: STEPS: (210000, 250000) MAX_ITER: 270000 OUTPUT_DIR: "output/fcos/V_99_ms_3x" ================================================ FILE: configs/CondInst/vovnet/README.md ================================================ # [VoVNet-v2](https://github.com/youngwanLEE/CenterMask) backbone networks in [FCOS](https://github.com/aim-uofa/adet) **Efficient Backbone Network for Object Detection and Segmentation**\ Youngwan Lee [[`vovnet-detectron2`](https://github.com/youngwanLEE/vovnet-detectron2)][[`CenterMask(code)`](https://github.com/youngwanLEE/CenterMask)] [[`VoVNet-v1(arxiv)`](https://arxiv.org/abs/1904.09730)] [[`VoVNet-v2(arxiv)`](https://arxiv.org/abs/1911.06667)] [[`BibTeX`](#CitingVoVNet)]
## Comparison with Faster R-CNN and ResNet ### Note We measure the inference time of all models with batch size 1 on the same V100 GPU machine. - pytorch1.3.1 - CUDA 10.1 - cuDNN 7.3 |Method|Backbone|lr sched|inference time|AP|APs|APm|APl|download| |---|:--------:|:---:|:--:|--|----|----|---|--------| |Faster|R-50-FPN|3x|0.047|40.2|24.2|43.5|52.0|model \| metrics |Faster|**V2-39-FPN**|3x|0.047|42.7|27.1|45.6|54.0|model \| metrics |**FCOS**|**V2-39-FPN**|3x|0.045|43.5|28.1|47.2|54.5|model \| metrics || |Faster|R-101-FPN|3x|0.063|42.0|25.2|45.6|54.6|model \| metrics |Faster|**V2-57-FPN**|3x|0.054|43.3|27.5|46.7|55.3|model \| metrics |**FCOS**|**V2-57-FPN**|3x|0.051|44.4|28.8|47.2|56.3|model \| metrics || |Faster|X-101-FPN|3x|0.120|43.0|27.2|46.1|54.9|model \| metrics| |Faster|**V2-99-FPN**|3x|0.073|44.1|28.1|47.0|56.4|model \| metrics| |**FCOS**|**V2-99-FPN**|3x|0.070|45.2|29.2|48.4|57.3|model \| metrics| ## Citing VoVNet If you use VoVNet, please use the following BibTeX entry. ```BibTeX @inproceedings{lee2019energy, title = {An Energy and GPU-Computation Efficient Backbone Network for Real-Time Object Detection}, author = {Lee, Youngwan and Hwang, Joong-won and Lee, Sangrok and Bae, Yuseok and Park, Jongyoul}, booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops}, year = {2019} } @article{lee2019centermask, title={CenterMask: Real-Time Anchor-Free Instance Segmentation}, author={Lee, Youngwan and Park, Jongyoul}, journal={arXiv preprint arXiv:1911.06667}, year={2019} } ``` ================================================ FILE: demo/demo.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# constants
WINDOW_NAME = "COCO detections"
_ESC_KEY = 27  # key code compared against cv2.waitKey(); pressing Esc quits


def setup_cfg(args):
    """Build the frozen config used by the demo.

    The config file named by ``args.config_file`` is merged first, then the
    ``KEY VALUE`` overrides in ``args.opts``. Finally the score thresholds of
    every supported head are set to ``args.confidence_threshold``.

    Args:
        args: parsed command-line namespace (see :func:`get_parser`).

    Returns:
        The frozen config node.

    NOTE(review): ``get_cfg`` is imported from ``adet.config`` at the top of
    this file, while this repository ships its config package as
    ``fcos.config`` -- confirm which package is meant to be installed.
    """
    # load config from file and command-line arguments
    cfg = get_cfg()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    # Set score_threshold for builtin models
    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
    cfg.MODEL.FCOS.INFERENCE_TH_TEST = args.confidence_threshold
    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
    cfg.freeze()
    return cfg


def get_parser():
    """Create the command-line parser of the demo.

    Returns:
        argparse.ArgumentParser: parser exposing ``--config-file``,
        ``--webcam``, ``--video-input``, ``--input``, ``--output``,
        ``--confidence-threshold`` and ``--opts``.

    NOTE(review): the default ``--config-file`` is inherited from the
    detectron2 demo; no such file appears under this repository's ``configs/``
    directory, so callers presumably always pass ``--config-file`` explicitly.
    """
    parser = argparse.ArgumentParser(description="Detectron2 Demo")
    parser.add_argument(
        "--config-file",
        default="configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_inference_acc_test.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
    parser.add_argument("--video-input", help="Path to video file.")
    parser.add_argument("--input", nargs="+", help="A list of space separated input images")
    parser.add_argument(
        "--output",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )

    parser.add_argument(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        nargs=argparse.REMAINDER,
    )
    return parser


if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg)

    if args.input:
        # A single directory expands to its entries; a single non-directory
        # argument is treated as a glob pattern.
        if os.path.isdir(args.input[0]):
            args.input = [os.path.join(args.input[0], fname) for fname in os.listdir(args.input[0])]
        elif len(args.input) == 1:
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"
        for path in tqdm.tqdm(args.input, disable=not args.output):
            # use PIL, to be consistent with evaluation
            img = read_image(path, format="BGR")
            start_time = time.time()
            predictions, visualized_output = demo.run_on_image(img)
            # Only instance predictions can be counted; other outputs
            # (panoptic / semantic only) must not crash the logging call.
            if "instances" in predictions:
                summary = "detected {} instances".format(len(predictions["instances"]))
            else:
                summary = "finished"
            logger.info(
                "{}: {} in {:.2f}s".format(path, summary, time.time() - start_time)
            )

            if args.output:
                if os.path.isdir(args.output):
                    out_filename = os.path.join(args.output, os.path.basename(path))
                else:
                    assert len(args.input) == 1, "Please specify a directory with args.output"
                    out_filename = args.output
                visualized_output.save(out_filename)
            else:
                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
                if cv2.waitKey(0) == _ESC_KEY:
                    break  # esc to quit
    elif args.webcam:
        assert args.input is None, "Cannot have both --input and --webcam!"
        cam = cv2.VideoCapture(0)
        try:
            for vis in tqdm.tqdm(demo.run_on_video(cam)):
                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
                cv2.imshow(WINDOW_NAME, vis)
                if cv2.waitKey(1) == _ESC_KEY:
                    break  # esc to quit
        finally:
            # Release the camera even when the loop is interrupted.
            cam.release()
            cv2.destroyAllWindows()
    elif args.video_input:
        # Validate the input before opening it or creating any output file.
        assert os.path.isfile(args.video_input), args.video_input
        video = cv2.VideoCapture(args.video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames_per_second = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        basename = os.path.basename(args.video_input)

        output_file = None
        if args.output:
            if os.path.isdir(args.output):
                output_fname = os.path.join(args.output, basename)
                output_fname = os.path.splitext(output_fname)[0] + ".mkv"
            else:
                output_fname = args.output
            assert not os.path.isfile(output_fname), output_fname
            output_file = cv2.VideoWriter(
                filename=output_fname,
                # some installation of opencv may not support x264 (due to its license),
                # you can try other format (e.g. MPEG)
                fourcc=cv2.VideoWriter_fourcc(*"x264"),
                fps=float(frames_per_second),
                frameSize=(width, height),
                isColor=True,
            )
        try:
            for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
                if output_file is not None:
                    output_file.write(vis_frame)
                else:
                    cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
                    cv2.imshow(basename, vis_frame)
                    if cv2.waitKey(1) == _ESC_KEY:
                        break  # esc to quit
        finally:
            video.release()
            if output_file is not None:
                output_file.release()
            else:
                cv2.destroyAllWindows()
class VisualizationDemo(object):
    """Run a predictor on images or videos and render its predictions."""

    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode):
            instance_mode (ColorMode):
            parallel (bool): whether to run the model in different processes from visualization.
                Useful since the visualization logic can be slow.
        """
        self.metadata = MetadataCatalog.get(
            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        )
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode

        self.parallel = parallel
        if parallel:
            num_gpu = torch.cuda.device_count()
            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
        else:
            self.predictor = DefaultPredictor(cfg)

    def run_on_image(self, image):
        """
        Args:
            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.

        Returns:
            predictions (dict): the output of the model.
            vis_output (VisImage): the visualized image output; ``None`` when
                the predictions contain none of the drawable keys.
        """
        vis_output = None
        predictions = self.predictor(image)
        # Convert image from OpenCV BGR format to Matplotlib RGB format.
        image = image[:, :, ::-1]
        visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
        if "inst" in predictions:
            # NOTE(review): relies on the Visualizer in use providing `vis_inst` -- confirm.
            visualizer.vis_inst(predictions["inst"])
        if "bases" in predictions:
            self.vis_bases(predictions["bases"])
        if "panoptic_seg" in predictions:
            panoptic_seg, segments_info = predictions["panoptic_seg"]
            vis_output = visualizer.draw_panoptic_seg_predictions(
                panoptic_seg.to(self.cpu_device), segments_info
            )
        else:
            if "sem_seg" in predictions:
                vis_output = visualizer.draw_sem_seg(
                    predictions["sem_seg"].argmax(dim=0).to(self.cpu_device))
            if "instances" in predictions:
                instances = predictions["instances"].to(self.cpu_device)
                vis_output = visualizer.draw_instance_predictions(predictions=instances)

        return predictions, vis_output

    def _frame_from_video(self, video):
        """Yield frames from ``video`` until it is closed or a read fails."""
        while video.isOpened():
            success, frame = video.read()
            if success:
                yield frame
            else:
                break

    def vis_bases(self, bases):
        """Show the bases of the first image in a two-column matplotlib grid.

        Each basis is rendered as an HSV image whose hue and saturation come
        from a fixed palette and whose value channel is the normalised basis.

        Args:
            bases: indexable whose first element is a tensor of bases.
                Presumably shaped (1, num_bases, H, W) or (num_bases, H, W)
                -- TODO confirm against the model output.
        """
        basis_colors = [[2, 200, 255], [107, 220, 255], [30, 200, 255], [60, 220, 255]]
        bases = bases[0].squeeze()
        bases = (bases / 8).tanh().cpu().numpy()
        num_bases = len(bases)
        # Round the row count up so an odd number of bases still fits, and keep
        # `axes` two-dimensional (squeeze=False) so a single row can be indexed
        # the same way as several rows.
        num_rows = max((num_bases + 1) // 2, 1)
        _, axes = plt.subplots(nrows=num_rows, ncols=2, squeeze=False)
        for i, basis in enumerate(bases):
            basis = (basis + 1) / 2
            basis = basis / basis.max()
            # Cycle through the palette instead of overflowing it when there
            # are more bases than colours.
            color = basis_colors[i % len(basis_colors)]
            basis_viz = np.zeros((basis.shape[0], basis.shape[1], 3), dtype=np.uint8)
            basis_viz[:, :, 0] = color[0]
            basis_viz[:, :, 1] = color[1]
            basis_viz[:, :, 2] = np.uint8(basis * 255)
            basis_viz = cv2.cvtColor(basis_viz, cv2.COLOR_HSV2RGB)
            axes[i // 2][i % 2].imshow(basis_viz)
        plt.show()

    def run_on_video(self, video):
        """
        Visualizes predictions on frames of the input video.

        Args:
            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
                either a webcam or a video file.

        Yields:
            ndarray: BGR visualizations of each video frame.

        Raises:
            ValueError: if a frame's predictions contain none of
                "panoptic_seg", "instances" or "sem_seg".
        """
        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)

        def process_predictions(frame, predictions):
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            if "panoptic_seg" in predictions:
                panoptic_seg, segments_info = predictions["panoptic_seg"]
                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
                    frame, panoptic_seg.to(self.cpu_device), segments_info
                )
            elif "instances" in predictions:
                predictions = predictions["instances"].to(self.cpu_device)
                vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
            elif "sem_seg" in predictions:
                vis_frame = video_visualizer.draw_sem_seg(
                    frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
                )
            else:
                # Without this branch `vis_frame` would be unbound below.
                raise ValueError(
                    "Predictions contain none of 'panoptic_seg', 'instances', 'sem_seg'"
                )

            # Converts Matplotlib RGB format to OpenCV BGR format
            vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
            return vis_frame

        frame_gen = self._frame_from_video(video)
        if self.parallel:
            buffer_size = self.predictor.default_buffer_size

            # Frames wait here until their (in-order) prediction is available.
            frame_data = deque()

            for cnt, frame in enumerate(frame_gen):
                frame_data.append(frame)
                self.predictor.put(frame)

                if cnt >= buffer_size:
                    frame = frame_data.popleft()
                    predictions = self.predictor.get()
                    yield process_predictions(frame, predictions)

            # Drain the frames still in flight once the video is exhausted.
            while len(frame_data):
                frame = frame_data.popleft()
                predictions = self.predictor.get()
                yield process_predictions(frame, predictions)
        else:
            for frame in frame_gen:
                yield process_predictions(frame, self.predictor(frame))


class AsyncPredictor:
    """
    A predictor that runs the model asynchronously, possibly on >1 GPUs.
    Because rendering the visualization takes a considerable amount of time,
    this helps improve throughput when rendering videos.
    """

    class _StopToken:
        """Sentinel put on the task queue to make a worker exit its loop."""

    class _PredictWorker(mp.Process):
        """Worker process that serves prediction tasks from a shared queue."""

        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()

        def run(self):
            # The predictor is built inside the child process.
            predictor = DefaultPredictor(self.cfg)

            while True:
                task = self.task_queue.get()
                if isinstance(task, AsyncPredictor._StopToken):
                    break
                idx, data = task
                result = predictor(data)
                self.result_queue.put((idx, result))

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        num_workers = max(num_gpus, 1)
        self.task_queue = mp.Queue(maxsize=num_workers * 3)
        self.result_queue = mp.Queue(maxsize=num_workers * 3)
        self.procs = []
        for gpuid in range(num_workers):
            # Each worker gets its own config copy pinned to one device.
            cfg = cfg.clone()
            cfg.defrost()
            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
            self.procs.append(
                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
            )

        # Indices of the last submitted / last returned request.
        self.put_idx = 0
        self.get_idx = 0
        # Results that arrived out of order, kept sorted by request index.
        self.result_rank = []
        self.result_data = []

        for p in self.procs:
            p.start()
        atexit.register(self.shutdown)

    def put(self, image):
        """Submit ``image`` for prediction, tagged with the next request index."""
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        """Return the next result in submission order, blocking until it arrives."""
        self.get_idx += 1  # the index needed for this request
        if len(self.result_rank) and self.result_rank[0] == self.get_idx:
            res = self.result_data[0]
            del self.result_data[0], self.result_rank[0]
            return res

        while True:
            # make sure the results are returned in the correct order
            idx, res = self.result_queue.get()
            if idx == self.get_idx:
                return res
            insert = bisect.bisect(self.result_rank, idx)
            self.result_rank.insert(insert, idx)
            self.result_data.insert(insert, res)

    def __len__(self):
        return self.put_idx - self.get_idx

    def __call__(self, image):
        self.put(image)
        return self.get()

    def shutdown(self):
        """Send one stop token per worker process."""
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        return len(self.procs) * 5
class AdetCheckpointer(DetectionCheckpointer):
    """
    Same as :class:`DetectionCheckpointer`, but is able to convert models
    in AdelaiDet, such as LPF backbone.
    """

    def _load_file(self, filename):
        """Load a checkpoint file and return it as a dict with a "model" entry.

        Non-``.pkl`` files are loaded by the parent class and wrapped under
        "model" when that key is missing; file names containing "lpf" also
        enable matching heuristics. ``.pkl`` files are unpickled and treated
        either as a Detectron2 model-zoo file (returned unchanged) or as a
        Caffe2 / Detectron1 file (momentum entries dropped).
        """
        if not filename.endswith(".pkl"):
            # load native pth checkpoint
            checkpoint = super()._load_file(filename)
            if "model" not in checkpoint:
                checkpoint = {"model": checkpoint}
            if "lpf" in filename:
                checkpoint["matching_heuristics"] = True
            return checkpoint

        with PathManager.open(filename, "rb") as handle:
            payload = pickle.load(handle, encoding="latin1")

        if "model" in payload and "__author__" in payload:
            # file is in Detectron2 model zoo format
            self.logger.info("Reading a file from '{}'".format(payload["__author__"]))
            return payload

        # assume file is from Caffe2 / Detectron1 model zoo
        # Detection models have "blobs", but ImageNet models don't
        weights = payload["blobs"] if "blobs" in payload else payload
        weights = {
            name: value
            for name, value in weights.items()
            if not name.endswith("_momentum")
        }
        return {"model": weights, "__author__": "Caffe2", "matching_heuristics": True}
# This module extends the imported default config node `_C` in place: it adds
# a MOBILENET flag plus two new sub-nodes, MODEL.FCOS and MODEL.VOVNET.

# ---------------------------------------------------------------------------- #
# Additional Configs
# ---------------------------------------------------------------------------- #
_C.MODEL.MOBILENET = False

# ---------------------------------------------------------------------------- #
# FCOS Head
# ---------------------------------------------------------------------------- #
_C.MODEL.FCOS = CN()

# This is the number of foreground classes.
_C.MODEL.FCOS.NUM_CLASSES = 80
# IN_FEATURES and FPN_STRIDES both have five entries; presumably one stride per
# input feature level -- TODO confirm against the FCOS head.
_C.MODEL.FCOS.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
_C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
_C.MODEL.FCOS.PRIOR_PROB = 0.01
# Separate *_TRAIN / *_TEST values are kept for the score threshold and the
# pre-/post-NMS top-k limits.
_C.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.05
_C.MODEL.FCOS.INFERENCE_TH_TEST = 0.05
_C.MODEL.FCOS.NMS_TH = 0.6
_C.MODEL.FCOS.PRE_NMS_TOPK_TRAIN = 1000
_C.MODEL.FCOS.PRE_NMS_TOPK_TEST = 1000
_C.MODEL.FCOS.POST_NMS_TOPK_TRAIN = 100
_C.MODEL.FCOS.POST_NMS_TOPK_TEST = 100
_C.MODEL.FCOS.TOP_LEVELS = 2
_C.MODEL.FCOS.NORM = "GN"  # Support GN or none
_C.MODEL.FCOS.USE_SCALE = True

# Multiply centerness before threshold
# This will affect the final performance by about 0.05 AP but save some time
_C.MODEL.FCOS.THRESH_WITH_CTR = False

# Focal loss parameters
_C.MODEL.FCOS.LOSS_ALPHA = 0.25
_C.MODEL.FCOS.LOSS_GAMMA = 2.0
# Four boundaries for five feature levels; presumably the object-size limits
# separating consecutive levels -- TODO confirm against the target assignment.
_C.MODEL.FCOS.SIZES_OF_INTEREST = [64, 128, 256, 512]
_C.MODEL.FCOS.USE_RELU = True
_C.MODEL.FCOS.USE_DEFORMABLE = False

# the number of convolutions used in the cls and bbox tower
_C.MODEL.FCOS.NUM_CLS_CONVS = 4
_C.MODEL.FCOS.NUM_BOX_CONVS = 4
_C.MODEL.FCOS.NUM_SHARE_CONVS = 0
_C.MODEL.FCOS.CENTER_SAMPLE = True
_C.MODEL.FCOS.POS_RADIUS = 1.5
_C.MODEL.FCOS.LOC_LOSS_TYPE = 'giou'

# ---------------------------------------------------------------------------- #
# VoVNet backbone
# ---------------------------------------------------------------------------- #
_C.MODEL.VOVNET = CN()

_C.MODEL.VOVNET.CONV_BODY = "V-39-eSE"
_C.MODEL.VOVNET.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]

# Options: FrozenBN, GN, "SyncBN", "BN"
_C.MODEL.VOVNET.NORM = "FrozenBN"

_C.MODEL.VOVNET.OUT_CHANNELS = 256

_C.MODEL.VOVNET.BACKBONE_OUT_CHANNELS = 256
def conv_with_kaiming_uniform(
        norm=None, activation=None,
        use_deformable=False, use_sep=False):
    """Return a factory that builds conv (+ norm) (+ ReLU) layers.

    Args:
        norm: ``None`` for no normalisation (the conv then carries a bias),
            ``"GN"`` for a 32-group ``nn.GroupNorm``, or any other value
            accepted by ``get_norm``.
        activation: when not ``None`` an in-place ``nn.ReLU`` is appended;
            the value itself is not inspected.
        use_deformable: build the conv with ``DFConv2d`` instead of ``Conv2d``
            and skip the explicit weight/bias initialisation.
        use_sep: use one group per input channel; requires
            ``in_channels == out_channels``.

    Returns:
        ``make_conv(in_channels, out_channels, kernel_size, stride=1,
        dilation=1)``, which returns the bare conv when nothing is appended
        and an ``nn.Sequential`` otherwise.
    """
    def make_conv(
        in_channels, out_channels, kernel_size, stride=1, dilation=1
    ):
        layer_cls = DFConv2d if use_deformable else Conv2d
        if use_sep:
            assert in_channels == out_channels
        num_groups = in_channels if use_sep else 1
        has_norm = norm is not None

        conv = layer_cls(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=dilation * (kernel_size - 1) // 2,
            dilation=dilation,
            groups=num_groups,
            bias=not has_norm,
        )
        if not use_deformable:
            # Caffe2 implementation uses XavierFill, which in fact
            # corresponds to kaiming_uniform_ in PyTorch
            nn.init.kaiming_uniform_(conv.weight, a=1)
            if not has_norm:
                nn.init.constant_(conv.bias, 0)

        layers = [conv]
        if has_norm:
            layers.append(
                nn.GroupNorm(32, out_channels)
                if norm == "GN"
                else get_norm(norm, out_channels)
            )
        if activation is not None:
            layers.append(nn.ReLU(inplace=True))

        # A lone conv is returned as-is rather than wrapped in a Sequential.
        return conv if len(layers) == 1 else nn.Sequential(*layers)

    return make_conv
// Label-aware IoU between two boxes.
//
// Each box is read as 6 floats. Indices 0..3 are used as the left, top,
// right and bottom edges, and index 5 is compared as the class label
// (index 4 is not read here). Boxes with different labels never overlap:
// the function returns 0 for them. Widths and heights use the inclusive
// "+1" pixel convention.
__device__ inline float devIoU(float const * const a, float const * const b) {
  // Different classes: treat as non-overlapping.
  if (a[5] != b[5]) {
    return 0.0;
  }

  // Intersection rectangle.
  float x_lo = max(a[0], b[0]);
  float x_hi = min(a[2], b[2]);
  float y_lo = max(a[1], b[1]);
  float y_hi = min(a[3], b[3]);

  // Clamp at zero so disjoint boxes yield an empty intersection.
  float inter_w = max(x_hi - x_lo + 1, 0.f);
  float inter_h = max(y_hi - y_lo + 1, 0.f);
  float inter_area = inter_w * inter_h;

  float area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);

  return inter_area / (area_a + area_b - inter_area);
}
namespace adet {

// Multi-label NMS on the GPU.
//
// `boxes` is an N x 6 CUDA tensor; column 4 is used as the score for
// ordering, and the kernel compares column 5 as the class label. Boxes are
// processed in descending score order; a box is kept unless an
// already-kept box suppressed it (IoU above `nms_overlap_thresh`, as
// computed by `ml_nms_kernel`).
//
// Returns a 1-D int64 tensor, on the same device as the sort order, holding
// the indices (into the ORIGINAL `boxes`) of the kept boxes, sorted in
// ascending index order.
at::Tensor ml_nms_cuda(const at::Tensor boxes, const float nms_overlap_thresh) {
  using scalar_t = float;
  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");

  // Sort by score (column 4), highest first.
  auto scores = boxes.select(1, 4);
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  int boxes_num = boxes.size(0);

  // One 64-bit word of suppression flags per block of threadsPerBlock boxes.
  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);

  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();

  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState

  unsigned long long* mask_dev = NULL;
  mask_dev = (unsigned long long*) THCudaMalloc(
      state, boxes_num * col_blocks * sizeof(unsigned long long));

  // Every (row block, column block) pair is handled by one thread block.
  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
              THCCeilDiv(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  ml_nms_kernel<<<blocks, threads>>>(boxes_num,
                                     nms_overlap_thresh,
                                     boxes_dev,
                                     mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  THCudaCheck(cudaMemcpy(mask_host.data(),
                         mask_dev,
                         sizeof(unsigned long long) * boxes_num * col_blocks,
                         cudaMemcpyDeviceToHost));

  // Accumulated "already suppressed" bits; value-initialized to zero by the
  // vector constructor, so no explicit memset is needed.
  std::vector<unsigned long long> remv(col_blocks);

  at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      // Box i survives: record it and merge the boxes it suppresses.
      keep_out[num_to_keep++] = i;
      unsigned long long *p = mask_host.data() + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  THCudaFree(state, mask_dev);
  // TODO improve this part
  // Map positions in the sorted order back to original indices and return
  // them in ascending order.
  return std::get<0>(order_t.index({
                       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
                         order_t.device(), keep.scalar_type())
                     }).sort(0, false));
}

} // namespace adet
class DFConv2d(nn.Module):
    """
    Deformable convolutional layer with configurable
    deformable groups, dilations and groups.

    An ordinary convolution (``self.offset``) predicts the sampling offsets
    (and, for the modulated variant, the modulation mask) that are fed to
    the deformable convolution (``self.conv``).

    Code is from:
    https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/layers/misc.py

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        with_modulated_dcn: use ``ModulatedDeformConv`` (offsets + mask)
            instead of ``DeformConv`` (offsets only).
        kernel_size, stride, dilation: ints, or 2-element lists/tuples. When
            ``kernel_size`` is a list/tuple, ``stride`` and ``dilation``
            must be too.
        groups: groups of the deformable convolution.
        deformable_groups: number of deformable groups.
        bias: whether the deformable convolution has a bias.
        padding: accepted for signature compatibility only; the padding is
            always recomputed as ``dilation * (kernel_size - 1) // 2``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        with_modulated_dcn=True,
        kernel_size=3,
        stride=1,
        groups=1,
        dilation=1,
        deformable_groups=1,
        bias=False,
        padding=None
    ):
        super(DFConv2d, self).__init__()
        if isinstance(kernel_size, (list, tuple)):
            assert isinstance(stride, (list, tuple))
            assert isinstance(dilation, (list, tuple))
            assert len(kernel_size) == 2
            assert len(stride) == 2
            assert len(dilation) == 2
            padding = (
                dilation[0] * (kernel_size[0] - 1) // 2,
                dilation[1] * (kernel_size[1] - 1) // 2
            )
            offset_base_channels = kernel_size[0] * kernel_size[1]
        else:
            padding = dilation * (kernel_size - 1) // 2
            offset_base_channels = kernel_size * kernel_size

        # The deformable ops live in detectron2; importing them from this
        # module itself (``.deform_conv``) fails because it does not define them.
        if with_modulated_dcn:
            from detectron2.layers import ModulatedDeformConv
            offset_channels = offset_base_channels * 3  # default: 27
            conv_block = ModulatedDeformConv
        else:
            from detectron2.layers import DeformConv
            offset_channels = offset_base_channels * 2  # default: 18
            conv_block = DeformConv

        self.offset = Conv2d(
            in_channels,
            deformable_groups * offset_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=1,
            dilation=dilation
        )
        nn.init.kaiming_uniform_(self.offset.weight, a=1)
        torch.nn.init.constant_(self.offset.bias, 0.)

        self.conv = conv_block(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            deformable_groups=deformable_groups,
            bias=bias
        )
        self.with_modulated_dcn = with_modulated_dcn
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        # The first ``offset_split`` channels of the offset branch are the
        # (x, y) offsets; the remainder (modulated variant) is the mask.
        self.offset_split = offset_base_channels * deformable_groups * 2

    @staticmethod
    def _as_pair(value):
        """Return ``value`` as a 2-tuple, repeating a scalar for both spatial dims."""
        if isinstance(value, (list, tuple)):
            return tuple(value)
        return (value, value)

    def forward(self, x, return_offset=False):
        """
        Apply the deformable convolution.

        Args:
            x: input tensor; may be empty (``x.numel() == 0``).
            return_offset: if True (non-empty input only), also return the raw
                output of the offset branch.

        Returns:
            The convolved tensor, or ``(tensor, offset_mask)`` when
            ``return_offset`` is set. For empty input an empty tensor of the
            computed output shape is returned.
        """
        if x.numel() > 0:
            offset_mask = self.offset(x)
            if not self.with_modulated_dcn:
                x = self.conv(x, offset_mask)
            else:
                offset = offset_mask[:, :self.offset_split, :, :]
                mask = offset_mask[:, self.offset_split:, :, :].sigmoid()
                x = self.conv(x, offset, mask)
            if return_offset:
                return x, offset_mask
            return x

        # Empty input: compute the output shape analytically. The stored
        # hyper-parameters may be plain ints, so normalize them to pairs
        # before zipping over the two spatial dimensions.
        output_shape = [
            (i + 2 * p - (di * (k - 1) + 1)) // d + 1
            for i, p, di, k, d in zip(
                x.shape[-2:],
                self._as_pair(self.padding),
                self._as_pair(self.dilation),
                self._as_pair(self.kernel_size),
                self._as_pair(self.stride)
            )
        ]
        output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape
        return _NewEmptyTensorOp.apply(x, output_shape)
def ml_nms(boxlist, nms_thresh, max_proposals=-1,
           score_field="scores", label_field="labels"):
    """
    Run class-aware (multi-label) non-maximum suppression on a set of
    predictions.

    Args:
        boxlist: container of predictions. It must expose
            ``pred_boxes.tensor``, ``scores`` and ``pred_classes`` and
            support indexing with the tensor of kept indices.
        nms_thresh (float): IoU threshold handed to ``batched_nms``. When it
            is ``<= 0`` NMS is skipped and ``boxlist`` is returned untouched.
        max_proposals (int): if > 0, then only the top max_proposals are
            kept after non-maximum suppression.
        score_field (str): unused; scores are always read from ``scores``.
        label_field (str): unused; labels are always read from
            ``pred_classes``.

    Returns:
        ``boxlist`` restricted to the kept predictions.
    """
    # A non-positive threshold disables NMS entirely.
    if nms_thresh <= 0:
        return boxlist

    kept = batched_nms(
        boxlist.pred_boxes.tensor,
        boxlist.scores,
        boxlist.pred_classes,
        nms_thresh,
    )
    if max_proposals > 0:
        kept = kept[:max_proposals]
    return boxlist[kept]
""" def __init__(self, in_channels, out_channels, in_features="res5"): super().__init__() self.num_levels = 2 self.in_feature = in_features self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) for module in [self.p6, self.p7]: weight_init.c2_xavier_fill(module) def forward(self, x): p6 = self.p6(x) p7 = self.p7(F.relu(p6)) return [p6, p7] class LastLevelP6(nn.Module): """ This module is used in FCOS to generate extra layers """ def __init__(self, in_channels, out_channels, in_features="res5"): super().__init__() self.num_levels = 1 self.in_feature = in_features self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) for module in [self.p6]: weight_init.c2_xavier_fill(module) def forward(self, x): p6 = self.p6(x) return [p6] @BACKBONE_REGISTRY.register() def build_fcos_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): """ Args: cfg: a detectron2 CfgNode Returns: backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 
""" if cfg.MODEL.MOBILENET: bottom_up = build_mnv2_backbone(cfg, input_shape) else: bottom_up = build_resnet_backbone(cfg, input_shape) in_features = cfg.MODEL.FPN.IN_FEATURES out_channels = cfg.MODEL.FPN.OUT_CHANNELS top_levels = cfg.MODEL.FCOS.TOP_LEVELS in_channels_top = out_channels if top_levels == 2: top_block = LastLevelP6P7(in_channels_top, out_channels, "p5") if top_levels == 1: top_block = LastLevelP6(in_channels_top, out_channels, "p5") elif top_levels == 0: top_block = None backbone = FPN( bottom_up=bottom_up, in_features=in_features, out_channels=out_channels, norm=cfg.MODEL.FPN.NORM, top_block=top_block, fuse_type=cfg.MODEL.FPN.FUSE_TYPE, ) return backbone ================================================ FILE: fcos/modeling/backbone/mobilenet.py ================================================ # taken from https://github.com/tonylins/pytorch-mobilenet-v2/ # Published by Ji Lin, tonylins # licensed under the Apache License, Version 2.0, January 2004 from torch import nn from torch.nn import BatchNorm2d #from detectron2.layers.batch_norm import NaiveSyncBatchNorm as BatchNorm2d from detectron2.layers import Conv2d from detectron2.modeling.backbone.build import BACKBONE_REGISTRY from detectron2.modeling.backbone import Backbone def conv_bn(inp, oup, stride): return nn.Sequential( Conv2d(inp, oup, 3, stride, 1, bias=False), BatchNorm2d(oup), nn.ReLU6(inplace=True) ) def conv_1x1_bn(inp, oup): return nn.Sequential( Conv2d(inp, oup, 1, 1, 0, bias=False), BatchNorm2d(oup), nn.ReLU6(inplace=True) ) class InvertedResidual(nn.Module): def __init__(self, inp, oup, stride, expand_ratio): super(InvertedResidual, self).__init__() self.stride = stride assert stride in [1, 2] hidden_dim = int(round(inp * expand_ratio)) self.use_res_connect = self.stride == 1 and inp == oup if expand_ratio == 1: self.conv = nn.Sequential( # dw Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), BatchNorm2d(hidden_dim), nn.ReLU6(inplace=True), # pw-linear 
class MobileNetV2(Backbone):
    """
    MobileNetV2 feature extractor exposing four intermediate feature maps.

    Should freeze bn

    Args:
        cfg: config node; only ``cfg.MODEL.BACKBONE.FREEZE_AT`` is read here.
        n_class: unused.
        input_size: must be a multiple of 32 (only checked, not otherwise used).
        width_mult: multiplier applied to every channel count.
    """
    def __init__(self, cfg, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        # One row per stage: expansion ratio, output channels,
        # number of blocks, stride of the first block.
        stage_settings = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # building first layer
        assert input_size % 32 == 0
        input_channel = int(32 * width_mult)
        # Indices into ``self.features`` whose outputs are returned by forward().
        self.return_features_indices = [3, 6, 13, 17]
        self.return_features_num_channels = []
        self.features = nn.ModuleList([conv_bn(3, input_channel, 2)])

        # building inverted residual blocks
        for expand, channels, repeats, first_stride in stage_settings:
            output_channel = int(channels * width_mult)
            for block_idx in range(repeats):
                # Only the first block of a stage may downsample.
                stride = first_stride if block_idx == 0 else 1
                self.features.append(
                    InvertedResidual(input_channel, output_channel, stride, expand_ratio=expand)
                )
                input_channel = output_channel
                if len(self.features) - 1 in self.return_features_indices:
                    self.return_features_num_channels.append(output_channel)

        self._initialize_weights()
        self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_AT)

    def _freeze_backbone(self, freeze_at):
        """Disable gradients for the first ``freeze_at`` entries of ``self.features``."""
        for frozen_layer in (self.features[idx] for idx in range(freeze_at)):
            for param in frozen_layer.parameters():
                param.requires_grad = False

    def forward(self, x):
        """
        Run the network and collect the selected intermediate outputs.

        Returns:
            dict mapping ``"res2"``, ``"res3"``, ... (in collection order) to
            the outputs of the layers listed in ``return_features_indices``.
        """
        collected = []
        for index, layer in enumerate(self.features):
            x = layer(x)
            if index in self.return_features_indices:
                collected.append(x)
        return {'res{}'.format(level): feat for level, feat in enumerate(collected, start=2)}

    def _initialize_weights(self):
        """Initialize conv, batch-norm and linear layers in place."""
        for module in self.modules():
            if isinstance(module, Conv2d):
                # Normal init with std = sqrt(2 / (kh * kw * out_channels)).
                fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, (2. / fan_out) ** 0.5)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear):
                module.weight.data.normal_(0, 0.01)
                module.bias.data.zero_()
def conv3x3(in_channels, out_channels, module_name, postfix,
            stride=1, groups=1, kernel_size=3, padding=1):
    """
    3x3 convolution with padding, followed by norm and ReLU.

    Returns:
        A list of three ``(name, module)`` pairs (conv, norm, relu), ready to
        be wrapped in an ``OrderedDict``. Names are
        ``"<module_name>_<postfix>/conv"``, ``".../norm"`` and ``".../relu"``.
        The convolution has no bias; the norm layer is built by ``get_norm``
        from the module-level ``_NORM`` setting.
    """
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        groups=groups,
        bias=False,
    )
    norm = get_norm(_NORM, out_channels)
    relu = nn.ReLU(inplace=True)

    named_layers = []
    named_layers.append((f'{module_name}_{postfix}/conv', conv))
    named_layers.append((f'{module_name}_{postfix}/norm', norm))
    named_layers.append((f'{module_name}_{postfix}/relu', relu))
    return named_layers
class Hsigmoid(nn.Module):
    """
    Hard sigmoid: ``relu6(x + 3) / 6``.

    Args:
        inplace: forwarded to ``F.relu6``. It is applied to the temporary
            ``x + 3``, not to the caller's tensor.
    """

    def __init__(self, inplace=True):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        shifted = x + 3.
        clipped = F.relu6(shifted, inplace=self.inplace)
        return clipped / 6.
layer_per_block, module_name, SE)) for i in range(block_per_stage - 1): if i != block_per_stage -2: #last block SE = False module_name = f'OSA{stage_num}_{i + 2}' self.add_module(module_name, _OSA_module(concat_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, identity=True)) class VoVNet(Backbone): def __init__(self, cfg, input_ch, out_features=None): """ Args: input_ch(int) : the number of input channel out_features (list[str]): name of the layers whose outputs should be returned in forward. Can be anything in "stem", "stage2" ... """ super(VoVNet, self).__init__() global _NORM _NORM = cfg.MODEL.VOVNET.NORM stage_specs = _STAGE_SPECS[cfg.MODEL.VOVNET.CONV_BODY] config_stage_ch = stage_specs['stage_conv_ch'] config_concat_ch = stage_specs['stage_out_ch'] block_per_stage = stage_specs['block_per_stage'] layer_per_block = stage_specs['layer_per_block'] SE = stage_specs['eSE'] self._out_features = out_features # Stem module stem = conv3x3(input_ch, 64, 'stem', '1', 2) stem += conv3x3(64, 64, 'stem', '2', 1) stem += conv3x3(64, 128, 'stem', '3', 2) self.add_module('stem', nn.Sequential((OrderedDict(stem)))) current_stirde = 4 self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} self._out_feature_channels = {"stem": 128} stem_out_ch = [128] in_ch_list = stem_out_ch + config_concat_ch[:-1] # OSA stages self.stage_names = [] for i in range(4): # num_stages name = 'stage%d' % (i + 2) # stage 2 ... 
@BACKBONE_REGISTRY.register()
def build_vovnet_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build a VoVNet + FPN backbone with a max-pool top block.

    Args:
        cfg: a detectron2 CfgNode
        input_shape: shape of the network input, forwarded to
            ``build_vovnet_backbone``.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    # ``LastLevelMaxPool`` is not imported at module level (only ``FPN`` is),
    # so bring it into scope here to avoid a NameError at build time.
    from detectron2.modeling.backbone.fpn import LastLevelMaxPool

    bottom_up = build_vovnet_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=LastLevelMaxPool(),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
@PROPOSAL_GENERATOR_REGISTRY.register()
class FCOS(nn.Module):
    """
    Implement FCOS (https://arxiv.org/abs/1904.01355).

    The head additionally produces per-location controller outputs and a mask
    feature map, which are handed to :class:`FCOSOutputs`.
    """

    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        """
        Args:
            cfg: config node; every option is read from ``cfg.MODEL.FCOS``.
            input_shape: maps a feature name to its ShapeSpec; only the
                entries named in ``MODEL.FCOS.IN_FEATURES`` are used.
        """
        super().__init__()
        fcos_cfg = cfg.MODEL.FCOS

        self.in_features = fcos_cfg.IN_FEATURES
        # `fpn_strides` and `strides` hold the same config value under two names.
        self.fpn_strides = fcos_cfg.FPN_STRIDES
        self.strides = fcos_cfg.FPN_STRIDES
        self.focal_loss_alpha = fcos_cfg.LOSS_ALPHA
        self.focal_loss_gamma = fcos_cfg.LOSS_GAMMA
        self.center_sample = fcos_cfg.CENTER_SAMPLE
        self.radius = fcos_cfg.POS_RADIUS
        self.pre_nms_thresh_train = fcos_cfg.INFERENCE_TH_TRAIN
        self.pre_nms_thresh_test = fcos_cfg.INFERENCE_TH_TEST
        self.pre_nms_topk_train = fcos_cfg.PRE_NMS_TOPK_TRAIN
        self.pre_nms_topk_test = fcos_cfg.PRE_NMS_TOPK_TEST
        self.nms_thresh = fcos_cfg.NMS_TH
        self.post_nms_topk_train = fcos_cfg.POST_NMS_TOPK_TRAIN
        self.post_nms_topk_test = fcos_cfg.POST_NMS_TOPK_TEST
        self.thresh_with_ctr = fcos_cfg.THRESH_WITH_CTR

        self.iou_loss = IOULoss(fcos_cfg.LOC_LOSS_TYPE)

        # Turn the configured boundaries b0, b1, ... into consecutive
        # [lower, upper] ranges: [-1, b0], [b0, b1], ..., [b_last, INF].
        bounds = [-1] + list(fcos_cfg.SIZES_OF_INTEREST) + [INF]
        self.sizes_of_interest = [
            [lower, upper] for lower, upper in zip(bounds[:-1], bounds[1:])
        ]

        self.fcos_head = FCOSHead(
            cfg, [input_shape[name] for name in self.in_features]
        )

    def forward(self, images, features, gt_instances):
        """
        Arguments:
            images: passed through to FCOSOutputs, which reads its length and
                its ``image_sizes`` attribute.
            features (dict): feature maps keyed by name; the entries listed in
                ``self.in_features`` are used, in that order.
            gt_instances: ground truth handed to FCOSOutputs.

        Returns:
            tuple: ``(None, losses)`` in training mode and ``(proposals, {})``
            otherwise.
        """
        level_feats = [features[name] for name in self.in_features]
        locations = self.compute_locations(level_feats)
        (
            logits_pred,
            reg_pred,
            ctrness_pred,
            _bbox_towers,
            controllers,
            masks,
        ) = self.fcos_head(level_feats)

        if self.training:
            nms_settings = (
                self.pre_nms_thresh_train,
                self.pre_nms_topk_train,
                self.post_nms_topk_train,
            )
        else:
            nms_settings = (
                self.pre_nms_thresh_test,
                self.pre_nms_topk_test,
                self.post_nms_topk_test,
            )
        pre_nms_thresh, pre_nms_topk, post_nms_topk = nms_settings

        outputs = FCOSOutputs(
            images,
            locations,
            logits_pred,
            reg_pred,
            ctrness_pred,
            self.focal_loss_alpha,
            self.focal_loss_gamma,
            self.iou_loss,
            self.center_sample,
            self.sizes_of_interest,
            self.strides,
            self.radius,
            self.fcos_head.num_classes,
            pre_nms_thresh,
            pre_nms_topk,
            self.nms_thresh,
            post_nms_topk,
            self.thresh_with_ctr,
            controllers,
            masks,
            gt_instances,
        )

        if not self.training:
            return outputs.predict_proposals(), {}

        losses, _ = outputs.losses()
        return None, losses

    def compute_locations(self, features):
        """Return one (H*W, 2) tensor of (x, y) locations per feature level."""
        return [
            self.compute_locations_per_level(
                feature.size(-2),
                feature.size(-1),
                self.fpn_strides[level],
                feature.device,
            )
            for level, feature in enumerate(features)
        ]

    def compute_locations_per_level(self, h, w, stride, device):
        """
        Build the (x, y) location of every cell of an ``h`` x ``w`` grid.

        Cell (row, col) maps to ``(col * stride + stride // 2,
        row * stride + stride // 2)``; rows are enumerated first.
        """
        xs = torch.arange(
            0, w * stride, step=stride, dtype=torch.float32, device=device
        )
        ys = torch.arange(
            0, h * stride, step=stride, dtype=torch.float32, device=device
        )
        grid_y, grid_x = torch.meshgrid(ys, xs)
        corners = torch.stack((grid_x.reshape(-1), grid_y.reshape(-1)), dim=1)
        return corners + stride // 2
def forward(self, x):
    """
    Apply the shared, classification and box towers to every feature level
    and the mask branch to the first level only.

    Args:
        x (list): feature maps, one per level.

    Returns:
        tuple: ``(logits, bbox_reg, ctrness, bbox_towers, controllers, masks)``.
        The first three and ``controllers`` are per-level lists;
        ``bbox_towers`` is always an empty list; ``masks`` is the mask branch
        output computed from ``x[0]``.
    """
    cls_scores = []
    box_deltas = []
    centerness = []
    bbox_towers = []  # never filled; kept so the returned tuple keeps its arity
    controller_outs = []

    for level, feat in enumerate(x):
        shared = self.share_tower(feat)
        cls_feat = self.cls_tower(shared)
        box_feat = self.bbox_tower(shared)

        cls_scores.append(self.cls_logits(cls_feat))
        centerness.append(self.ctrness(box_feat))
        controller_outs.append(self.controller(box_feat))

        delta = self.bbox_pred(box_feat)
        if self.scales is not None:
            delta = self.scales[level](delta)
        # Note that we use relu, as in the improved FCOS, instead of exp.
        box_deltas.append(F.relu(delta))

    mask_out = self.mask(self.mask_tower(x[0]))
    return cls_scores, box_deltas, centerness, bbox_towers, controller_outs, mask_out
def aligned_bilinear(tensor, factor):
    """
    Upsample a 4-D tensor by an integer ``factor`` with bilinear interpolation.

    The input is replicate-padded by one cell on the right/bottom, resized
    with ``align_corners=True``, shifted by ``factor // 2`` through a
    replicate pad on the left/top, and cropped to ``factor`` times the input
    height and width. ``factor == 1`` returns the input unchanged.
    """
    assert tensor.dim() == 4
    assert factor >= 1
    assert int(factor) == factor

    if factor == 1:
        return tensor

    in_h, in_w = tensor.size()[2:]
    out_h = factor * in_h + 1
    out_w = factor * in_w + 1
    shift = factor // 2

    # One extra row/column so the last input cell has a right/bottom neighbour.
    padded = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
    resized = F.interpolate(
        padded, size=(out_h, out_w), mode='bilinear', align_corners=True
    )
    shifted = F.pad(resized, pad=(shift, 0, shift, 0), mode="replicate")

    return shifted[:, :, :out_h - 1, :out_w - 1]
self.thresh_with_ctr = thresh_with_ctr self.controllers = controllers self.masks = masks def _transpose(self, training_targets, num_loc_list): ''' This function is used to transpose image first training targets to level first ones :return: level first training targets ''' for im_i in range(len(training_targets)): training_targets[im_i] = torch.split( training_targets[im_i], num_loc_list, dim=0 ) targets_level_first = [] for targets_per_level in zip(*training_targets): targets_level_first.append( torch.cat(targets_per_level, dim=0) ) return targets_level_first def _get_ground_truth(self): num_loc_list = [len(loc) for loc in self.locations] self.num_loc_list = num_loc_list # compute locations to size ranges loc_to_size_range = [] for l, loc_per_level in enumerate(self.locations): loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l]) loc_to_size_range.append( loc_to_size_range_per_level[None].expand(num_loc_list[l], -1) ) loc_to_size_range = torch.cat(loc_to_size_range, dim=0) locations = torch.cat(self.locations, dim=0) training_targets = self.compute_targets_for_locations( locations, self.gt_instances, loc_to_size_range ) # transpose im first training_targets to level first ones training_targets = { k: self._transpose(v, num_loc_list) for k, v in training_targets.items() } # we normalize reg_targets by FPN's strides here reg_targets = training_targets["reg_targets"] for l in range(len(reg_targets)): reg_targets[l] = reg_targets[l] / float(self.strides[l]) return training_targets def get_sample_region(self, gt, strides, num_loc_list, loc_xs, loc_ys, radius=1): num_gts = gt.shape[0] K = len(loc_xs) gt = gt[None].expand(K, num_gts, 4) center_x = (gt[..., 0] + gt[..., 2]) / 2 center_y = (gt[..., 1] + gt[..., 3]) / 2 center_gt = gt.new_zeros(gt.shape) # no gt if center_x.numel() == 0 or center_x[..., 0].sum() == 0: return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8) beg = 0 for level, num_loc in enumerate(num_loc_list): end = beg + 
def compute_targets_for_locations(self, locations, targets, size_ranges):
    """
    Assign a class label, regression target and matched ground-truth index
    to every location, image by image.

    Args:
        locations (Tensor): (num_locations, 2) tensor of (x, y) locations.
        targets (list): per-image ground truth exposing ``gt_boxes`` (with
            ``.tensor`` and ``.area()``) and ``gt_classes``.
        size_ranges (Tensor): (num_locations, 2) tensor with the lower and
            upper regression bound of each location.

    Returns:
        dict: ``"labels"``, ``"reg_targets"``, ``"matched_idxes"`` and
        ``"im_idxes"``, each a list with exactly one entry per image.
        Locations without an assigned box get label ``self.num_classes``.
    """
    labels = []
    reg_targets = []
    matched_idxes = []
    im_idxes = []

    xs, ys = locations[:, 0], locations[:, 1]
    num_locations = locations.size(0)

    for im_i, targets_per_im in enumerate(targets):
        bboxes = targets_per_im.gt_boxes.tensor
        labels_per_im = targets_per_im.gt_classes

        # no gt: every location is background
        if bboxes.numel() == 0:
            labels.append(labels_per_im.new_zeros(num_locations) + self.num_classes)
            reg_targets.append(locations.new_zeros((num_locations, 4)))
            # Keep these two lists aligned with `labels`; skipping them here
            # shifted the entries of every later image once the per-image
            # lists were regrouped per level.
            matched_idxes.append(
                torch.zeros(num_locations, dtype=torch.long, device=locations.device)
            )
            im_idxes.append(
                torch.full(
                    (num_locations,), im_i, dtype=torch.long, device=locations.device
                )
            )
            continue

        area = targets_per_im.gt_boxes.area()

        # distances from every location to the four sides of every box
        left = xs[:, None] - bboxes[:, 0][None]
        top = ys[:, None] - bboxes[:, 1][None]
        right = bboxes[:, 2][None] - xs[:, None]
        bottom = bboxes[:, 3][None] - ys[:, None]
        reg_targets_per_im = torch.stack([left, top, right, bottom], dim=2)

        if self.center_sample:
            is_in_boxes = self.get_sample_region(
                bboxes, self.strides, self.num_loc_list,
                xs, ys, radius=self.radius
            )
        else:
            is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0

        max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0]
        # limit the regression range for each location
        is_cared_in_the_level = \
            (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
            (max_reg_targets_per_im <= size_ranges[:, [1]])

        locations_to_gt_area = area[None].repeat(len(locations), 1)
        locations_to_gt_area[is_in_boxes == 0] = INF
        locations_to_gt_area[is_cared_in_the_level == 0] = INF

        # if there are still more than one objects for a location,
        # we choose the one with minimal area
        locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)

        reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds]
        labels_per_im = labels_per_im[locations_to_gt_inds]
        # an area still equal to INF means no box claimed this location
        labels_per_im[locations_to_min_area == INF] = self.num_classes

        labels.append(labels_per_im)
        reg_targets.append(reg_targets_per_im)
        matched_idxes.append(locations_to_gt_inds)
        im_idxes.append(
            torch.full(
                (len(labels_per_im),), im_i,
                dtype=torch.long, device=locations_to_gt_inds.device
            )
        )

    return {
        "labels": labels,
        "reg_targets": reg_targets,
        "matched_idxes": matched_idxes,
        "im_idxes": im_idxes,
    }
logits_pred = cat( [ # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C) x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in self.logits_pred ], dim=0,) reg_pred = cat( [ # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B) x.permute(0, 2, 3, 1).reshape(-1, 4) for x in self.reg_pred ], dim=0,) ctrness_pred = cat( [ # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,) x.reshape(-1) for x in self.ctrness_pred ], dim=0,) labels = cat( [ # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,) x.reshape(-1) for x in labels ], dim=0,) reg_targets = cat( [ # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4) x.reshape(-1, 4) for x in reg_targets ], dim=0,) matched_idxes = cat( [ x.reshape(-1) for x in matched_idxes ], dim=0,) im_idxes = cat( [ x.reshape(-1) for x in im_idxes ], dim=0,) controllers_pred = cat( [ x.permute(0, 2, 3, 1).reshape(-1, 169) for x in self.controllers ], dim=0,) return self.fcos_losses( labels, reg_targets, logits_pred, reg_pred, ctrness_pred, controllers_pred, self.focal_loss_alpha, self.focal_loss_gamma, self.iou_loss, matched_idxes, im_idxes ) def predict_proposals(self): sampled_boxes = [] bundle = ( self.locations, self.logits_pred, self.reg_pred, self.ctrness_pred, self.strides ) for i, (l, o, r, c, s) in enumerate(zip(*bundle)): # recall that during training, we normalize regression targets with FPN's stride. # we denormalize them here. 
def forward_for_mask(self, boxlists):
    """
    Predict instance masks for the detections of every image (CondInst).

    Each detection's 169-value controller output is split into the weights
    and biases of three 1x1 convolutions (10->8, 8->8, 8->1), which are
    applied to the mask features concatenated with two coordinate maps
    scaled to [-1, 1].

    Args:
        boxlists (list): per-image detections exposing ``image_size`` and
            ``controllers`` (one row of 169 values per detection).

    Returns:
        list: the same objects, each with ``pred_masks`` set to a
        ``(num_detections, 1, H, W)`` tensor of sigmoid probabilities cropped
        to the image size.
    """
    N, _, h, w = self.masks.shape
    # Build the coordinate maps on the mask features' device rather than
    # forcing CUDA, so CPU and non-default GPU inference work.
    device = self.masks.device
    grid_x = torch.arange(w, device=device).view(1, -1).float().repeat(h, 1) / (w - 1) * 2 - 1
    grid_y = torch.arange(h, device=device).view(-1, 1).float().repeat(1, w) / (h - 1) * 2 - 1
    x_map = grid_x.view(1, 1, h, w).repeat(N, 1, 1, 1)
    y_map = grid_y.view(1, 1, h, w).repeat(N, 1, 1, 1)
    masks_feat = torch.cat((self.masks, x_map, y_map), dim=1)

    stride = self.strides[0]
    for im in range(N):
        boxlist = boxlists[im]
        input_h, input_w = boxlist.image_size
        ins_num = boxlist.controllers.shape[0]

        if ins_num == 0:
            # A grouped convolution with zero groups is invalid; emit an
            # empty mask tensor with the shape the regular path would give.
            boxlist.pred_masks = self.masks.new_zeros(
                (0, 1, min(int(h * stride), input_h), min(int(w * stride), input_w))
            )
            continue

        mask = masks_feat[None, im]
        # Controller layout: [0:80] w1, [80:88] b1, [88:152] w2,
        # [152:160] b2, [160:168] w3, [168:169] b3.
        weights1 = boxlist.controllers[:, :80].reshape(-1, 8, 10).reshape(-1, 10).unsqueeze(-1).unsqueeze(-1)
        bias1 = boxlist.controllers[:, 80:88].flatten()
        weights2 = boxlist.controllers[:, 88:152].reshape(-1, 8, 8).reshape(-1, 8).unsqueeze(-1).unsqueeze(-1)
        bias2 = boxlist.controllers[:, 152:160].flatten()
        weights3 = boxlist.controllers[:, 160:168].unsqueeze(-1).unsqueeze(-1)
        bias3 = boxlist.controllers[:, 168:169].flatten()

        # All instances are stacked along the channel axis; `groups=ins_num`
        # keeps each instance's filters separate.
        conv1 = F.conv2d(mask, weights1, bias1).relu()
        conv2 = F.conv2d(conv1, weights2, bias2, groups=ins_num).relu()
        masks_per_image = F.conv2d(conv2, weights3, bias3, groups=ins_num)

        masks = aligned_bilinear(masks_per_image, stride).sigmoid()
        masks = masks[:, :, :input_h, :input_w].permute(1, 0, 2, 3)
        boxlist.pred_masks = masks

    return boxlists
locations box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1) box_cls = box_cls.reshape(N, -1, C).sigmoid() box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1) box_regression = box_regression.reshape(N, -1, 4) ctrness = ctrness.view(N, 1, H, W).permute(0, 2, 3, 1) ctrness = ctrness.reshape(N, -1).sigmoid() controller = controller.view(N, 169, H, W).permute(0, 2, 3, 1) controller = controller.reshape(N, -1, 169) # if self.thresh_with_ctr is True, we multiply the classification # scores with centerness scores before applying the threshold. if self.thresh_with_ctr: box_cls = box_cls * ctrness[:, :, None] candidate_inds = box_cls > self.pre_nms_thresh pre_nms_top_n = candidate_inds.view(N, -1).sum(1) pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) if not self.thresh_with_ctr: box_cls = box_cls * ctrness[:, :, None] results = [] for i in range(N): per_box_cls = box_cls[i] per_candidate_inds = candidate_inds[i] per_box_cls = per_box_cls[per_candidate_inds] per_candidate_nonzeros = per_candidate_inds.nonzero() per_box_loc = per_candidate_nonzeros[:, 0] per_class = per_candidate_nonzeros[:, 1] per_box_regression = box_regression[i] per_box_regression = per_box_regression[per_box_loc] per_locations = locations[per_box_loc] per_controller = controller[i] per_controller = per_controller[per_box_loc] per_pre_nms_top_n = pre_nms_top_n[i] if per_candidate_inds.sum().item() > per_pre_nms_top_n.item(): per_box_cls, top_k_indices = \ per_box_cls.topk(per_pre_nms_top_n, sorted=False) per_class = per_class[top_k_indices] per_box_regression = per_box_regression[top_k_indices] per_locations = per_locations[top_k_indices] per_controller = per_controller[top_k_indices] detections = torch.stack([ per_locations[:, 0] - per_box_regression[:, 0], per_locations[:, 1] - per_box_regression[:, 1], per_locations[:, 0] + per_box_regression[:, 2], per_locations[:, 1] + per_box_regression[:, 3], ], dim=1) boxlist = Instances(image_sizes[i]) boxlist.pred_boxes = 
def prepare_masks(self, m_h, m_w, r_h, r_w, targets_masks):
    """
    Zero-pad each image's ground-truth masks to a common ``(r_h, r_w)`` size.

    Args:
        m_h, m_w: accepted but not used.
        r_h, r_w: target height and width of the padded masks.
        targets_masks (list[Tensor]): one ``(n, h, w)`` mask tensor per image.

    Returns:
        list[Tensor]: per image, an ``(n, r_h, r_w)`` tensor holding the
        original masks in its top-left corner, or an empty 1-D tensor when
        the image has no masks.
    """
    def _pad(mask_t):
        if len(mask_t) == 0:
            return mask_t.new_tensor([])
        count, height, width = mask_t.shape
        canvas = mask_t.new_zeros((count, r_h, r_w))
        canvas[:, :height, :width] = mask_t
        return canvas

    return [_pad(mask_t) for mask_t in targets_masks]
def fcos_losses(
    self,
    labels,
    reg_targets,
    logits_pred,
    reg_pred,
    ctrness_pred,
    controllers_pred,
    focal_loss_alpha,
    focal_loss_gamma,
    iou_loss,
    matched_idxes,
    im_idxes
):
    """
    Compute the classification, box, centerness and mask losses.

    All per-location inputs are flattened over images and levels and share
    their first dimension. ``matched_idxes`` gives, per location, the index
    of its matched ground-truth instance within its image and ``im_idxes``
    the image index. Locations whose label equals the number of classes are
    treated as background.

    Returns:
        tuple: ``(losses, {})`` where ``losses`` has the keys
        ``loss_fcos_cls``, ``loss_fcos_loc``, ``loss_fcos_ctr`` and
        ``loss_mask``.
    """
    num_classes = logits_pred.size(1)
    labels = labels.flatten()

    pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
    num_pos_local = pos_inds.numel()
    num_gpus = get_world_size()
    total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
    # average positives per process, floored at 1 to avoid dividing by zero
    num_pos_avg = max(total_num_pos / num_gpus, 1.0)

    # prepare one_hot
    class_target = torch.zeros_like(logits_pred)
    class_target[pos_inds, labels[pos_inds]] = 1

    class_loss = sigmoid_focal_loss_jit(
        logits_pred,
        class_target,
        alpha=focal_loss_alpha,
        gamma=focal_loss_gamma,
        reduction="sum",
    ) / num_pos_avg

    # everything below only concerns positive locations
    reg_pred = reg_pred[pos_inds]
    reg_targets = reg_targets[pos_inds]
    ctrness_pred = ctrness_pred[pos_inds]
    controllers_pred = controllers_pred[pos_inds]
    matched_idxes = matched_idxes[pos_inds]
    im_idxes = im_idxes[pos_inds]

    ctrness_targets = compute_ctrness_targets(reg_targets)
    ctrness_targets_sum = ctrness_targets.sum()
    ctrness_norm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)

    reg_loss = iou_loss(
        reg_pred,
        reg_targets,
        ctrness_targets
    ) / ctrness_norm

    ctrness_loss = F.binary_cross_entropy_with_logits(
        ctrness_pred,
        ctrness_targets,
        reduction="sum"
    ) / num_pos_avg

    # for CondInst
    N, C, h, w = self.masks.shape
    # Create the coordinate maps on the mask features' device instead of
    # forcing CUDA, so the loss also works on CPU or a non-default GPU.
    device = self.masks.device
    grid_x = torch.arange(w, device=device).view(1, -1).float().repeat(h, 1) / (w - 1) * 2 - 1
    grid_y = torch.arange(h, device=device).view(-1, 1).float().repeat(1, w) / (h - 1) * 2 - 1
    x_map = grid_x.view(1, 1, h, w).repeat(N, 1, 1, 1)
    y_map = grid_y.view(1, 1, h, w).repeat(N, 1, 1, 1)
    masks_feat = torch.cat((self.masks, x_map, y_map), dim=1)

    # size of the upsampled masks the ground truth is padded to
    r_h = int(h * self.strides[0])
    r_w = int(w * self.strides[0])
    targets_masks = [target_im.gt_masks.tensor for target_im in self.gt_instances]
    masks_t = self.prepare_masks(h, w, r_h, r_w, targets_masks)

    mask_loss = masks_feat[0].new_tensor(0.0)
    batch_ins = im_idxes.shape[0]

    # for each image
    for i in range(N):
        inds = (im_idxes == i).nonzero().flatten()
        ins_num = inds.shape[0]
        if ins_num > 0:
            controllers = controllers_pred[inds]
            mask_feat = masks_feat[None, i]
            # Controller layout: [0:80] w1, [80:88] b1, [88:152] w2,
            # [152:160] b2, [160:168] w3, [168:169] b3.
            weights1 = controllers[:, :80].reshape(-1, 8, 10).reshape(-1, 10).unsqueeze(-1).unsqueeze(-1)
            bias1 = controllers[:, 80:88].flatten()
            weights2 = controllers[:, 88:152].reshape(-1, 8, 8).reshape(-1, 8).unsqueeze(-1).unsqueeze(-1)
            bias2 = controllers[:, 152:160].flatten()
            weights3 = controllers[:, 160:168].unsqueeze(-1).unsqueeze(-1)
            bias3 = controllers[:, 168:169].flatten()

            conv1 = F.conv2d(mask_feat, weights1, bias1).relu()
            conv2 = F.conv2d(conv1, weights2, bias2, groups=ins_num).relu()
            masks_per_image = F.conv2d(conv2, weights3, bias3, groups=ins_num)
            masks_per_image = aligned_bilinear(masks_per_image, self.strides[0])[0].sigmoid()

            for j in range(ins_num):
                ind = inds[j]
                mask_gt = masks_t[i][matched_idxes[ind]].float()
                mask_pred = masks_per_image[j]
                mask_loss += self.dice_loss(mask_pred, mask_gt)

    if batch_ins > 0:
        mask_loss = mask_loss / batch_ins

    losses = {
        "loss_fcos_cls": class_loss,
        "loss_fcos_loc": reg_loss,
        "loss_fcos_ctr": ctrness_loss,
        "loss_mask": mask_loss
    }
    return losses, {}
""" def forward(self, batched_inputs): if self.training: return super().forward(batched_inputs) processed_results = super().forward(batched_inputs) processed_results = [{"instances": r["proposals"]} for r in processed_results] return processed_results ================================================ FILE: fcos/modeling/poolers.py ================================================ import sys import torch from detectron2.layers import cat from detectron2.modeling.poolers import ( ROIPooler, convert_boxes_to_pooler_format, assign_boxes_to_levels ) __all__ = ["TopPooler"] def _box_max_size(boxes): box = boxes.tensor max_size = torch.max(box[:, 2] - box[:, 0], box[:, 3] - box[:, 1]) return max_size def assign_boxes_to_levels_by_length( box_lists, min_level, max_level, canonical_box_size, canonical_level): """ Map each box in `box_lists` to a feature map level index and return the assignment vector. Args: box_lists (list[detectron2.structures.Boxes]): A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch. min_level (int): Smallest feature map level index. The input is considered index 0, the output of stage 1 is index 1, and so. max_level (int): Largest feature map level index. canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). canonical_level (int): The feature map level index on which a canonically-sized box should be placed. Returns: A tensor of length M, where M is the total number of boxes aggregated over all N batch images. The memory layout corresponds to the concatenation of boxes from all images. Each element is the feature map index, as an offset from `self.min_level`, for the corresponding box (so value i means the box is at `self.min_level + i`). 
class TopPooler(ROIPooler):
    """
    ROIPooler with option to assign level by max length. Used by top modules.
    """

    def __init__(self,
                 output_size,
                 scales,
                 sampling_ratio,
                 pooler_type,
                 canonical_box_size=224,
                 canonical_level=4,
                 assign_crit="area",):
        """
        Args:
            assign_crit (str): ``"length"`` assigns boxes to levels with
                ``assign_boxes_to_levels_by_length``; any other value uses
                ``assign_boxes_to_levels``.

        The remaining arguments are forwarded unchanged to
        :class:`ROIPooler`.
        """
        super().__init__(
            output_size,
            scales,
            sampling_ratio,
            pooler_type,
            canonical_box_size=canonical_box_size,
            canonical_level=canonical_level,
        )
        self.assign_crit = assign_crit

    def forward(self, x, box_lists):
        """
        Args:
            x (list[Tensor]): A list of feature maps of NCHW shape, with
                scales matching those used to construct this module.
            box_lists (list[Boxes] | list[RotatedBoxes]): one entry per image
                in the batch.

        Returns:
            Tensor: A tensor of shape (M, C, output_size, output_size) where
            M is the total number of boxes over all images and C is the
            number of channels in `x`.
        """
        num_levels = len(self.level_poolers)

        both_lists = isinstance(x, list) and isinstance(box_lists, list)
        assert both_lists, "Arguments to pooler must be lists"
        assert (
            len(x) == num_levels
        ), "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
            num_levels, len(x)
        )
        assert len(box_lists) == x[0].size(
            0
        ), "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
            x[0].size(0), len(box_lists)
        )

        rois = convert_boxes_to_pooler_format(box_lists)

        # A single level needs no assignment.
        if num_levels == 1:
            return self.level_poolers[0](x[0], rois)

        assign_method = (
            assign_boxes_to_levels_by_length
            if self.assign_crit == "length"
            else assign_boxes_to_levels
        )
        level_assignments = assign_method(
            box_lists, self.min_level, self.max_level,
            self.canonical_box_size, self.canonical_level)

        reference = x[0]
        side = self.output_size[0]
        output = torch.zeros(
            (len(rois), reference.shape[1], side, side),
            dtype=reference.dtype,
            device=reference.device,
        )

        # Pool each box from the level it was assigned to and write the
        # result back at the box's original position.
        for level, (level_feat, pooler) in enumerate(zip(x, self.level_poolers)):
            inds = torch.nonzero(level_assignments == level).squeeze(1)
            output[inds] = pooler(level_feat, rois[inds])

        return output
def get_layer_info(layer):
    """
    Return the type name at the start of ``str(layer)``.

    The name is the text before the first "(", stripped of surrounding
    whitespace. When the string contains no "(", the whole stripped string
    is returned; the earlier ``find``-based slice dropped its last
    character in that case.
    """
    layer_str = str(layer)
    type_name = layer_str.partition('(')[0].strip()
    return type_name
conv.stride[1] + 1) delta_ops = conv.in_channels * conv.out_channels * conv.kernel_size[0] * conv.kernel_size[1] * out_h * out_w / layer.condense_factor * multi_add delta_params = get_layer_param(conv) / layer.condense_factor ### ops_nonlinearity elif type_name in ['ReLU', 'ReLU6']: delta_ops = x.numel() delta_params = get_layer_param(layer) ### ops_pooling elif type_name in ['AvgPool2d', 'MaxPool2d']: in_w = x.size()[2] kernel_ops = layer.kernel_size * layer.kernel_size out_w = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1) out_h = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1) delta_ops = x.size()[0] * x.size()[1] * out_w * out_h * kernel_ops delta_params = get_layer_param(layer) elif type_name in ['LastLevelMaxPool']: pass elif type_name in ['AdaptiveAvgPool2d']: delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3] delta_params = get_layer_param(layer) elif type_name in ['ZeroPad2d', 'RetinaNetPostProcessor']: pass #delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3] #delta_params = get_layer_param(layer) ### ops_linear elif type_name in ['Linear']: weight_ops = layer.weight.numel() * multi_add bias_ops = layer.bias.numel() delta_ops = x.size()[0] * (weight_ops + bias_ops) delta_params = get_layer_param(layer) ### ops_nothing elif type_name in ['BatchNorm2d', 'Dropout2d', 'DropChannel', 'Dropout', 'FrozenBatchNorm2d', 'GroupNorm']: delta_params = get_layer_param(layer) elif type_name in ['SumTwo']: delta_ops = x.numel() elif type_name in ['AggregateCell']: if not layer.pre_transform: delta_ops = 2 * x.numel() # twice for each input else: measure_layer(layer.branch_1, x) measure_layer(layer.branch_2, x) delta_params = get_layer_param(layer) elif type_name in ['Identity', 'Zero']: pass elif type_name in ['Scale']: delta_params = get_layer_param(layer) delta_ops = x.numel() elif type_name in ['FCOSPostProcessor', 'RPNPostProcessor', 'KeypointPostProcessor', 'ROIAlign', 'PostProcessor', 
'KeypointRCNNPredictor', 'NaiveSyncBatchNorm', 'Upsample', 'Sequential']: pass elif type_name in ['DeformConv']: # don't count bilinear offset_conv = list(layer.parameters())[0] delta_ops = reduce(operator.mul, offset_conv.size(), x.size()[2] * x.size()[3]) out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0] - layer.kernel_size[0]) / layer.stride[0] + 1) out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1] - layer.kernel_size[1]) / layer.stride[1] + 1) delta_ops += layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add delta_params = get_layer_param(layer) ### unknown layer type else: raise TypeError('unknown layer type: %s' % type_name) count_ops += delta_ops count_params += delta_params return def measure_model(model, x): global count_ops, count_params count_ops = 0 count_params = 0 def should_measure(x): return is_leaf(x) or is_pruned(x) def modify_forward(model): for child in model.children(): if should_measure(child): def new_forward(m): def lambda_forward(*args): measure_layer(m, *args) return m.old_forward(*args) return lambda_forward child.old_forward = child.forward child.forward = new_forward(child) else: modify_forward(child) def restore_forward(model): for child in model.children(): # leaf node if is_leaf(child) and hasattr(child, 'old_forward'): child.forward = child.old_forward child.old_forward = None else: restore_forward(child) modify_forward(model) out = model.forward(x) restore_forward(model) return out, count_ops, count_params ================================================ FILE: postprocessing.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
def detector_postprocess(results, output_height, output_width, mask_threshold=0.5):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function rescales the raw detector outputs to the desired output
    resolution. Unlike the ROI-based variant, masks are resized as whole-image
    maps rather than pasted into their boxes.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.
        mask_threshold (float): value a resized mask pixel must exceed to be
            kept as foreground.

    Returns:
        Instances: the resized output from the model, based on the output resolution

    Raises:
        ValueError: if `results` has neither "pred_boxes" nor "proposal_boxes".
    """
    scale_x = output_width / results.image_size[1]
    scale_y = output_height / results.image_size[0]
    results = Instances((output_height, output_width), **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        # Previously this fell through and failed on an unbound local below.
        raise ValueError("results must contain 'pred_boxes' or 'proposal_boxes'")

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    results = results[output_boxes.nonempty()]

    # An empty mask tensor is left untouched.
    if results.has("pred_masks") and results.pred_masks.shape[0]:
        # NOTE(review): assumes pred_masks is (N, 1, H, W) covering the whole
        # input image, so a plain resize to the output size is enough - confirm.
        resized_masks = F.interpolate(
            input=results.pred_masks,
            size=results.image_size,
            mode="bilinear",
            align_corners=False,
        )
        # Honour the caller's threshold instead of a hard-coded 0.5.
        results.pred_masks = resized_masks.gt(mask_threshold).squeeze(1)

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results


def sem_seg_postprocess(result, img_size, output_height, output_width):
    """
    Return semantic segmentation predictions in the original resolution.

    The input images are often resized when entering semantic segmentor. Moreover, in some
    cases, they are also padded inside the segmentor to be divisible by the maximum network
    stride. As a result, we often need the predictions of the segmentor in a different
    resolution from its inputs.

    Args:
        result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        img_size (tuple): image size that segmentor is taking as input.
        output_height, output_width: the desired output resolution.

    Returns:
        semantic segmentation prediction (Tensor): A tensor of the shape
            (C, output_height, output_width) that contains per-pixel soft predictions.
    """
    # Crop to the real image extent, then add a batch dim for F.interpolate.
    result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
    result = F.interpolate(
        result, size=(output_height, output_width), mode="bilinear", align_corners=False
    )[0]
    return result
def main(args):
    """
    Build the configured model and print its operation and parameter counts
    for a single zero-filled 3x512x512 image.

    Args:
        args: parsed command line arguments, forwarded to setup().
    """
    cfg = setup(args)

    model = Trainer.build_model(cfg)
    model.eval().cuda()

    input_size = (3, 512, 512)
    image = torch.zeros(*input_size)
    batched_input = {"image": image}

    # NOTE(review): this script imports from `adet.*` while the package in this
    # repository is `fcos` - confirm which one is intended.
    # The in-repo measure_model returns (output, ops, params); taking the last
    # two values also keeps working with a variant that returns only the counts.
    with torch.no_grad():
        *_, ops, params = measure_model(model, [batched_input])
    print('ops: {:.2f}G\tparams: {:.2f}M'.format(ops / 2**30, params / 2**20))
def get_parser():
    """Build the command line parser for the checkpoint-stripping script."""
    parser = argparse.ArgumentParser(description="Keep only model in ckpt")
    parser.add_argument(
        "--path",
        default="output/person/blendmask/R_50_1x/",
        help="path to model weights",
    )
    parser.add_argument(
        "--name",
        default="R_50_1x.pth",
        help="name of output file",
    )
    return parser


def strip_optimizer(path, name):
    """
    Save only the "model" entry of ``<path>/model_final.pth`` as ``<path>/<name>``.

    Paths are joined with os.path.join, so ``path`` works with or without a
    trailing separator.

    Args:
        path (str): directory that contains ``model_final.pth``.
        name (str): file name of the stripped checkpoint to write.

    Returns:
        str: the path of the file that was written.
    """
    import os

    ckpt = torch.load(os.path.join(path, 'model_final.pth'))
    model = ckpt["model"]
    output_file = os.path.join(path, name)
    torch.save(model, output_file)
    return output_file


if __name__ == "__main__":
    args = get_parser().parse_args()
    strip_optimizer(args.path, args.name)
class Trainer(DefaultTrainer):
    """
    A DefaultTrainer variant that checkpoints through AdetCheckpointer and
    builds its training data loader from a plain DatasetMapper.
    """

    def __init__(self, cfg):
        """
        Args:
            cfg (CfgNode): the full experiment configuration.
        """
        # The model, optimizer and data loader are built in this fixed order.
        model = self.build_model(cfg)
        optimizer = self.build_optimizer(cfg, model)
        data_loader = self.build_train_loader(cfg)

        # Multi-process training wraps the model in DDP; a single process
        # keeps the bare model.
        if comm.get_world_size() > 1:
            model = DistributedDataParallel(
                model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
            )
        # Starts the MRO lookup after DefaultTrainer, i.e. DefaultTrainer's own
        # __init__ is bypassed and its work is redone below.
        super(DefaultTrainer, self).__init__(model, data_loader, optimizer)

        self.scheduler = self.build_lr_scheduler(cfg, optimizer)
        # Checkpoints (model + optimizer + scheduler) go to the output directory.
        checkpointables = dict(optimizer=optimizer, scheduler=self.scheduler)
        self.checkpointer = AdetCheckpointer(model, cfg.OUTPUT_DIR, **checkpointables)

        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER
        self.cfg = cfg

        self.register_hooks(self.build_hooks())

    def train_loop(self, start_iter: int, max_iter: int):
        """
        Run iterations ``start_iter`` .. ``max_iter - 1`` inside an EventStorage,
        calling the before/after hooks around the loop and around each step.

        Args:
            start_iter, max_iter (int): first iteration and exclusive upper bound.
        """
        log = logging.getLogger(__name__)
        log.info("Starting training from iteration {}".format(start_iter))

        self.iter = self.start_iter = start_iter
        self.max_iter = max_iter

        with EventStorage(start_iter) as self.storage:
            self.before_train()
            for self.iter in range(start_iter, max_iter):
                self.before_step()
                self.run_step()
                self.after_step()
            self.after_train()

    def train(self):
        """
        Run training from ``self.start_iter`` to ``self.max_iter``.

        Returns:
            The last evaluation results when they exist and this is the main
            process (after verifying them); otherwise None.
        """
        self.train_loop(self.start_iter, self.max_iter)
        if not (hasattr(self, "_last_eval_results") and comm.is_main_process()):
            return None
        verify_results(self.cfg, self._last_eval_results)
        return self._last_eval_results

    @classmethod
    def build_train_loader(cls, cfg):
        """
        Returns:
            iterable: the result of
            :func:`detectron2.data.build_detection_train_loader` called with a
            ``DatasetMapper(cfg, True)``.
        """
        return build_detection_train_loader(cfg, DatasetMapper(cfg, True))

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create the evaluator(s) for ``dataset_name`` from the "evaluator_type"
        stored in its metadata.

        Args:
            cfg (CfgNode): the experiment configuration.
            dataset_name (str): name of a registered dataset.
            output_folder (str or None): where evaluators write their output;
                defaults to ``<cfg.OUTPUT_DIR>/inference``.

        Returns:
            A single evaluator, or a DatasetEvaluators wrapping several.

        Raises:
            NotImplementedError: if no evaluator matches the dataset's type.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type

        # Dataset kinds served by exactly one dedicated evaluator.
        if evaluator_type == "cityscapes":
            assert (
                torch.cuda.device_count() >= comm.get_rank()
            ), "CityscapesEvaluator currently do not work with multiple machines."
            return CityscapesEvaluator(dataset_name)
        if evaluator_type == "pascal_voc":
            return PascalVOCDetectionEvaluator(dataset_name)
        if evaluator_type == "lvis":
            return LVISEvaluator(dataset_name, cfg, True, output_folder)

        # Kinds that may combine several evaluators.
        evaluators = []
        if evaluator_type in ("sem_seg", "coco_panoptic_seg"):
            sem_seg_evaluator = SemSegEvaluator(
                dataset_name,
                distributed=True,
                num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
                ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
                output_dir=output_folder,
            )
            evaluators.append(sem_seg_evaluator)
        if evaluator_type in ("coco", "coco_panoptic_seg"):
            evaluators.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
        if evaluator_type == "coco_panoptic_seg":
            evaluators.append(COCOPanopticEvaluator(dataset_name, output_folder))

        if not evaluators:
            raise NotImplementedError(
                "no Evaluator for the dataset {} with the type {}".format(
                    dataset_name, evaluator_type
                )
            )
        return evaluators[0] if len(evaluators) == 1 else DatasetEvaluators(evaluators)

    @classmethod
    def test_with_TTA(cls, cfg, model):
        """
        Evaluate ``model`` wrapped in GeneralizedRCNNWithTTA on every dataset in
        ``cfg.DATASETS.TEST``.

        Returns:
            OrderedDict: the test results with "_TTA" appended to every key.
        """
        logger = logging.getLogger("detectron2.trainer")
        logger.info("Running inference with test-time augmentation ...")

        tta_model = GeneralizedRCNNWithTTA(cfg, model)
        tta_folder = os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
        evaluators = [
            cls.build_evaluator(cfg, name, output_folder=tta_folder)
            for name in cfg.DATASETS.TEST
        ]
        results = cls.test(cfg, tta_model, evaluators)
        return OrderedDict({k + "_TTA": v for k, v in results.items()})


def setup(args):
    """
    Build the frozen config from ``args.config_file`` plus the ``args.opts``
    overrides and run detectron2's default setup with it.
    """
    cfg = get_cfg()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    return cfg


def main(args):
    """
    Train the configured model, or only evaluate it when ``args.eval_only`` is set.

    Returns:
        The evaluation results in eval-only mode, otherwise whatever
        ``Trainer.train`` returns.
    """
    cfg = setup(args)

    if not args.eval_only:
        # For anything fancier than the standard training logic, write your
        # own training loop or subclass the trainer.
        trainer = Trainer(cfg)
        trainer.resume_or_load(resume=args.resume)
        if cfg.TEST.AUG.ENABLED:
            trainer.register_hooks(
                [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
            )
        return trainer.train()

    # Evaluation only: load the weights and test without building a trainer.
    model = Trainer.build_model(cfg)
    checkpointer = AdetCheckpointer(model, save_dir=cfg.OUTPUT_DIR)
    checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume)

    res = Trainer.test(cfg, model)
    if comm.is_main_process():
        verify_results(cfg, res)
    if cfg.TEST.AUG.ENABLED:
        res.update(Trainer.test_with_TTA(cfg, model))
    return res


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )