Repository: Epiphqny/CondInst
Branch: master
Commit: 7b666c32c04f
Files: 46
Total size: 123.8 KB
Directory structure:
gitextract_p4h8yn0h/
├── README.md
├── configs/
│ └── CondInst/
│ ├── Base-FCOS.yaml
│ ├── MS_R_101_3x.yaml
│ ├── MS_R_50_2x.yaml
│ ├── MS_X_101_2x.yaml
│ ├── R_50_1x.yaml
│ └── vovnet/
│ ├── MS_V_39_3x.yaml
│ ├── MS_V_57_3x.yaml
│ ├── MS_V_99_3x.yaml
│ └── README.md
├── demo/
│ ├── demo.py
│ └── predictor.py
├── fcos/
│ ├── __init__.py
│ ├── checkpoint/
│ │ ├── __init__.py
│ │ └── adet_checkpoint.py
│ ├── config/
│ │ ├── __init__.py
│ │ ├── config.py
│ │ └── defaults.py
│ ├── data/
│ │ ├── __init__.py
│ │ └── builtin.py
│ ├── layers/
│ │ ├── __init__.py
│ │ ├── conv_with_kaiming_uniform.py
│ │ ├── csrc/
│ │ │ ├── cuda_version.cu
│ │ │ ├── ml_nms/
│ │ │ │ ├── ml_nms.cu
│ │ │ │ └── ml_nms.h
│ │ │ └── vision.cpp
│ │ ├── deform_conv.py
│ │ ├── iou_loss.py
│ │ └── ml_nms.py
│ ├── modeling/
│ │ ├── __init__.py
│ │ ├── backbone/
│ │ │ ├── __init__.py
│ │ │ ├── fpn.py
│ │ │ ├── mobilenet.py
│ │ │ └── vovnet.py
│ │ ├── fcos/
│ │ │ ├── __init__.py
│ │ │ ├── fcos.py
│ │ │ └── fcos_outputs.py
│ │ ├── one_stage_detector.py
│ │ └── poolers.py
│ └── utils/
│ ├── comm.py
│ └── measures.py
├── postprocessing.py
├── tools/
│ ├── compute_flops.py
│ ├── convert_fcos_weight.py
│ └── remove_optim_from_ckpt.py
└── train_net.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
# CondInst
This repository is an unofficial pytorch implementation of [Conditional Convolutions for Instance Segmentation](https://arxiv.org/abs/2003.05664). The model with ResNet-101 backbone achieves 37.1 mAP on COCO val2017 set.
## Install
The code is based on [detectron2](https://github.com/facebookresearch/detectron2). Please check [Install.md](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md) for installation instructions.
## Training
Training follows the same procedure as detectron2.
Single GPU:
```
python train_net.py --config-file configs/CondInst/MS_R_101_3x.yaml
```
Multi GPU(for example 8):
```
python train_net.py --num-gpus 8 --config-file configs/CondInst/MS_R_101_3x.yaml
```
Please adjust the IMS_PER_BATCH in the config file according to the GPU memory.
## Notes
I have replaced the original upsample with the aligned upsample, as suggested in the [author's issue](https://github.com/Epiphqny/CondInst/issues/1), and the loss is now calculated on the upsampled mask. This brings more gains but may cost more GPU memory; if you do not have much memory, use the original non-upsampled version to calculate the loss.
## Inference
First, replace the postprocessing.py that was installed with detectron2 with the [file](https://github.com/Epiphqny/CondInst/blob/master/postprocessing.py) in this repository, because the original file is only suitable for masks obtained from ROIs.
The path should be like /miniconda3/envs/py37/lib/python3.7/site-packages/detectron2/modeling/postprocessing.py
Single GPU:
```
python train_net.py --config-file configs/CondInst/MS_R_101_3x.yaml --eval-only MODEL.WEIGHTS /path/to/checkpoint_file
```
Multi GPU(for example 8):
```
python train_net.py --num-gpus 8 --config-file configs/CondInst/MS_R_101_3x.yaml --eval-only MODEL.WEIGHTS /path/to/checkpoint_file
```
## Weights
The trained model can be downloaded from [Google Drive](https://drive.google.com/file/d/17-g91zwJzt99G8APza0IaleWYLC3kTMK/view?usp=sharing).
## Results
After training 36 epochs on the coco dataset using the resnet-101 backbone, the mAP is 0.371 on COCO val2017 dataset:
## Visualization
================================================
FILE: configs/CondInst/Base-FCOS.yaml
================================================
MODEL:
META_ARCHITECTURE: "OneStageDetector"
BACKBONE:
NAME: "build_fcos_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res3", "res4", "res5"]
PROPOSAL_GENERATOR:
NAME: "FCOS"
# PIXEL_MEAN: [102.9801, 115.9465, 122.7717]
MASK_ON: True
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 4
BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
MASK_FORMAT: "bitmask"
================================================
FILE: configs/CondInst/MS_R_101_3x.yaml
================================================
_BASE_: "Base-FCOS.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (60000, 80000)
MAX_ITER: 90000
OUTPUT_DIR: "output/fcos/R_101_3x"
================================================
FILE: configs/CondInst/MS_R_50_2x.yaml
================================================
_BASE_: "Base-FCOS.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (120000, 160000)
MAX_ITER: 180000
OUTPUT_DIR: "output/fcos/R_50_2x"
================================================
FILE: configs/CondInst/MS_X_101_2x.yaml
================================================
_BASE_: "Base-FCOS.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
PIXEL_STD: [57.375, 57.120, 58.395]
RESNETS:
STRIDE_IN_1X1: False # this is a C2 model
NUM_GROUPS: 32
WIDTH_PER_GROUP: 8
DEPTH: 101
SOLVER:
STEPS: (120000, 160000)
MAX_ITER: 180000
OUTPUT_DIR: "output/fcos/X_101_2x"
================================================
FILE: configs/CondInst/R_50_1x.yaml
================================================
_BASE_: "Base-FCOS.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
INPUT:
MIN_SIZE_TRAIN: (800,)
SOLVER:
WARMUP_METHOD: "constant"
WARMUP_FACTOR: 0.3333
WARMUP_ITERS: 500
OUTPUT_DIR: "output/fcos/R_50_1x"
================================================
FILE: configs/CondInst/vovnet/MS_V_39_3x.yaml
================================================
_BASE_: "../Base-FCOS.yaml"
MODEL:
WEIGHTS: "https://www.dropbox.com/s/q98pypf96rhtd8y/vovnet39_ese_detectron2.pth?dl=1"
BACKBONE:
NAME: "build_fcos_vovnet_fpn_backbone"
FREEZE_AT: 0
VOVNET:
CONV_BODY : "V-39-eSE"
OUT_FEATURES: ["stage3", "stage4", "stage5"]
FPN:
IN_FEATURES: ["stage3", "stage4", "stage5"]
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/fcos/V_39_ms_3x"
================================================
FILE: configs/CondInst/vovnet/MS_V_57_3x.yaml
================================================
_BASE_: "../Base-FCOS.yaml"
MODEL:
WEIGHTS: "https://www.dropbox.com/s/8xl0cb3jj51f45a/vovnet57_ese_detectron2.pth?dl=1"
BACKBONE:
NAME: "build_fcos_vovnet_fpn_backbone"
FREEZE_AT: 0
VOVNET:
CONV_BODY : "V-57-eSE"
OUT_FEATURES: ["stage3", "stage4", "stage5"]
FPN:
IN_FEATURES: ["stage3", "stage4", "stage5"]
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/fcos/V_57_ms_3x"
================================================
FILE: configs/CondInst/vovnet/MS_V_99_3x.yaml
================================================
_BASE_: "../Base-FCOS.yaml"
MODEL:
WEIGHTS: "https://www.dropbox.com/s/1mlv31coewx8trd/vovnet99_ese_detectron2.pth?dl=1"
BACKBONE:
NAME: "build_fcos_vovnet_fpn_backbone"
FREEZE_AT: 0
VOVNET:
CONV_BODY : "V-99-eSE"
OUT_FEATURES: ["stage3", "stage4", "stage5"]
FPN:
IN_FEATURES: ["stage3", "stage4", "stage5"]
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
OUTPUT_DIR: "output/fcos/V_99_ms_3x"
================================================
FILE: configs/CondInst/vovnet/README.md
================================================
# [VoVNet-v2](https://github.com/youngwanLEE/CenterMask) backbone networks in [FCOS](https://github.com/aim-uofa/adet)
**Efficient Backbone Network for Object Detection and Segmentation**\
Youngwan Lee
[[`vovnet-detectron2`](https://github.com/youngwanLEE/vovnet-detectron2)][[`CenterMask(code)`](https://github.com/youngwanLEE/CenterMask)] [[`VoVNet-v1(arxiv)`](https://arxiv.org/abs/1904.09730)] [[`VoVNet-v2(arxiv)`](https://arxiv.org/abs/1911.06667)] [[`BibTeX`](#CitingVoVNet)]
## Comparison with Faster R-CNN and ResNet
### Note
We measure the inference time of all models with batch size 1 on the same V100 GPU machine.
- pytorch1.3.1
- CUDA 10.1
- cuDNN 7.3
|Method|Backbone|lr sched|inference time|AP|APs|APm|APl|download|
|---|:--------:|:---:|:--:|--|----|----|---|--------|
|Faster|R-50-FPN|3x|0.047|40.2|24.2|43.5|52.0|model \| metrics
|Faster|**V2-39-FPN**|3x|0.047|42.7|27.1|45.6|54.0|model \| metrics
|**FCOS**|**V2-39-FPN**|3x|0.045|43.5|28.1|47.2|54.5|model \| metrics
||
|Faster|R-101-FPN|3x|0.063|42.0|25.2|45.6|54.6|model \| metrics
|Faster|**V2-57-FPN**|3x|0.054|43.3|27.5|46.7|55.3|model \| metrics
|**FCOS**|**V2-57-FPN**|3x|0.051|44.4|28.8|47.2|56.3|model \| metrics
||
|Faster|X-101-FPN|3x|0.120|43.0|27.2|46.1|54.9|model \| metrics|
|Faster|**V2-99-FPN**|3x|0.073|44.1|28.1|47.0|56.4|model \| metrics|
|**FCOS**|**V2-99-FPN**|3x|0.070|45.2|29.2|48.4|57.3|model \| metrics|
## Citing VoVNet
If you use VoVNet, please use the following BibTeX entry.
```BibTeX
@inproceedings{lee2019energy,
title = {An Energy and GPU-Computation Efficient Backbone Network for Real-Time Object Detection},
author = {Lee, Youngwan and Hwang, Joong-won and Lee, Sangrok and Bae, Yuseok and Park, Jongyoul},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops},
year = {2019}
}
@article{lee2019centermask,
title={CenterMask: Real-Time Anchor-Free Instance Segmentation},
author={Lee, Youngwan and Park, Jongyoul},
journal={arXiv preprint arXiv:1911.06667},
year={2019}
}
```
================================================
FILE: demo/demo.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import argparse
import glob
import multiprocessing as mp
import os
import time
import cv2
import tqdm
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from predictor import VisualizationDemo
from adet.config import get_cfg
# constants
WINDOW_NAME = "COCO detections"
def setup_cfg(args):
    """
    Build the frozen config used by the demo.

    The default config is overlaid with ``args.config_file`` and then with the
    ``KEY VALUE`` pairs in ``args.opts``. Afterwards every score threshold the
    demo knows about is set to ``args.confidence_threshold``.

    Args:
        args: parsed command-line arguments (see ``get_parser``).

    Returns:
        the frozen config node.
    """
    cfg = get_cfg()
    # Later merges win: file first, then command-line overrides.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # One threshold drives every head that may be present in the model.
    threshold = args.confidence_threshold
    model_cfg = cfg.MODEL
    model_cfg.RETINANET.SCORE_THRESH_TEST = threshold
    model_cfg.ROI_HEADS.SCORE_THRESH_TEST = threshold
    model_cfg.FCOS.INFERENCE_TH_TEST = threshold
    model_cfg.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = threshold

    cfg.freeze()
    return cfg
def get_parser():
    """
    Create the command-line parser for the demo script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--config-file``,
        ``--webcam``, ``--video-input``, ``--input``, ``--output``,
        ``--confidence-threshold`` and ``--opts``.
    """
    cli = argparse.ArgumentParser(description="Detectron2 Demo")
    add = cli.add_argument

    add(
        "--config-file",
        default="configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_inference_acc_test.yaml",
        metavar="FILE",
        help="path to config file",
    )

    # Input sources.
    add("--webcam", action="store_true", help="Take inputs from webcam.")
    add("--video-input", help="Path to video file.")
    add("--input", nargs="+", help="A list of space separated input images")

    add(
        "--output",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )
    add(
        "--confidence-threshold",
        type=float,
        default=0.5,
        help="Minimum score for instance predictions to be shown",
    )
    # REMAINDER: everything after --opts is collected verbatim.
    add(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        nargs=argparse.REMAINDER,
    )
    return cli
if __name__ == "__main__":
    # Script entry point: run the model on images, a webcam stream or a video
    # file and either display or save the visualizations.
    mp.set_start_method("spawn", force=True)
    args = get_parser().parse_args()
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg)

    if args.input:
        # A single argument may be a directory or a glob pattern.
        if os.path.isdir(args.input[0]):
            args.input = [os.path.join(args.input[0], fname) for fname in os.listdir(args.input[0])]
        elif len(args.input) == 1:
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"
        for path in tqdm.tqdm(args.input, disable=not args.output):
            # use PIL, to be consistent with evaluation
            img = read_image(path, format="BGR")
            start_time = time.time()
            predictions, visualized_output = demo.run_on_image(img)
            logger.info(
                "{}: detected {} instances in {:.2f}s".format(
                    path, len(predictions["instances"]), time.time() - start_time
                )
            )

            if args.output:
                if os.path.isdir(args.output):
                    # Keep the input file name inside the output directory.
                    out_filename = os.path.join(args.output, os.path.basename(path))
                else:
                    assert len(args.input) == 1, "Please specify a directory with args.output"
                    out_filename = args.output
                visualized_output.save(out_filename)
            else:
                # get_image() is RGB; reverse the channel axis for OpenCV display.
                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
                if cv2.waitKey(0) == 27:
                    break  # esc to quit
    elif args.webcam:
        assert args.input is None, "Cannot have both --input and --webcam!"
        cam = cv2.VideoCapture(0)
        for vis in tqdm.tqdm(demo.run_on_video(cam)):
            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
            cv2.imshow(WINDOW_NAME, vis)
            if cv2.waitKey(1) == 27:
                break  # esc to quit
        # Release the capture device so the camera is not left held open.
        cam.release()
        cv2.destroyAllWindows()
    elif args.video_input:
        video = cv2.VideoCapture(args.video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames_per_second = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        basename = os.path.basename(args.video_input)

        if args.output:
            if os.path.isdir(args.output):
                output_fname = os.path.join(args.output, basename)
                output_fname = os.path.splitext(output_fname)[0] + ".mkv"
            else:
                output_fname = args.output
            # Refuse to overwrite an existing file.
            assert not os.path.isfile(output_fname), output_fname
            output_file = cv2.VideoWriter(
                filename=output_fname,
                # some installation of opencv may not support x264 (due to its license),
                # you can try other format (e.g. MPEG)
                fourcc=cv2.VideoWriter_fourcc(*"x264"),
                fps=float(frames_per_second),
                frameSize=(width, height),
                isColor=True,
            )
        assert os.path.isfile(args.video_input)
        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
            if args.output:
                output_file.write(vis_frame)
            else:
                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
                cv2.imshow(basename, vis_frame)
                if cv2.waitKey(1) == 27:
                    break  # esc to quit
        video.release()
        if args.output:
            output_file.release()
        else:
            cv2.destroyAllWindows()
================================================
FILE: demo/predictor.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
import matplotlib.pyplot as plt
from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
class VisualizationDemo(object):
    """
    Runs a predictor on images or video frames and draws its predictions.
    """

    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode):
            instance_mode (ColorMode):
            parallel (bool): whether to run the model in different processes from visualization.
                Useful since the visualization logic can be slow.
        """
        # Metadata of the first test dataset; "__unused" when none is configured.
        self.metadata = MetadataCatalog.get(
            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        )
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode

        self.parallel = parallel
        if parallel:
            num_gpu = torch.cuda.device_count()
            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
        else:
            self.predictor = DefaultPredictor(cfg)

    def run_on_image(self, image):
        """
        Args:
            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.

        Returns:
            predictions (dict): the output of the model.
            vis_output (VisImage): the visualized image output, or None when
                the predictions contain none of the keys handled below.
        """
        vis_output = None
        predictions = self.predictor(image)
        # Convert image from OpenCV BGR format to Matplotlib RGB format.
        image = image[:, :, ::-1]
        visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)

        if "inst" in predictions:
            # NOTE(review): vis_inst is not defined in this file; presumably it
            # comes from a customized Visualizer -- confirm.
            visualizer.vis_inst(predictions["inst"])
        if "bases" in predictions:
            self.vis_bases(predictions["bases"])
        if "panoptic_seg" in predictions:
            panoptic_seg, segments_info = predictions["panoptic_seg"]
            vis_output = visualizer.draw_panoptic_seg_predictions(
                panoptic_seg.to(self.cpu_device), segments_info
            )
        else:
            if "sem_seg" in predictions:
                vis_output = visualizer.draw_sem_seg(
                    predictions["sem_seg"].argmax(dim=0).to(self.cpu_device))
            if "instances" in predictions:
                instances = predictions["instances"].to(self.cpu_device)
                vis_output = visualizer.draw_instance_predictions(predictions=instances)

        return predictions, vis_output

    def _frame_from_video(self, video):
        """Yield frames from ``video`` until it closes or a read fails."""
        while video.isOpened():
            success, frame = video.read()
            if success:
                yield frame
            else:
                break

    def vis_bases(self, bases):
        """
        Show every basis map of the first element of ``bases`` in a two-column
        matplotlib grid.

        Each basis is squashed with ``tanh``, rescaled by its maximum and used
        as the V channel of an HSV image whose H and S come from a fixed
        palette; the palette is reused cyclically when there are more bases
        than palette entries.

        Args:
            bases: indexable whose first element is a tensor of basis maps.
        """
        basis_colors = [[2, 200, 255], [107, 220, 255], [30, 200, 255], [60, 220, 255]]
        bases = bases[0].squeeze()
        bases = (bases / 8).tanh().cpu().numpy()
        num_bases = len(bases)
        # Round up so an odd number of bases still gets a row, and keep the
        # axes array 2-D (squeeze=False) so [row][col] indexing works even
        # when there is only a single row.
        num_rows = max((num_bases + 1) // 2, 1)
        _, axes = plt.subplots(nrows=num_rows, ncols=2, squeeze=False)
        for i, basis in enumerate(bases):
            color = basis_colors[i % len(basis_colors)]
            basis = (basis + 1) / 2
            basis = basis / basis.max()
            basis_viz = np.zeros((basis.shape[0], basis.shape[1], 3), dtype=np.uint8)
            basis_viz[:, :, 0] = color[0]
            basis_viz[:, :, 1] = color[1]
            basis_viz[:, :, 2] = np.uint8(basis * 255)
            basis_viz = cv2.cvtColor(basis_viz, cv2.COLOR_HSV2RGB)
            axes[i // 2][i % 2].imshow(basis_viz)
        plt.show()

    def run_on_video(self, video):
        """
        Visualizes predictions on frames of the input video.

        Args:
            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
                either a webcam or a video file.

        Yields:
            ndarray: BGR visualizations of each video frame.
        """
        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)

        def process_predictions(frame, predictions):
            # Swap the channel order of the frame before it is drawn on.
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            if "panoptic_seg" in predictions:
                panoptic_seg, segments_info = predictions["panoptic_seg"]
                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
                    frame, panoptic_seg.to(self.cpu_device), segments_info
                )
            elif "instances" in predictions:
                predictions = predictions["instances"].to(self.cpu_device)
                vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
            elif "sem_seg" in predictions:
                vis_frame = video_visualizer.draw_sem_seg(
                    frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
                )

            # Converts Matplotlib RGB format to OpenCV BGR format
            vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
            return vis_frame

        frame_gen = self._frame_from_video(video)
        if self.parallel:
            buffer_size = self.predictor.default_buffer_size

            # Frames wait here, in submission order, until their result is ready.
            frame_data = deque()

            for cnt, frame in enumerate(frame_gen):
                frame_data.append(frame)
                self.predictor.put(frame)

                # Start consuming only once the buffer has filled up.
                if cnt >= buffer_size:
                    frame = frame_data.popleft()
                    predictions = self.predictor.get()
                    yield process_predictions(frame, predictions)

            # Drain whatever is still in flight.
            while len(frame_data):
                frame = frame_data.popleft()
                predictions = self.predictor.get()
                yield process_predictions(frame, predictions)
        else:
            for frame in frame_gen:
                yield process_predictions(frame, self.predictor(frame))
class AsyncPredictor:
    """
    A predictor that runs the model asynchronously, possibly on >1 GPUs.
    Because rendering the visualization takes considerably amount of time,
    this helps improve throughput when rendering videos.
    """

    class _StopToken:
        """Marker placed on the task queue to tell a worker to exit."""

    class _PredictWorker(mp.Process):
        """Process that pulls tasks, runs the predictor and pushes results."""

        def __init__(self, cfg, task_queue, result_queue):
            self.cfg = cfg
            self.task_queue = task_queue
            self.result_queue = result_queue
            super().__init__()

        def run(self):
            # The predictor is created inside run(), i.e. in the worker process.
            model = DefaultPredictor(self.cfg)
            while True:
                job = self.task_queue.get()
                if isinstance(job, AsyncPredictor._StopToken):
                    break
                job_id, payload = job
                self.result_queue.put((job_id, model(payload)))

    def __init__(self, cfg, num_gpus: int = 1):
        """
        Args:
            cfg (CfgNode):
            num_gpus (int): if 0, will run on CPU
        """
        worker_count = max(num_gpus, 1)
        queue_capacity = worker_count * 3
        self.task_queue = mp.Queue(maxsize=queue_capacity)
        self.result_queue = mp.Queue(maxsize=queue_capacity)

        # One worker per device, each with its own config copy.
        self.procs = []
        for device_index in range(worker_count):
            cfg = cfg.clone()
            cfg.defrost()
            if num_gpus > 0:
                cfg.MODEL.DEVICE = "cuda:{}".format(device_index)
            else:
                cfg.MODEL.DEVICE = "cpu"
            worker = AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
            self.procs.append(worker)

        self.put_idx = 0
        self.get_idx = 0
        # Out-of-order results, kept sorted by task index (parallel lists).
        self.result_rank = []
        self.result_data = []

        for worker in self.procs:
            worker.start()
        atexit.register(self.shutdown)

    def put(self, image):
        """Submit ``image`` as the next task; task indices start at 1."""
        self.put_idx += 1
        self.task_queue.put((self.put_idx, image))

    def get(self):
        """Return results in submission order, blocking until the next one arrives."""
        self.get_idx += 1  # the index needed for this request
        wanted = self.get_idx

        # The wanted result may already have arrived out of order.
        if self.result_rank and self.result_rank[0] == wanted:
            self.result_rank.pop(0)
            return self.result_data.pop(0)

        while True:
            # make sure the results are returned in the correct order
            arrived_idx, arrived = self.result_queue.get()
            if arrived_idx == wanted:
                return arrived
            # Not ours yet: stash it at its sorted position.
            position = bisect.bisect(self.result_rank, arrived_idx)
            self.result_rank.insert(position, arrived_idx)
            self.result_data.insert(position, arrived)

    def __len__(self):
        """Number of tasks submitted but not yet retrieved."""
        return self.put_idx - self.get_idx

    def __call__(self, image):
        """Submit ``image`` and wait for its result."""
        self.put(image)
        return self.get()

    def shutdown(self):
        """Queue one stop token per worker."""
        for _ in self.procs:
            self.task_queue.put(AsyncPredictor._StopToken())

    @property
    def default_buffer_size(self):
        """Five buffered frames per worker process."""
        return len(self.procs) * 5
================================================
FILE: fcos/__init__.py
================================================
from fcos import modeling
__version__ = "0.1.1"
================================================
FILE: fcos/checkpoint/__init__.py
================================================
from .adet_checkpoint import AdetCheckpointer
__all__ = ["AdetCheckpointer"]
================================================
FILE: fcos/checkpoint/adet_checkpoint.py
================================================
import pickle
from fvcore.common.file_io import PathManager
from detectron2.checkpoint import DetectionCheckpointer
class AdetCheckpointer(DetectionCheckpointer):
    """
    Same as :class:`DetectronCheckpointer`, but is able to convert models
    in AdelaiDet, such as LPF backbone.
    """

    def _load_file(self, filename):
        """
        Load a checkpoint file and return it as a dict with a ``"model"`` entry.

        ``.pkl`` files are unpickled here; every other file is delegated to
        the parent class.
        """
        if not filename.endswith(".pkl"):
            checkpoint = super()._load_file(filename)  # load native pth checkpoint
            if "model" not in checkpoint:
                # Bare state dict: wrap it in the expected layout.
                checkpoint = {"model": checkpoint}
            if "lpf" in filename:
                checkpoint["matching_heuristics"] = True
            return checkpoint

        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with PathManager.open(filename, "rb") as handle:
            payload = pickle.load(handle, encoding="latin1")

        if "model" in payload and "__author__" in payload:
            # file is in Detectron2 model zoo format
            self.logger.info("Reading a file from '{}'".format(payload["__author__"]))
            return payload

        # assume file is from Caffe2 / Detectron1 model zoo
        # Detection models have "blobs", but ImageNet models don't
        weights = payload["blobs"] if "blobs" in payload else payload
        # Drop the entries whose names end in "_momentum".
        weights = {
            name: blob for name, blob in weights.items() if not name.endswith("_momentum")
        }
        return {"model": weights, "__author__": "Caffe2", "matching_heuristics": True}
================================================
FILE: fcos/config/__init__.py
================================================
from .config import get_cfg
__all__ = [
"get_cfg",
]
================================================
FILE: fcos/config/config.py
================================================
from detectron2.config import CfgNode
def get_cfg() -> CfgNode:
    """
    Return a fresh copy of the project's default config.

    Returns:
        a detectron2 CfgNode instance.
    """
    # Imported lazily, inside the function, exactly as the module did before.
    from .defaults import _C as default_cfg

    return default_cfg.clone()
================================================
FILE: fcos/config/defaults.py
================================================
from detectron2.config.defaults import _C
from detectron2.config import CfgNode as CN
# ---------------------------------------------------------------------------- #
# Additional Configs
# ---------------------------------------------------------------------------- #
# NOTE(review): nothing in this file reads MOBILENET; presumably it selects a
# MobileNet backbone elsewhere in the project -- confirm against its consumer.
_C.MODEL.MOBILENET = False
# ---------------------------------------------------------------------------- #
# FCOS Head
# ---------------------------------------------------------------------------- #
# New config node that groups every FCOS option defined below.
_C.MODEL.FCOS = CN()
# This is the number of foreground classes.
_C.MODEL.FCOS.NUM_CLASSES = 80
# Five feature names and five strides; presumably paired index by index -- confirm.
_C.MODEL.FCOS.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
_C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
_C.MODEL.FCOS.PRIOR_PROB = 0.01
# Separate *_TRAIN / *_TEST values are kept for the thresholds and top-k limits below.
_C.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.05
_C.MODEL.FCOS.INFERENCE_TH_TEST = 0.05
# presumably the IoU threshold used by NMS -- TODO confirm
_C.MODEL.FCOS.NMS_TH = 0.6
_C.MODEL.FCOS.PRE_NMS_TOPK_TRAIN = 1000
_C.MODEL.FCOS.PRE_NMS_TOPK_TEST = 1000
_C.MODEL.FCOS.POST_NMS_TOPK_TRAIN = 100
_C.MODEL.FCOS.POST_NMS_TOPK_TEST = 100
_C.MODEL.FCOS.TOP_LEVELS = 2
_C.MODEL.FCOS.NORM = "GN" # Support GN or none
_C.MODEL.FCOS.USE_SCALE = True
# Multiply centerness before threshold
# This will affect the final performance by about 0.05 AP but save some time
_C.MODEL.FCOS.THRESH_WITH_CTR = False
# Focal loss parameters
_C.MODEL.FCOS.LOSS_ALPHA = 0.25
_C.MODEL.FCOS.LOSS_GAMMA = 2.0
# Four boundaries for five feature levels; presumably the size ranges that
# assign objects to levels -- TODO confirm.
_C.MODEL.FCOS.SIZES_OF_INTEREST = [64, 128, 256, 512]
_C.MODEL.FCOS.USE_RELU = True
_C.MODEL.FCOS.USE_DEFORMABLE = False
# the number of convolutions used in the cls and bbox tower
_C.MODEL.FCOS.NUM_CLS_CONVS = 4
_C.MODEL.FCOS.NUM_BOX_CONVS = 4
_C.MODEL.FCOS.NUM_SHARE_CONVS = 0
_C.MODEL.FCOS.CENTER_SAMPLE = True
_C.MODEL.FCOS.POS_RADIUS = 1.5
_C.MODEL.FCOS.LOC_LOSS_TYPE = 'giou'
# ---------------------------------------------------------------------------- #
# VoVNet backbone
# ---------------------------------------------------------------------------- #
# New config node that groups every VoVNet option defined below.
_C.MODEL.VOVNET = CN()
_C.MODEL.VOVNET.CONV_BODY = "V-39-eSE"
_C.MODEL.VOVNET.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
# Options: FrozenBN, GN, "SyncBN", "BN"
_C.MODEL.VOVNET.NORM = "FrozenBN"
_C.MODEL.VOVNET.OUT_CHANNELS = 256
_C.MODEL.VOVNET.BACKBONE_OUT_CHANNELS = 256
================================================
FILE: fcos/data/__init__.py
================================================
from . import builtin # ensure the builtin datasets are registered
# from .dataset_mapper import DatasetMapperWithBasis
# __all__ = ["DatasetMapperWithBasis"]
================================================
FILE: fcos/data/builtin.py
================================================
import os
from detectron2.data.datasets.register_coco import register_coco_instances
# register person in context dataset
# Dataset name -> (image directory, annotation json file).
# Both paths are relative; register_all_coco joins them onto its `root` argument.
_PREDEFINED_SPLITS_PIC = {
"pic_person_train": ("pic/image/train", "pic/annotations/train_person.json"),
"pic_person_val": ("pic/image/val", "pic/annotations/val_person.json"),
}
# Metadata passed to register_coco_instances for every split above.
metadata = {
"thing_classes": ["person"]
}
def register_all_coco(root="datasets"):
    """
    Register every split listed in ``_PREDEFINED_SPLITS_PIC`` as a COCO-format
    dataset.

    Args:
        root (str): directory that the image and annotation paths are
            relative to.
    """
    for name, paths in _PREDEFINED_SPLITS_PIC.items():
        image_dir, annotation = paths
        # Assume pre-defined datasets live in `./datasets`.
        # An annotation location containing "://" is passed through untouched.
        if "://" in annotation:
            annotation_path = annotation
        else:
            annotation_path = os.path.join(root, annotation)
        register_coco_instances(
            name,
            metadata,
            annotation_path,
            os.path.join(root, image_dir),
        )
register_all_coco()
================================================
FILE: fcos/layers/__init__.py
================================================
from .deform_conv import DFConv2d
from .ml_nms import ml_nms
from .iou_loss import IOULoss
from .conv_with_kaiming_uniform import conv_with_kaiming_uniform
__all__ = [k for k in globals().keys() if not k.startswith("_")]
================================================
FILE: fcos/layers/conv_with_kaiming_uniform.py
================================================
from torch import nn
from detectron2.layers import Conv2d
from .deform_conv import DFConv2d
from detectron2.layers.batch_norm import get_norm
def conv_with_kaiming_uniform(
        norm=None, activation=None,
        use_deformable=False, use_sep=False):
    """
    Return a factory that builds a convolution layer, optionally followed by a
    normalization layer and a ReLU.

    Args:
        norm: ``None`` for no normalization, ``"GN"`` for a 32-group
            ``nn.GroupNorm``, anything else is handed to ``get_norm``.
        activation: when not ``None`` a ``nn.ReLU`` is appended.
        use_deformable (bool): build the conv with ``DFConv2d`` instead of
            ``Conv2d``.
        use_sep (bool): use one group per channel; requires
            ``in_channels == out_channels``.

    Returns:
        callable ``make_conv(in_channels, out_channels, kernel_size,
        stride=1, dilation=1)`` returning the conv itself, or an
        ``nn.Sequential`` when extra layers were requested.
    """
    def make_conv(
        in_channels, out_channels, kernel_size, stride=1, dilation=1
    ):
        conv_func = DFConv2d if use_deformable else Conv2d

        if use_sep:
            assert in_channels == out_channels
            groups = in_channels
        else:
            groups = 1

        without_norm = norm is None
        conv = conv_func(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=dilation * (kernel_size - 1) // 2,
            dilation=dilation,
            groups=groups,
            # The conv carries a bias only when no norm layer follows it.
            bias=without_norm
        )
        if not use_deformable:
            # Caffe2 implementation uses XavierFill, which in fact
            # corresponds to kaiming_uniform_ in PyTorch
            nn.init.kaiming_uniform_(conv.weight, a=1)
            if without_norm:
                nn.init.constant_(conv.bias, 0)

        layers = [conv]
        if not without_norm:
            if norm == "GN":
                layers.append(nn.GroupNorm(32, out_channels))
            else:
                layers.append(get_norm(norm, out_channels))
        if activation is not None:
            layers.append(nn.ReLU(inplace=True))

        # A lone conv is returned unwrapped.
        return nn.Sequential(*layers) if len(layers) > 1 else conv

    return make_conv
================================================
FILE: fcos/layers/csrc/cuda_version.cu
================================================
#include
namespace adet {
// Returns the value of the CUDART_VERSION macro as seen when this file was
// compiled.
int get_cudart_version() {
return CUDART_VERSION;
}
} // namespace adet
================================================
FILE: fcos/layers/csrc/ml_nms/ml_nms.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include
#include
#include
#include
#include
#include
int const threadsPerBlock = sizeof(unsigned long long) * 8;
// IoU of two boxes stored as 6 floats each. Elements 0-3 are read as
// x1, y1, x2, y2 with inclusive pixel coordinates (hence the "+ 1"),
// element 5 is compared for equality, element 4 is not read here.
// Boxes whose element 5 differs never overlap: the result is 0.
__device__ inline float devIoU(float const * const a, float const * const b) {
  if (a[5] != b[5]) {
    return 0.0f;
  }

  // Extent of the intersection rectangle, clamped at zero when disjoint.
  const float inter_w = max(min(a[2], b[2]) - max(a[0], b[0]) + 1, 0.f);
  const float inter_h = max(min(a[3], b[3]) - max(a[1], b[1]) + 1, 0.f);
  const float inter_area = inter_w * inter_h;

  const float area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  const float area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);

  // intersection over union
  return inter_area / (area_a + area_b - inter_area);
}
// Builds the pairwise suppression bitmask. Block (x, y) compares the boxes of
// row tile `blockIdx.y` against the boxes of column tile `blockIdx.x`; each
// thread handles one row box and writes one 64-bit word whose bit i is set
// when the row box overlaps column box i of the tile by more than
// `nms_overlap_thresh`. No tile is skipped, including those below the diagonal.
//
// dev_boxes: n_boxes records of 6 floats each.
// dev_mask:  n_boxes * ceil(n_boxes / threadsPerBlock) words, row-major.
__global__ void ml_nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                              const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // The last tile in each direction may be only partially filled.
  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage the column tile in shared memory: one 6-float record per thread.
  __shared__ float block_boxes[threadsPerBlock * 6];
  if (threadIdx.x < col_size) {
    const float *src =
        dev_boxes + (threadsPerBlock * col_start + threadIdx.x) * 6;
    float *dst = block_boxes + threadIdx.x * 6;
    for (int k = 0; k < 6; ++k) {
      dst[k] = src[k];
    }
  }
  __syncthreads();

  // Threads past the end of the row tile have nothing to write.
  if (threadIdx.x >= row_size) {
    return;
  }

  const int row_box_idx = threadsPerBlock * row_start + threadIdx.x;
  const float *row_box = dev_boxes + row_box_idx * 6;

  // On a diagonal tile only compare against boxes that come after this one.
  int first_col = 0;
  if (row_start == col_start) {
    first_col = threadIdx.x + 1;
  }

  unsigned long long overlap_bits = 0;
  for (int col = first_col; col < col_size; ++col) {
    if (devIoU(row_box, block_boxes + col * 6) > nms_overlap_thresh) {
      overlap_bits |= 1ULL << col;
    }
  }

  const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
  dev_mask[row_box_idx * col_blocks + col_start] = overlap_bits;
}
namespace adet {

// Multi-label NMS on the GPU.
//
// boxes is a N x 6 tensor. Column 4 is used as the score for sorting; the
// kernel's IoU helper reads columns 0-3 as the box and compares column 5, so
// boxes with different column-5 values never suppress each other.
//
// Returns a 1-D int64 tensor holding the indices (into the unsorted `boxes`)
// of the boxes that survive, in ascending index order, on the device of the
// sort-order tensor.
at::Tensor ml_nms_cuda(const at::Tensor boxes, const float nms_overlap_thresh) {
  using scalar_t = float;
  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");

  // Process boxes from highest to lowest score.
  auto scores = boxes.select(1, 4);
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  int boxes_num = boxes.size(0);

  // Number of 64-bit mask words per box row.
  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);

  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();

  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState

  unsigned long long* mask_dev = NULL;
  //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
  //                      boxes_num * col_blocks * sizeof(unsigned long long)));

  mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));

  // One thread block per (row tile, column tile) pair, one thread per row box.
  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
              THCCeilDiv(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  ml_nms_kernel<<<blocks, threads>>>(boxes_num,
                                     nms_overlap_thresh,
                                     boxes_dev,
                                     mask_dev);

  // Bring the pairwise overlap mask back to the host for the greedy pass.
  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  THCudaCheck(cudaMemcpy(&mask_host[0],
                         mask_dev,
                         sizeof(unsigned long long) * boxes_num * col_blocks,
                         cudaMemcpyDeviceToHost));

  // remv: one bit per box, set once a kept box has suppressed it.
  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      // Box i survives: keep it and mark everything it overlaps as removed.
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  THCudaFree(state, mask_dev);
  // TODO improve this part
  // Map positions in the sorted order back to original indices, then sort
  // those indices ascending.
  return std::get<0>(order_t.index({
                       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
                         order_t.device(), keep.scalar_type())
                     }).sort(0, false));
}

} // namespace adet
================================================
FILE: fcos/layers/csrc/ml_nms/ml_nms.h
================================================
#pragma once
#include
namespace adet {

#ifdef WITH_CUDA
// CUDA implementation; takes the boxes, scores and labels already packed into
// a single N x 6 tensor.
at::Tensor ml_nms_cuda(
    const at::Tensor dets,
    const float threshold);
#endif

// Multi-label NMS entry point exposed to Python.
//
// dets      : boxes, concatenated along dim 1 with the two tensors below
// scores    : 1-D tensor, unsqueezed to a column before concatenation
// labels    : 1-D tensor, unsqueezed to a column before concatenation
// threshold : overlap threshold forwarded to the CUDA implementation
//
// Returns the tensor produced by ml_nms_cuda, or an empty CPU int64 tensor when
// `dets` has no elements. Raises (AT_ERROR) for CPU inputs and for CUDA inputs
// when the extension was built without WITH_CUDA.
//
// Declared `inline` because the definition lives in a header: without it,
// including this header from a second translation unit would produce duplicate
// symbol definitions at link time.
inline at::Tensor ml_nms(const at::Tensor& dets,
                         const at::Tensor& scores,
                         const at::Tensor& labels,
                         const float threshold) {
  if (dets.type().is_cuda()) {
#ifdef WITH_CUDA
    // Nothing to suppress; skip the concatenation and the kernel launch.
    if (dets.numel() == 0)
      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
    // Pack everything into one tensor: box columns, then score, then label.
    auto b = at::cat({dets, scores.unsqueeze(1), labels.unsqueeze(1)}, 1);
    return ml_nms_cuda(b, threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("CPU version not implemented");
}

} // namespace adet
================================================
FILE: fcos/layers/csrc/vision.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "ml_nms/ml_nms.h"
namespace adet {

#ifdef WITH_CUDA
extern int get_cudart_version();
#endif

// Formats the value reported by get_cudart_version() as "major.minor[.patch]",
// or returns "not available" when built without WITH_CUDA.
std::string get_cuda_version() {
#ifdef WITH_CUDA
  // Formatting rule copied from
  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
  const int version = get_cudart_version();
  const int patch = version % 10;
  std::ostringstream out;
  out << (version / 1000) << "." << (version / 10 % 100);
  // The patch component is only printed when it is non-zero.
  if (patch != 0) {
    out << "." << patch;
  }
  return out.str();
#else
  return std::string("not available");
#endif
}

// Describes the compiler used to build the extension, similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
// The result is empty when none of the known compiler macros is defined.
std::string get_compiler_version() {
  std::ostringstream out;

  // clang also defines __GNUC__, so only report GCC when clang is absent.
#if defined(__GNUC__) && !defined(__clang__)
  out << "GCC " << __GNUC__ << "." << __GNUC_MINOR__;
#endif

#if defined(__clang_major__)
  out << "clang " << __clang_major__ << "." << __clang_minor__ << "."
      << __clang_patchlevel__;
#endif

#if defined(_MSC_VER)
  out << "MSVC " << _MSC_FULL_VER;
#endif

  return out.str();
}

// Python bindings: only the multi-label NMS op is exported.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("ml_nms", &ml_nms, "Multi-Label NMS");
}

} // namespace adet
================================================
FILE: fcos/layers/deform_conv.py
================================================
import torch
from torch import nn
from detectron2.layers import Conv2d
class _NewEmptyTensorOp(torch.autograd.Function):
@staticmethod
def forward(ctx, x, new_shape):
ctx.shape = x.shape
return x.new_empty(new_shape)
@staticmethod
def backward(ctx, grad):
shape = ctx.shape
return _NewEmptyTensorOp.apply(grad, shape), None
class DFConv2d(nn.Module):
    """
    Deformable convolutional layer with configurable
    deformable groups, dilations and groups.

    A regular convolution (``self.offset``) predicts the sampling offsets (and,
    for the modulated variant, the modulation mask) that are then fed to the
    deformable convolution (``self.conv``).

    Code is from:
    https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/layers/misc.py

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        with_modulated_dcn: use ``ModulatedDeformConv`` (offsets + mask) when
            True, plain ``DeformConv`` (offsets only) otherwise.
        kernel_size, stride, dilation: ints, or 2-element lists/tuples. When
            ``kernel_size`` is a sequence, ``stride`` and ``dilation`` must be
            sequences too.
        groups: groups of the deformable convolution.
        deformable_groups: number of deformable groups.
        bias: whether the deformable convolution has a bias.
        padding: accepted for signature compatibility with ``nn.Conv2d`` but
            ignored; the padding is always derived from ``kernel_size`` and
            ``dilation``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        with_modulated_dcn=True,
        kernel_size=3,
        stride=1,
        groups=1,
        dilation=1,
        deformable_groups=1,
        bias=False,
        padding=None
    ):
        super(DFConv2d, self).__init__()
        if isinstance(kernel_size, (list, tuple)):
            assert isinstance(stride, (list, tuple))
            assert isinstance(dilation, (list, tuple))
            assert len(kernel_size) == 2
            assert len(stride) == 2
            assert len(dilation) == 2
            padding = (
                dilation[0] * (kernel_size[0] - 1) // 2,
                dilation[1] * (kernel_size[1] - 1) // 2
            )
            offset_base_channels = kernel_size[0] * kernel_size[1]
        else:
            padding = dilation * (kernel_size - 1) // 2
            offset_base_channels = kernel_size * kernel_size

        # The deformable ops come from detectron2. Importing them from
        # ``.deform_conv`` would import from this very module, which defines
        # neither class.
        if with_modulated_dcn:
            from detectron2.layers import ModulatedDeformConv
            offset_channels = offset_base_channels * 3  # default: 27
            conv_block = ModulatedDeformConv
        else:
            from detectron2.layers import DeformConv
            offset_channels = offset_base_channels * 2  # default: 18
            conv_block = DeformConv

        self.offset = Conv2d(
            in_channels,
            deformable_groups * offset_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=1,
            dilation=dilation
        )
        nn.init.kaiming_uniform_(self.offset.weight, a=1)
        torch.nn.init.constant_(self.offset.bias, 0.)

        self.conv = conv_block(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            deformable_groups=deformable_groups,
            bias=bias
        )
        self.with_modulated_dcn = with_modulated_dcn
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        # Channel index separating the offsets from the modulation mask in the
        # output of ``self.offset``.
        self.offset_split = offset_base_channels * deformable_groups * 2

    @staticmethod
    def _pair(value):
        """Return ``value`` as a 2-tuple (ints are duplicated)."""
        if isinstance(value, (list, tuple)):
            return tuple(value)
        return (value, value)

    def forward(self, x, return_offset=False):
        """Apply the deformable convolution.

        Args:
            x: input feature map; the last two dimensions are treated as the
                spatial ones.
            return_offset: when True (and ``x`` is non-empty) also return the
                raw output of the offset branch.

        Returns:
            The convolved tensor, or ``(tensor, offset_mask)`` when
            ``return_offset`` is set. For an empty input an uninitialised
            tensor of the expected output shape is returned instead.
        """
        if x.numel() > 0:
            offset_mask = self.offset(x)
            if not self.with_modulated_dcn:
                x = self.conv(x, offset_mask)
            else:
                offset = offset_mask[:, :self.offset_split, :, :]
                mask = offset_mask[:, self.offset_split:, :, :].sigmoid()
                x = self.conv(x, offset, mask)
            if return_offset:
                return x, offset_mask
            return x

        # Empty input: compute the output shape analytically. The
        # hyper-parameters may be plain ints (the default), so normalise them
        # to pairs before zipping over the two spatial dimensions.
        output_shape = [
            (i + 2 * p - (di * (k - 1) + 1)) // d + 1
            for i, p, di, k, d in zip(
                x.shape[-2:],
                self._pair(self.padding),
                self._pair(self.dilation),
                self._pair(self.kernel_size),
                self._pair(self.stride)
            )
        ]
        output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape
        return _NewEmptyTensorOp.apply(x, output_shape)
================================================
FILE: fcos/layers/iou_loss.py
================================================
import torch
from torch import nn
class IOULoss(nn.Module):
    """
    Intersection over Union (IoU) loss which supports three
    different IoU computations:
    * IoU          (``'iou'``):        -log(IoU)
    * Linear IoU   (``'linear_iou'``): 1 - IoU
    * gIoU         (``'giou'``):       1 - gIoU

    Boxes are given as four per-row values (left, top, right, bottom) whose
    sums along each axis give the box width and height.
    """

    def __init__(self, loc_loss_type='iou'):
        super(IOULoss, self).__init__()
        self.loc_loss_type = loc_loss_type

    def forward(self, pred, target, weight=None):
        """
        Args:
            pred: Nx4 predicted bounding boxes
            target: Nx4 target bounding boxes
            weight: N loss weight for each instance

        Returns:
            The (optionally weighted) sum of the per-box losses.

        Raises:
            NotImplementedError: if ``loc_loss_type`` is not one of the
                supported names.
        """
        p_left, p_top, p_right, p_bottom = (pred[:, k] for k in range(4))
        t_left, t_top, t_right, t_bottom = (target[:, k] for k in range(4))

        target_area = (t_left + t_right) * (t_top + t_bottom)
        pred_area = (p_left + p_right) * (p_top + p_bottom)

        # Overlap extents along each axis.
        inter_w = torch.min(p_left, t_left) + torch.min(p_right, t_right)
        inter_h = torch.min(p_bottom, t_bottom) + torch.min(p_top, t_top)

        # Extents of the smallest box enclosing both (used by gIoU).
        hull_w = torch.max(p_left, t_left) + torch.max(p_right, t_right)
        hull_h = torch.max(p_bottom, t_bottom) + torch.max(p_top, t_top)

        hull_area = hull_w * hull_h
        inter_area = inter_w * inter_h
        union_area = target_area + pred_area - inter_area

        # The +1 terms smooth the ratio.
        ious = (inter_area + 1.0) / (union_area + 1.0)
        gious = ious - (hull_area - union_area) / hull_area

        kind = self.loc_loss_type
        if kind == 'iou':
            per_box = -torch.log(ious)
        elif kind == 'linear_iou':
            per_box = 1 - ious
        elif kind == 'giou':
            per_box = 1 - gious
        else:
            raise NotImplementedError

        if weight is None:
            return per_box.sum()
        return (per_box * weight).sum()
================================================
FILE: fcos/layers/ml_nms.py
================================================
from detectron2.layers import batched_nms
def ml_nms(boxlist, nms_thresh, max_proposals=-1,
           score_field="scores", label_field="labels"):
    """
    Run class-aware (multi-label) non-maximum suppression on a set of
    predictions.

    Args:
        boxlist: object exposing ``pred_boxes.tensor``, ``scores`` and
            ``pred_classes`` and supporting indexing with the kept indices.
        nms_thresh (float): overlap threshold; when <= 0 NMS is skipped and
            ``boxlist`` is returned unchanged.
        max_proposals (int): if > 0, then only the first max_proposals kept
            indices are retained after non-maximum suppression
        score_field (str): currently unused; scores are always read from
            ``boxlist.scores``.
        label_field (str): currently unused; labels are always read from
            ``boxlist.pred_classes``.

    Returns:
        ``boxlist`` restricted to the surviving entries.
    """
    if nms_thresh <= 0:
        return boxlist

    keep = batched_nms(
        boxlist.pred_boxes.tensor,
        boxlist.scores,
        boxlist.pred_classes,
        nms_thresh,
    )
    if max_proposals > 0:
        keep = keep[:max_proposals]
    return boxlist[keep]
================================================
FILE: fcos/modeling/__init__.py
================================================
from .fcos import FCOS
from .backbone import build_fcos_resnet_fpn_backbone
from .one_stage_detector import OneStageDetector
# Names that must never be re-exported from this package.
_EXCLUDE = {"torch", "ShapeSpec"}
# Export every public (non-underscore) name bound so far, minus the exclusions.
__all__ = [
    name for name in globals()
    if not (name.startswith("_") or name in _EXCLUDE)
]
================================================
FILE: fcos/modeling/backbone/__init__.py
================================================
from .fpn import build_fcos_resnet_fpn_backbone
from .vovnet import build_vovnet_fpn_backbone, build_vovnet_backbone
================================================
FILE: fcos/modeling/backbone/fpn.py
================================================
from torch import nn
import torch.nn.functional as F
import fvcore.nn.weight_init as weight_init
from detectron2.modeling.backbone import FPN, build_resnet_backbone
from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from .mobilenet import build_mnv2_backbone
class LastLevelP6P7(nn.Module):
    """
    Extra FPN levels used by RetinaNet and FCOS: produces P6 and P7 from the
    feature named by ``in_features`` (C5 or P5).

    P6 is a stride-2 3x3 convolution of the input; P7 is a stride-2 3x3
    convolution of ReLU(P6).
    """

    def __init__(self, in_channels, out_channels, in_features="res5"):
        super().__init__()
        self.num_levels = 2
        self.in_feature = in_features
        self.p6 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        self.p7 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
        weight_init.c2_xavier_fill(self.p6)
        weight_init.c2_xavier_fill(self.p7)

    def forward(self, x):
        """Return ``[P6, P7]`` computed from ``x``."""
        level6 = self.p6(x)
        level7 = self.p7(F.relu(level6))
        return [level6, level7]
class LastLevelP6(nn.Module):
    """
    Extra FPN level used by FCOS: produces P6 as a stride-2 3x3 convolution of
    the feature named by ``in_features``.
    """

    def __init__(self, in_channels, out_channels, in_features="res5"):
        super().__init__()
        self.num_levels = 1
        self.in_feature = in_features
        self.p6 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        weight_init.c2_xavier_fill(self.p6)

    def forward(self, x):
        """Return ``[P6]`` computed from ``x``."""
        return [self.p6(x)]
@BACKBONE_REGISTRY.register()
def build_fcos_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build an FPN on top of a ResNet (or MobileNetV2, when ``cfg.MODEL.MOBILENET``
    is set) bottom-up network, with the extra top levels used by FCOS.

    Args:
        cfg: a detectron2 CfgNode
        input_shape: shape of the network input, forwarded to the bottom-up
            builder.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.

    Raises:
        ValueError: if ``cfg.MODEL.FCOS.TOP_LEVELS`` is not 0, 1 or 2.
    """
    if cfg.MODEL.MOBILENET:
        bottom_up = build_mnv2_backbone(cfg, input_shape)
    else:
        bottom_up = build_resnet_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    top_levels = cfg.MODEL.FCOS.TOP_LEVELS
    # The extra levels are computed from P5, which already has out_channels.
    in_channels_top = out_channels
    if top_levels == 2:
        top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
    elif top_levels == 1:
        top_block = LastLevelP6(in_channels_top, out_channels, "p5")
    elif top_levels == 0:
        top_block = None
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "cfg.MODEL.FCOS.TOP_LEVELS must be 0, 1 or 2, got {!r}".format(top_levels)
        )

    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=top_block,
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
================================================
FILE: fcos/modeling/backbone/mobilenet.py
================================================
# taken from https://github.com/tonylins/pytorch-mobilenet-v2/
# Published by Ji Lin, tonylins
# licensed under the Apache License, Version 2.0, January 2004
from torch import nn
from torch.nn import BatchNorm2d
#from detectron2.layers.batch_norm import NaiveSyncBatchNorm as BatchNorm2d
from detectron2.layers import Conv2d
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from detectron2.modeling.backbone import Backbone
def conv_bn(inp, oup, stride):
    """Return a 3x3 conv (padding 1, no bias) -> BatchNorm -> ReLU6 stack."""
    layers = [
        Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False),
        BatchNorm2d(oup),
        nn.ReLU6(inplace=True),
    ]
    return nn.Sequential(*layers)
def conv_1x1_bn(inp, oup):
    """Return a 1x1 conv (stride 1, no bias) -> BatchNorm -> ReLU6 stack."""
    layers = [
        Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False),
        BatchNorm2d(oup),
        nn.ReLU6(inplace=True),
    ]
    return nn.Sequential(*layers)
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted-residual block.

    Layout: optional 1x1 expansion (skipped when ``expand_ratio == 1``),
    3x3 depthwise convolution, then a linear 1x1 projection. The input is
    added back to the output when ``stride == 1`` and ``inp == oup``.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(round(inp * expand_ratio))
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pw: expand the channel count before the depthwise conv
            layers += [
                Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        layers += [
            # dw
            Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear
            Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            BatchNorm2d(oup),
        ]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block, adding the skip connection when enabled."""
        out = self.conv(x)
        if self.use_res_connect:
            return x + out
        return out
class MobileNetV2(Backbone):
    """
    MobileNetV2 feature extractor returning four intermediate feature maps
    under the names ``res2`` ... ``res5``.

    Should freeze bn
    """

    def __init__(self, cfg, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        # Each row: expansion ratio t, output channels c, repeats n, stride s
        # (the stride applies to the first repeat only).
        stage_settings = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # building first layer
        assert input_size % 32 == 0
        channels_in = int(32 * width_mult)
        # Indices into self.features whose outputs are returned by forward().
        self.return_features_indices = [3, 6, 13, 17]
        self.return_features_num_channels = []
        self.features = nn.ModuleList([conv_bn(3, channels_in, 2)])

        # building inverted residual blocks
        for t, c, n, s in stage_settings:
            channels_out = int(c * width_mult)
            for repeat in range(n):
                block_stride = s if repeat == 0 else 1
                self.features.append(
                    InvertedResidual(channels_in, channels_out, block_stride, expand_ratio=t)
                )
                channels_in = channels_out
            # Record the width of a stage whose last block is a returned feature.
            if len(self.features) - 1 in self.return_features_indices:
                self.return_features_num_channels.append(channels_out)

        self._initialize_weights()
        self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_AT)

    def _freeze_backbone(self, freeze_at):
        """Disable gradients for the first ``freeze_at`` entries of ``self.features``."""
        for idx in range(freeze_at):
            for param in self.features[idx].parameters():
                param.requires_grad = False

    def forward(self, x):
        """Run the network and return ``{'res2': ..., 'res3': ..., ...}``."""
        selected = []
        for idx, layer in enumerate(self.features):
            x = layer(x)
            if idx in self.return_features_indices:
                selected.append(x)
        return {'res{}'.format(level + 2): feat for level, feat in enumerate(selected)}

    def _initialize_weights(self):
        """Initialise conv, batch-norm and linear layers in place."""
        for module in self.modules():
            if isinstance(module, Conv2d):
                fan = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, (2. / fan) ** 0.5)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear):
                module.weight.data.normal_(0, 0.01)
                module.bias.data.zero_()
@BACKBONE_REGISTRY.register()
def build_mnv2_backbone(cfg, input_shape):
    """
    Create a MobileNetV2 instance from config.

    The output feature names are taken from ``cfg.MODEL.RESNETS.OUT_FEATURES``;
    ``input_shape`` is accepted for registry compatibility and not used.

    Returns:
        MobileNetV2: a :class:`MobileNetV2` instance with the backbone
        metadata (feature names, channels, strides) filled in.
    """
    model = MobileNetV2(cfg)
    model._out_features = cfg.MODEL.RESNETS.OUT_FEATURES
    model._out_feature_channels = {
        "res2": 24,
        "res3": 32,
        "res4": 96,
        "res5": 320,
    }
    model._out_feature_strides = {
        "res2": 4,
        "res3": 8,
        "res4": 16,
        "res5": 32,
    }
    return model
================================================
FILE: fcos/modeling/backbone/vovnet.py
================================================
# Copyright (c) Youngwan Lee (ETRI) All Rights Reserved.
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
import fvcore.nn.weight_init as weight_init
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from detectron2.modeling.backbone.fpn import FPN
from detectron2.layers import (
Conv2d,
DeformConv,
FrozenBatchNorm2d,
ShapeSpec,
get_norm,
)
from .fpn import LastLevelP6, LastLevelP6P7
# Public names of this module.
# NOTE(review): build_fcos_vovnet_fpn_backbone is defined below but not listed
# here -- confirm whether that omission is intentional.
__all__ = [
    "VoVNet",
    "build_vovnet_backbone",
    "build_vovnet_fpn_backbone"
]
# Normalisation spec passed to get_norm() by conv3x3/conv1x1. It is a module
# global that VoVNet.__init__ overwrites with cfg.MODEL.VOVNET.NORM.
_NORM = False
# Architecture specs. For each variant:
#   stage_conv_ch   - per-stage channel width of the 3x3 convs inside a block
#   stage_out_ch    - per-stage channel width after the 1x1 aggregation conv
#   layer_per_block - number of 3x3 convs in each block
#   block_per_stage - number of blocks in each of the four stages
#   eSE             - flag forwarded to the stages as ``SE``
VoVNet19_eSE = {
    'stage_conv_ch': [128, 160, 192, 224],
    'stage_out_ch': [256, 512, 768, 1024],
    'layer_per_block': 3,
    'block_per_stage': [1, 1, 1, 1],
    'eSE' : True
}
VoVNet39_eSE = {
    'stage_conv_ch': [128, 160, 192, 224],
    'stage_out_ch': [256, 512, 768, 1024],
    'layer_per_block': 5,
    'block_per_stage': [1, 1, 2, 2],
    'eSE' : True
}
VoVNet57_eSE = {
    'stage_conv_ch': [128, 160, 192, 224],
    'stage_out_ch': [256, 512, 768, 1024],
    'layer_per_block': 5,
    'block_per_stage': [1, 1, 4, 3],
    'eSE' : True
}
VoVNet99_eSE = {
    'stage_conv_ch': [128, 160, 192, 224],
    'stage_out_ch': [256, 512, 768, 1024],
    'layer_per_block': 5,
    'block_per_stage': [1, 3, 9, 3],
    'eSE' : True
}
# Lookup table keyed by cfg.MODEL.VOVNET.CONV_BODY.
_STAGE_SPECS = {
    "V-19-eSE": VoVNet19_eSE,
    "V-39-eSE": VoVNet39_eSE,
    "V-57-eSE": VoVNet57_eSE,
    "V-99-eSE": VoVNet99_eSE
}
def conv3x3(in_channels, out_channels, module_name, postfix,
            stride=1, groups=1, kernel_size=3, padding=1):
    """3x3 convolution with padding, followed by normalisation and ReLU.

    Returns a list of ``(name, module)`` pairs (conv, norm, relu) suitable for
    building an ``OrderedDict``; names are prefixed with
    ``{module_name}_{postfix}``. The norm layer is chosen by the module-level
    ``_NORM`` setting.
    """
    prefix = f'{module_name}_{postfix}'
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        groups=groups,
        bias=False,
    )
    norm = get_norm(_NORM, out_channels)
    relu = nn.ReLU(inplace=True)
    return [
        (f'{prefix}/conv', conv),
        (f'{prefix}/norm', norm),
        (f'{prefix}/relu', relu),
    ]
def conv1x1(in_channels, out_channels, module_name, postfix,
            stride=1, groups=1, kernel_size=1, padding=0):
    """1x1 convolution (no padding by default), followed by normalisation and ReLU.

    Returns a list of ``(name, module)`` pairs (conv, norm, relu) suitable for
    building an ``OrderedDict``; names are prefixed with
    ``{module_name}_{postfix}``. The norm layer is chosen by the module-level
    ``_NORM`` setting.
    """
    prefix = f'{module_name}_{postfix}'
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        groups=groups,
        bias=False,
    )
    norm = get_norm(_NORM, out_channels)
    relu = nn.ReLU(inplace=True)
    return [
        (f'{prefix}/conv', conv),
        (f'{prefix}/norm', norm),
        (f'{prefix}/relu', relu),
    ]
class Hsigmoid(nn.Module):
    """Hard sigmoid: ``relu6(x + 3) / 6``."""

    def __init__(self, inplace=True):
        super(Hsigmoid, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        # ``inplace`` acts on the temporary ``x + 3``, so the caller's tensor
        # is never modified.
        shifted = x + 3.
        clipped = F.relu6(shifted, inplace=self.inplace)
        return clipped / 6.
class eSEModule(nn.Module):
    """Channel attention: global average pool -> 1x1 conv -> hard sigmoid,
    used to rescale the input channel-wise.

    ``reduction`` is accepted but not used: the 1x1 conv keeps the channel
    count unchanged.
    """

    def __init__(self, channel, reduction=4):
        super(eSEModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
        self.hsigmoid = Hsigmoid()

    def forward(self, x):
        # Per-channel gate computed from the spatially pooled features.
        gate = self.hsigmoid(self.fc(self.avg_pool(x)))
        return x * gate
class _OSA_module(nn.Module):
def __init__(self,
in_ch,
stage_ch,
concat_ch,
layer_per_block,
module_name,
SE=False,
identity=False):
super(_OSA_module, self).__init__()
self.identity = identity
self.layers = nn.ModuleList()
in_channel = in_ch
for i in range(layer_per_block):
self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i))))
in_channel = stage_ch
# feature aggregation
in_channel = in_ch + layer_per_block * stage_ch
self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, 'concat')))
self.ese = eSEModule(concat_ch)
def forward(self, x):
identity_feat = x
output = []
output.append(x)
for layer in self.layers:
x = layer(x)
output.append(x)
x = torch.cat(output, dim=1)
xt = self.concat(x)
xt = self.ese(xt)
if self.identity:
xt = xt + identity_feat
return xt
class _OSA_stage(nn.Sequential):
    """A VoVNet stage: optional stride-2 max pooling followed by
    ``block_per_stage`` OSA blocks.

    Every stage except stage 2 starts with the pooling layer. The first block
    maps ``in_ch`` to ``concat_ch``; the remaining blocks keep ``concat_ch``
    and use an identity (residual) connection. Blocks are registered under the
    names ``OSA{stage_num}_{k}`` with ``k`` starting at 1.
    """

    def __init__(self,
                 in_ch,
                 stage_ch,
                 concat_ch,
                 block_per_stage,
                 layer_per_block,
                 stage_num,
                 SE=False):
        super(_OSA_stage, self).__init__()

        if stage_num != 2:
            self.add_module('Pooling', nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))

        # In multi-block stages the SE flag is cleared before the first block.
        if block_per_stage != 1:
            SE = False
        first_name = f'OSA{stage_num}_1'
        self.add_module(
            first_name,
            _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, first_name, SE),
        )

        for block_idx in range(2, block_per_stage + 1):
            if block_idx != block_per_stage:  # not the last block
                SE = False
            block_name = f'OSA{stage_num}_{block_idx}'
            self.add_module(
                block_name,
                _OSA_module(concat_ch,
                            stage_ch,
                            concat_ch,
                            layer_per_block,
                            block_name,
                            SE,
                            identity=True),
            )
class VoVNet(Backbone):
    """VoVNet backbone: a three-conv stem followed by four OSA stages
    (``stage2`` ... ``stage5``).

    The architecture variant is selected by ``cfg.MODEL.VOVNET.CONV_BODY`` and
    the normalisation layer by ``cfg.MODEL.VOVNET.NORM``.
    """

    def __init__(self, cfg, input_ch, out_features=None):
        """
        Args:
            cfg: config node providing ``MODEL.VOVNET.NORM``,
                ``MODEL.VOVNET.CONV_BODY`` and ``MODEL.BACKBONE.FREEZE_AT``.
            input_ch(int) : the number of input channel
            out_features (list[str]): name of the layers whose outputs should
                be returned in forward. Can be anything in "stem", "stage2" ...
                ``forward`` and ``output_shape`` perform membership tests /
                iteration on it, so a real collection must be supplied.
        """
        super(VoVNet, self).__init__()

        # conv3x3 / conv1x1 read the normalisation type from this module global.
        global _NORM
        _NORM = cfg.MODEL.VOVNET.NORM

        stage_specs = _STAGE_SPECS[cfg.MODEL.VOVNET.CONV_BODY]

        config_stage_ch = stage_specs['stage_conv_ch']
        config_concat_ch = stage_specs['stage_out_ch']
        block_per_stage = stage_specs['block_per_stage']
        layer_per_block = stage_specs['layer_per_block']
        SE = stage_specs['eSE']

        self._out_features = out_features

        # Stem module
        stem = conv3x3(input_ch, 64, 'stem', '1', 2)
        stem += conv3x3(64, 64, 'stem', '2', 1)
        stem += conv3x3(64, 128, 'stem', '3', 2)
        self.add_module('stem', nn.Sequential((OrderedDict(stem))))
        current_stride = 4
        # stage2 has no pooling layer, so it shares the stem's stride.
        self._out_feature_strides = {"stem": current_stride, "stage2": current_stride}
        self._out_feature_channels = {"stem": 128}

        stem_out_ch = [128]
        in_ch_list = stem_out_ch + config_concat_ch[:-1]

        # OSA stages
        self.stage_names = []
        for i in range(4):  # num_stages
            name = 'stage%d' % (i + 2)  # stage 2 ... stage 5
            self.stage_names.append(name)
            self.add_module(name, _OSA_stage(in_ch_list[i],
                                             config_stage_ch[i],
                                             config_concat_ch[i],
                                             block_per_stage[i],
                                             layer_per_block,
                                             i + 2,
                                             SE))

            self._out_feature_channels[name] = config_concat_ch[i]
            if not i == 0:
                self._out_feature_strides[name] = current_stride = int(
                    current_stride * 2)

        # initialize weights
        self._initialize_weights()
        # Optionally freeze (requires_grad=False) parts of the backbone
        self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_AT)

    def _initialize_weights(self):
        """Kaiming-normal initialisation of every ``nn.Conv2d`` weight."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)

    def _freeze_backbone(self, freeze_at):
        """Freeze the stem and the first stages.

        Args:
            freeze_at (int): a negative value disables freezing entirely.
                Otherwise the affine parameters of every ``nn.BatchNorm2d`` are
                frozen, and the first ``freeze_at`` parts (index 0 is the stem,
                index k is ``stage{k+1}``) have all their parameters frozen.
        """
        if freeze_at < 0:
            return

        # freeze BN layers
        # The original code called ``freeze_bn_params``, which is neither
        # defined nor imported in this module (NameError as soon as an
        # nn.BatchNorm2d is present). Its presumed effect -- disabling
        # gradients of the BN parameters -- is inlined here.
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                for p in m.parameters():
                    p.requires_grad = False

        froze_any = False
        for stage_index in range(freeze_at):
            if stage_index == 0:
                m = self.stem  # stage 0 is the stem
            else:
                m = getattr(self, "stage" + str(stage_index + 1))
            for p in m.parameters():
                p.requires_grad = False
                froze_any = True

        # The conversion works on the whole network, so one call is enough.
        # NOTE(review): it is run only when at least one parameter was frozen,
        # on the assumption that the original call sat inside the parameter
        # loop -- confirm the intended behaviour for FREEZE_AT == 0.
        if froze_any:
            FrozenBatchNorm2d.convert_frozen_batchnorm(self)

    def forward(self, x):
        """Return a dict mapping each requested feature name to its tensor."""
        outputs = {}
        x = self.stem(x)
        if "stem" in self._out_features:
            outputs["stem"] = x
        for name in self.stage_names:
            x = getattr(self, name)(x)
            if name in self._out_features:
                outputs[name] = x

        return outputs

    def output_shape(self):
        """Return the ``ShapeSpec`` (channels, stride) of every requested feature."""
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }
@BACKBONE_REGISTRY.register()
def build_vovnet_backbone(cfg, input_shape):
    """
    Create a VoVNet instance from config.

    The number of input channels comes from ``input_shape.channels`` and the
    returned feature names from ``cfg.MODEL.VOVNET.OUT_FEATURES``.

    Returns:
        VoVNet: a :class:`VoVNet` instance.
    """
    return VoVNet(
        cfg,
        input_shape.channels,
        out_features=cfg.MODEL.VOVNET.OUT_FEATURES,
    )
@BACKBONE_REGISTRY.register()
def build_vovnet_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build an FPN on top of a VoVNet bottom-up network, with a max-pool top
    block.

    Args:
        cfg: a detectron2 CfgNode
        input_shape: shape of the network input, forwarded to the VoVNet
            builder.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    # LastLevelMaxPool is not imported at module level; without this import
    # the call below raises NameError.
    from detectron2.modeling.backbone.fpn import LastLevelMaxPool

    bottom_up = build_vovnet_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=LastLevelMaxPool(),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
@BACKBONE_REGISTRY.register()
def build_fcos_vovnet_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Build an FPN on top of a VoVNet bottom-up network, with the extra top
    levels used by FCOS.

    Args:
        cfg: a detectron2 CfgNode
        input_shape: shape of the network input, forwarded to the VoVNet
            builder.

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.

    Raises:
        ValueError: if ``cfg.MODEL.FCOS.TOP_LEVELS`` is not 0, 1 or 2.
    """
    bottom_up = build_vovnet_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    top_levels = cfg.MODEL.FCOS.TOP_LEVELS
    # The extra levels are computed from P5, which already has out_channels.
    in_channels_top = out_channels
    if top_levels == 2:
        top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
    elif top_levels == 1:
        top_block = LastLevelP6(in_channels_top, out_channels, "p5")
    elif top_levels == 0:
        top_block = None
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"cfg.MODEL.FCOS.TOP_LEVELS must be 0, 1 or 2, got {top_levels!r}"
        )

    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=top_block,
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
================================================
FILE: fcos/modeling/fcos/__init__.py
================================================
from .fcos import FCOS
================================================
FILE: fcos/modeling/fcos/fcos.py
================================================
import math
from typing import List, Dict
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.layers import ShapeSpec
from detectron2.modeling.proposal_generator.build import PROPOSAL_GENERATOR_REGISTRY
from fcos.layers import DFConv2d, IOULoss
from .fcos_outputs import FCOSOutputs
# Only the FCOS module is part of this module's public API.
__all__ = ["FCOS"]
# Large sentinel used as the upper bound of the last size-of-interest range.
INF = 100000000
class Scale(nn.Module):
    """Multiply the input by a single learnable scalar."""

    def __init__(self, init_value=1.0):
        super(Scale, self).__init__()
        initial = torch.FloatTensor([init_value])
        self.scale = nn.Parameter(initial)

    def forward(self, input):
        scaled = input * self.scale
        return scaled
@PROPOSAL_GENERATOR_REGISTRY.register()
class FCOS(nn.Module):
    """
    Implement FCOS (https://arxiv.org/abs/1904.01355).

    The module runs ``FCOSHead`` on the selected feature levels and hands the
    predictions (including the controller and mask outputs) to ``FCOSOutputs``,
    which computes the losses in training mode and the proposals otherwise.
    """

    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        super().__init__()
        fcos_cfg = cfg.MODEL.FCOS
        # fmt: off
        self.in_features          = fcos_cfg.IN_FEATURES
        self.fpn_strides          = fcos_cfg.FPN_STRIDES
        self.focal_loss_alpha     = fcos_cfg.LOSS_ALPHA
        self.focal_loss_gamma     = fcos_cfg.LOSS_GAMMA
        self.center_sample        = fcos_cfg.CENTER_SAMPLE
        self.strides              = fcos_cfg.FPN_STRIDES
        self.radius               = fcos_cfg.POS_RADIUS
        self.pre_nms_thresh_train = fcos_cfg.INFERENCE_TH_TRAIN
        self.pre_nms_thresh_test  = fcos_cfg.INFERENCE_TH_TEST
        self.pre_nms_topk_train   = fcos_cfg.PRE_NMS_TOPK_TRAIN
        self.pre_nms_topk_test    = fcos_cfg.PRE_NMS_TOPK_TEST
        self.nms_thresh           = fcos_cfg.NMS_TH
        self.post_nms_topk_train  = fcos_cfg.POST_NMS_TOPK_TRAIN
        self.post_nms_topk_test   = fcos_cfg.POST_NMS_TOPK_TEST
        self.thresh_with_ctr      = fcos_cfg.THRESH_WITH_CTR
        # fmt: on
        self.iou_loss = IOULoss(fcos_cfg.LOC_LOSS_TYPE)

        # generate sizes of interest: consecutive [lower, upper] ranges built
        # from the configured boundaries, starting at -1 and ending at INF
        bounds = [-1, *fcos_cfg.SIZES_OF_INTEREST, INF]
        self.sizes_of_interest = [
            [lower, upper] for lower, upper in zip(bounds[:-1], bounds[1:])
        ]

        self.fcos_head = FCOSHead(cfg, [input_shape[f] for f in self.in_features])

    def forward(self, images, features, gt_instances):
        """
        Arguments:
            images: images to be processed; passed through to ``FCOSOutputs``.
            features (dict): feature maps keyed by name; the entries named in
                ``self.in_features`` are used, in that order.
            gt_instances: ground truth passed through to ``FCOSOutputs``.

        Returns:
            A ``(proposals, losses)`` pair.
            During training it is ``(None, losses)``, where ``losses`` is the
            first element returned by ``FCOSOutputs.losses()``.
            During testing it is ``(proposals, {})``, where ``proposals`` is
            the result of ``FCOSOutputs.predict_proposals()``.
        """
        level_features = [features[name] for name in self.in_features]
        locations = self.compute_locations(level_features)
        (logits_pred, reg_pred, ctrness_pred,
         bbox_towers, controllers, masks) = self.fcos_head(level_features)

        # Pick the train- or test-time filtering settings.
        if self.training:
            pre_nms_thresh = self.pre_nms_thresh_train
            pre_nms_topk = self.pre_nms_topk_train
            post_nms_topk = self.post_nms_topk_train
        else:
            pre_nms_thresh = self.pre_nms_thresh_test
            pre_nms_topk = self.pre_nms_topk_test
            post_nms_topk = self.post_nms_topk_test

        outputs = FCOSOutputs(
            images,
            locations,
            logits_pred,
            reg_pred,
            ctrness_pred,
            self.focal_loss_alpha,
            self.focal_loss_gamma,
            self.iou_loss,
            self.center_sample,
            self.sizes_of_interest,
            self.strides,
            self.radius,
            self.fcos_head.num_classes,
            pre_nms_thresh,
            pre_nms_topk,
            self.nms_thresh,
            post_nms_topk,
            self.thresh_with_ctr,
            controllers,
            masks,
            gt_instances
        )

        if not self.training:
            return outputs.predict_proposals(), {}

        losses, _ = outputs.losses()
        return None, losses

    def compute_locations(self, features):
        """Return, for every feature level, the (x, y) image coordinates of its cells."""
        return [
            self.compute_locations_per_level(
                feature.size()[-2], feature.size()[-1],
                self.fpn_strides[level],
                feature.device
            )
            for level, feature in enumerate(features)
        ]

    def compute_locations_per_level(self, h, w, stride, device):
        """Return an (h*w, 2) tensor of (x, y) locations for one level.

        Each location is the cell's top-left corner scaled by ``stride`` and
        shifted by ``stride // 2``; rows vary slowest.
        """
        xs = torch.arange(
            0, w * stride, step=stride,
            dtype=torch.float32, device=device
        )
        ys = torch.arange(
            0, h * stride, step=stride,
            dtype=torch.float32, device=device
        )
        grid_y, grid_x = torch.meshgrid(ys, xs)
        flat_x = grid_x.reshape(-1)
        flat_y = grid_y.reshape(-1)
        return torch.stack((flat_x, flat_y), dim=1) + stride // 2
class FCOSHead(nn.Module):
    """Prediction head shared across feature levels.

    It owns four conv towers (``cls``, ``bbox``, ``share``, ``mask``) and the
    output convolutions for class logits, box regression, centerness, the
    per-location controller parameters and the mask features.
    """

    def __init__(self, cfg, input_shape: List[ShapeSpec]):
        """
        Arguments:
            cfg: config node; the ``MODEL.FCOS`` section is read.
            input_shape: shapes of the input feature levels; all levels must
                have the same number of channels.
        """
        super().__init__()
        fcos_cfg = cfg.MODEL.FCOS
        # TODO: Implement the sigmoid version first.
        self.num_classes = fcos_cfg.NUM_CLASSES
        self.fpn_strides = fcos_cfg.FPN_STRIDES
        # tower name -> (number of convs, use deformable convs)
        head_configs = {
            "cls": (fcos_cfg.NUM_CLS_CONVS, False),
            "bbox": (fcos_cfg.NUM_BOX_CONVS, fcos_cfg.USE_DEFORMABLE),
            "share": (fcos_cfg.NUM_SHARE_CONVS, fcos_cfg.USE_DEFORMABLE),
            "mask": (8, False),
        }
        norm = None if fcos_cfg.NORM == "none" else fcos_cfg.NORM

        channel_counts = [s.channels for s in input_shape]
        assert len(set(channel_counts)) == 1, "Each level must have the same channel!"
        in_channels = channel_counts[0]

        for head_name, (num_convs, use_deformable) in head_configs.items():
            conv_func = DFConv2d if use_deformable else nn.Conv2d
            tower = []
            for _ in range(num_convs):
                tower.append(conv_func(
                    in_channels, in_channels,
                    kernel_size=3, stride=1,
                    padding=1, bias=True
                ))
                if norm == "GN":
                    tower.append(nn.GroupNorm(32, in_channels))
                tower.append(nn.ReLU())
            self.add_module('{}_tower'.format(head_name), nn.Sequential(*tower))

        def predictor(out_channels):
            # Every output layer is a 3x3, stride-1, padding-1 convolution.
            return nn.Conv2d(
                in_channels, out_channels,
                kernel_size=3, stride=1, padding=1
            )

        self.cls_logits = predictor(self.num_classes)
        self.bbox_pred = predictor(4)
        self.ctrness = predictor(1)
        # presumably 169 is the parameter count of the dynamic mask head -- TODO confirm
        self.controller = predictor(169)
        self.mask = predictor(8)

        if fcos_cfg.USE_SCALE:
            self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in self.fpn_strides])
        else:
            self.scales = None

        # Note: mask_tower is not in this list and keeps its default init.
        to_init = (
            self.cls_tower, self.bbox_tower,
            self.share_tower, self.cls_logits,
            self.bbox_pred, self.ctrness,
            self.controller, self.mask,
        )
        for group in to_init:
            for layer in group.modules():
                if isinstance(layer, nn.Conv2d):
                    torch.nn.init.normal_(layer.weight, std=0.01)
                    torch.nn.init.constant_(layer.bias, 0)

        # initialize the bias for focal loss
        prior_prob = fcos_cfg.PRIOR_PROB
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        torch.nn.init.constant_(self.cls_logits.bias, bias_value)

    def forward(self, x):
        """
        Arguments:
            x (list[Tensor]): one feature map per level.

        Returns:
            ``(logits, bbox_reg, ctrness, bbox_towers, controllers, masks)``.
            The first three and ``controllers`` are per-level lists;
            ``bbox_towers`` is always an empty list; ``masks`` is computed from
            the first level only.
        """
        logits = []
        bbox_reg = []
        ctrness = []
        bbox_towers = []
        controllers = []

        for level, feature in enumerate(x):
            shared = self.share_tower(feature)
            cls_feat = self.cls_tower(shared)
            box_feat = self.bbox_tower(shared)

            logits.append(self.cls_logits(cls_feat))
            ctrness.append(self.ctrness(box_feat))
            controllers.append(self.controller(box_feat))

            reg = self.bbox_pred(box_feat)
            if self.scales is not None:
                reg = self.scales[level](reg)
            # Note that we use relu, as in the improved FCOS, instead of exp.
            bbox_reg.append(F.relu(reg))

        masks = self.mask(self.mask_tower(x[0]))
        return logits, bbox_reg, ctrness, bbox_towers, controllers, masks
================================================
FILE: fcos/modeling/fcos/fcos_outputs.py
================================================
import logging
import torch
import torch.nn.functional as F
from detectron2.layers import cat
from detectron2.structures import Instances, Boxes
from fcos.utils.comm import get_world_size
from fvcore.nn import sigmoid_focal_loss_jit
from fcos.utils.comm import reduce_sum
from fcos.layers import ml_nms
#from detectron2.layers import interpolate
logger = logging.getLogger(__name__)
INF = 100000000
"""
Shape shorthand in this module:
N: number of images in the minibatch
L: number of feature maps per image on which RPN is run
Hi, Wi: height and width of the i-th feature map
4: size of the box parameterization
Naming convention:
labels: refers to the ground-truth class of a position.
reg_targets: refers to the 4-d (left, top, right, bottom) distances that parameterize the ground-truth box.
logits_pred: predicted classification scores in [-inf, +inf];
reg_pred: the predicted (left, top, right, bottom), corresponding to reg_targets
ctrness_pred: predicted centerness scores
"""
def aligned_bilinear(tensor, factor):
    """
    Upsample a 4-D tensor by an integer ``factor`` along its last two dims.

    The input is padded by one replicated row/column at the bottom/right,
    resized bilinearly with ``align_corners=True`` to
    ``(factor * h + 1, factor * w + 1)``, padded by ``factor // 2`` replicated
    rows/columns at the top/left and finally cropped to
    ``(factor * h, factor * w)``.

    Args:
        tensor (Tensor): 4-D input.
        factor (int): upsampling factor, >= 1. Integral floats (e.g. ``8.0``)
            are accepted and converted to ``int``.

    Returns:
        Tensor: the input itself when ``factor == 1``, otherwise the
        upsampled tensor.
    """
    assert tensor.dim() == 4
    assert factor >= 1
    assert int(factor) == factor
    # An integral float passes the checks above, but output sizes, pad widths
    # and slice bounds below must be real ints.
    factor = int(factor)

    if factor == 1:
        return tensor

    h, w = tensor.size()[2:]
    tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
    oh = factor * h + 1
    ow = factor * w + 1
    tensor = F.interpolate(
        tensor, size=(oh, ow),
        mode='bilinear',
        align_corners=True
    )
    tensor = F.pad(
        tensor, pad=(factor // 2, 0, factor // 2, 0),
        mode="replicate"
    )

    return tensor[:, :, :oh - 1, :ow - 1]
def compute_ctrness_targets(reg_targets):
    """
    Compute centerness targets from (left, top, right, bottom) distances.

    For every row the result is
    ``sqrt(min(l, r) / max(l, r) * min(t, b) / max(t, b))``.

    Args:
        reg_targets (Tensor): rows of 4 values indexed as l, t, r, b.

    Returns:
        Tensor: one value per row; an empty tensor for empty input.
    """
    num_targets = len(reg_targets)
    if num_targets == 0:
        return reg_targets.new_zeros(num_targets)

    horizontal = reg_targets[:, [0, 2]]
    vertical = reg_targets[:, [1, 3]]

    h_min, _ = horizontal.min(dim=-1)
    h_max, _ = horizontal.max(dim=-1)
    v_min, _ = vertical.min(dim=-1)
    v_max, _ = vertical.max(dim=-1)

    squared = (h_min / h_max) * (v_min / v_max)
    return torch.sqrt(squared)
class FCOSOutputs(object):
    """
    Holds the per-level head predictions of one batch and turns them into
    training losses (:meth:`losses`) or into detections with instance masks
    (:meth:`predict_proposals`).
    """

    # Length of one controller vector. The dynamic mask head slices it as
    # 8x10 weights + 8 biases, 8x8 weights + 8 biases, 1x8 weights + 1 bias
    # (80 + 8 + 64 + 8 + 8 + 1).
    _NUM_CONTROLLER_PARAMS = 169

    def __init__(
            self,
            images,
            locations,
            logits_pred,
            reg_pred,
            ctrness_pred,
            focal_loss_alpha,
            focal_loss_gamma,
            iou_loss,
            center_sample,
            sizes_of_interest,
            strides,
            radius,
            num_classes,
            pre_nms_thresh,
            pre_nms_top_n,
            nms_thresh,
            fpn_post_nms_top_n,
            thresh_with_ctr,
            controllers,
            masks,
            gt_instances=None,
    ):
        """Store predictions, ground truth and hyper-parameters unchanged."""
        self.logits_pred = logits_pred
        self.reg_pred = reg_pred
        self.ctrness_pred = ctrness_pred
        self.locations = locations

        self.gt_instances = gt_instances
        self.num_feature_maps = len(logits_pred)
        self.num_images = len(images)
        self.image_sizes = images.image_sizes
        self.focal_loss_alpha = focal_loss_alpha
        self.focal_loss_gamma = focal_loss_gamma
        self.iou_loss = iou_loss
        self.center_sample = center_sample
        self.sizes_of_interest = sizes_of_interest
        self.strides = strides
        self.radius = radius
        self.num_classes = num_classes
        self.pre_nms_thresh = pre_nms_thresh
        self.pre_nms_top_n = pre_nms_top_n
        self.nms_thresh = nms_thresh
        self.fpn_post_nms_top_n = fpn_post_nms_top_n
        self.thresh_with_ctr = thresh_with_ctr
        self.controllers = controllers
        self.masks = masks

    def _transpose(self, training_targets, num_loc_list):
        """
        Convert image-first training targets to level-first ones.

        Args:
            training_targets (list[Tensor]): one tensor per image whose first
                dimension concatenates all levels.
            num_loc_list (list[int]): number of locations of every level.

        Returns:
            list[Tensor]: one tensor per level, concatenating all images.
        """
        per_image_splits = [
            torch.split(targets_per_im, num_loc_list, dim=0)
            for targets_per_im in training_targets
        ]
        return [
            torch.cat(targets_per_level, dim=0)
            for targets_per_level in zip(*per_image_splits)
        ]

    def _get_ground_truth(self):
        """
        Build level-first training targets for all locations.

        Returns:
            dict[str, list[Tensor]]: keys ``labels``, ``reg_targets``,
            ``matched_idxes`` and ``im_idxes``; ``reg_targets`` are divided by
            the stride of their level.
        """
        num_loc_list = [len(loc) for loc in self.locations]
        # compute_targets_for_locations reads this when center sampling.
        self.num_loc_list = num_loc_list

        # compute locations to size ranges
        loc_to_size_range = []
        for l, loc_per_level in enumerate(self.locations):
            loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
            loc_to_size_range.append(
                loc_to_size_range_per_level[None].expand(num_loc_list[l], -1)
            )

        loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
        locations = torch.cat(self.locations, dim=0)

        training_targets = self.compute_targets_for_locations(
            locations, self.gt_instances, loc_to_size_range
        )

        # transpose im first training_targets to level first ones
        training_targets = {
            k: self._transpose(v, num_loc_list) for k, v in training_targets.items()
        }

        # we normalize reg_targets by FPN's strides here
        reg_targets = training_targets["reg_targets"]
        for l in range(len(reg_targets)):
            reg_targets[l] = reg_targets[l] / float(self.strides[l])

        return training_targets

    def get_sample_region(self, gt, strides, num_loc_list, loc_xs, loc_ys, radius=1):
        """
        Mark the locations that fall inside the center region of each box.

        The center region of a box is the square of half-size
        ``stride * radius`` around the box center, clipped to the box.

        Args:
            gt (Tensor): (num_gts, 4) boxes.
            strides (list): stride of every level.
            num_loc_list (list[int]): number of locations of every level.
            loc_xs, loc_ys (Tensor): coordinates of all K locations.
            radius: multiplier applied to the stride.

        Returns:
            Tensor: (K, num_gts) boolean mask. When there is no box, or the
            x-centers of the first box sum to zero, an all-zero uint8 tensor
            of ``loc_xs.shape`` is returned instead.
        """
        num_gts = gt.shape[0]
        K = len(loc_xs)
        gt = gt[None].expand(K, num_gts, 4)
        center_x = (gt[..., 0] + gt[..., 2]) / 2
        center_y = (gt[..., 1] + gt[..., 3]) / 2
        center_gt = gt.new_zeros(gt.shape)
        # no gt
        if center_x.numel() == 0 or center_x[..., 0].sum() == 0:
            return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8)
        beg = 0
        for level, num_loc in enumerate(num_loc_list):
            end = beg + num_loc
            stride = strides[level] * radius
            xmin = center_x[beg:end] - stride
            ymin = center_y[beg:end] - stride
            xmax = center_x[beg:end] + stride
            ymax = center_y[beg:end] + stride
            # limit sample region in gt
            center_gt[beg:end, :, 0] = torch.where(xmin > gt[beg:end, :, 0], xmin, gt[beg:end, :, 0])
            center_gt[beg:end, :, 1] = torch.where(ymin > gt[beg:end, :, 1], ymin, gt[beg:end, :, 1])
            center_gt[beg:end, :, 2] = torch.where(xmax > gt[beg:end, :, 2], gt[beg:end, :, 2], xmax)
            center_gt[beg:end, :, 3] = torch.where(ymax > gt[beg:end, :, 3], gt[beg:end, :, 3], ymax)
            beg = end
        left = loc_xs[:, None] - center_gt[..., 0]
        right = center_gt[..., 2] - loc_xs[:, None]
        top = loc_ys[:, None] - center_gt[..., 1]
        bottom = center_gt[..., 3] - loc_ys[:, None]
        center_bbox = torch.stack((left, top, right, bottom), -1)
        inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
        return inside_gt_bbox_mask

    def compute_targets_for_locations(self, locations, targets, size_ranges):
        """
        Assign a class label, regression target and matched GT index to every
        location of every image.

        Args:
            locations (Tensor): (K, 2) x/y coordinates of all locations.
            targets (list): per-image ground truth exposing ``gt_boxes`` and
                ``gt_classes``.
            size_ranges (Tensor): (K, 2) allowed [min, max] regression range
                of every location.

        Returns:
            dict[str, list[Tensor]]: ``labels``, ``reg_targets``,
            ``matched_idxes`` and ``im_idxes``. Every list has exactly one
            entry per image, including images without ground truth, so the
            lists stay aligned after :meth:`_transpose`.
        """
        labels = []
        reg_targets = []
        matched_idxes = []
        im_idxes = []
        xs, ys = locations[:, 0], locations[:, 1]

        for im_i in range(len(targets)):
            targets_per_im = targets[im_i]
            bboxes = targets_per_im.gt_boxes.tensor
            labels_per_im = targets_per_im.gt_classes

            # no gt: every location is background
            if bboxes.numel() == 0:
                num_locs = locations.size(0)
                labels.append(labels_per_im.new_zeros(num_locs) + self.num_classes)
                reg_targets.append(locations.new_zeros((num_locs, 4)))
                # Placeholders keep the four lists aligned per image. They are
                # never selected because all labels here are background.
                matched_idxes.append(
                    torch.zeros(num_locs, dtype=torch.long, device=locations.device)
                )
                im_idxes.append(
                    torch.full((num_locs,), im_i, dtype=torch.long, device=locations.device)
                )
                continue

            area = targets_per_im.gt_boxes.area()

            l = xs[:, None] - bboxes[:, 0][None]
            t = ys[:, None] - bboxes[:, 1][None]
            r = bboxes[:, 2][None] - xs[:, None]
            b = bboxes[:, 3][None] - ys[:, None]
            reg_targets_per_im = torch.stack([l, t, r, b], dim=2)

            if self.center_sample:
                is_in_boxes = self.get_sample_region(
                    bboxes, self.strides, self.num_loc_list,
                    xs, ys, radius=self.radius
                )
            else:
                is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0

            max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0]
            # limit the regression range for each location
            is_cared_in_the_level = \
                (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
                (max_reg_targets_per_im <= size_ranges[:, [1]])

            locations_to_gt_area = area[None].repeat(len(locations), 1)
            locations_to_gt_area[is_in_boxes == 0] = INF
            locations_to_gt_area[is_cared_in_the_level == 0] = INF

            # if there are still more than one objects for a location,
            # we choose the one with minimal area
            locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)

            reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds]
            labels_per_im = labels_per_im[locations_to_gt_inds]
            labels_per_im[locations_to_min_area == INF] = self.num_classes

            labels.append(labels_per_im)
            reg_targets.append(reg_targets_per_im)
            matched_idxes.append(locations_to_gt_inds)
            im_idxes.append(torch.full_like(locations_to_gt_inds, im_i))

        return {
            "labels": labels,
            "reg_targets": reg_targets,
            "matched_idxes": matched_idxes,
            "im_idxes": im_idxes,
        }

    def losses(self):
        """
        Return the losses from a set of FCOS predictions and their associated ground-truth.

        Returns:
            The value returned by :meth:`fcos_losses`: a tuple of
            (dict[loss name -> loss value], empty dict).
        """
        training_targets = self._get_ground_truth()
        labels = training_targets["labels"]
        reg_targets = training_targets["reg_targets"]
        matched_idxes = training_targets["matched_idxes"]
        im_idxes = training_targets["im_idxes"]

        # Collect all logits and regression predictions over feature maps
        # and images to arrive at the same shape as the labels and targets
        # The final ordering is L, N, H, W from slowest to fastest axis.
        logits_pred = cat(
            [
                # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
                x.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
                for x in self.logits_pred
            ], dim=0,)
        reg_pred = cat(
            [
                # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
                x.permute(0, 2, 3, 1).reshape(-1, 4)
                for x in self.reg_pred
            ], dim=0,)
        ctrness_pred = cat(
            [
                # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
                x.reshape(-1) for x in self.ctrness_pred
            ], dim=0,)

        labels = cat([x.reshape(-1) for x in labels], dim=0,)
        reg_targets = cat([x.reshape(-1, 4) for x in reg_targets], dim=0,)
        matched_idxes = cat([x.reshape(-1) for x in matched_idxes], dim=0,)
        im_idxes = cat([x.reshape(-1) for x in im_idxes], dim=0,)

        controllers_pred = cat(
            [
                x.permute(0, 2, 3, 1).reshape(-1, self._NUM_CONTROLLER_PARAMS)
                for x in self.controllers
            ], dim=0,)

        return self.fcos_losses(
            labels,
            reg_targets,
            logits_pred,
            reg_pred,
            ctrness_pred,
            controllers_pred,
            self.focal_loss_alpha,
            self.focal_loss_gamma,
            self.iou_loss,
            matched_idxes,
            im_idxes
        )

    def predict_proposals(self):
        """
        Decode detections on every level, merge them per image, apply NMS and
        attach instance masks.

        Returns:
            list[Instances]: one entry per image.
        """
        sampled_boxes = []

        bundle = (
            self.locations, self.logits_pred,
            self.reg_pred, self.ctrness_pred,
            self.strides
        )

        for i, (l, o, r, c, s) in enumerate(zip(*bundle)):
            # recall that during training, we normalize regression targets with FPN's stride.
            # we denormalize them here.
            r = r * s
            controller = self.controllers[i]
            sampled_boxes.append(
                self.forward_for_single_feature_map(
                    l, o, r, c, controller, self.image_sizes
                )
            )

        boxlists = list(zip(*sampled_boxes))
        boxlists = [Instances.cat(boxlist) for boxlist in boxlists]
        boxlists = self.select_over_all_levels(boxlists)
        # for CondInst
        boxlists = self.forward_for_mask(boxlists)
        return boxlists

    def _mask_feats_with_coords(self):
        """
        Return ``self.masks`` with two extra channels holding x and y
        coordinates scaled to [-1, 1], created on the device of ``self.masks``.
        """
        N, _, h, w = self.masks.shape
        device = self.masks.device
        grid_x = torch.arange(w, device=device).view(1, -1).float().repeat(h, 1) / (w - 1) * 2 - 1
        grid_y = torch.arange(h, device=device).view(-1, 1).float().repeat(1, w) / (h - 1) * 2 - 1
        x_map = grid_x.view(1, 1, h, w).repeat(N, 1, 1, 1)
        y_map = grid_y.view(1, 1, h, w).repeat(N, 1, 1, 1)
        return torch.cat((self.masks, x_map, y_map), dim=1)

    def _dynamic_mask_logits(self, mask_feat, controllers):
        """
        Apply the per-instance three-layer 1x1 conv mask head.

        Args:
            mask_feat (Tensor): (1, 10, h, w) features of one image (the first
                layer's weights have 10 input channels).
            controllers (Tensor): (n, 169) generated parameters, n >= 1.

        Returns:
            Tensor: (1, n, h, w) mask logits, one channel per instance.
        """
        ins_num = controllers.shape[0]
        weights1 = controllers[:, :80].reshape(-1, 8, 10).reshape(-1, 10).unsqueeze(-1).unsqueeze(-1)
        bias1 = controllers[:, 80:88].flatten()
        weights2 = controllers[:, 88:152].reshape(-1, 8, 8).reshape(-1, 8).unsqueeze(-1).unsqueeze(-1)
        bias2 = controllers[:, 152:160].flatten()
        weights3 = controllers[:, 160:168].unsqueeze(-1).unsqueeze(-1)
        bias3 = controllers[:, 168:169].flatten()

        # Layer 1 is a plain conv producing 8 channels per instance; layers 2
        # and 3 use one group per instance so instances do not mix.
        conv1 = F.conv2d(mask_feat, weights1, bias1).relu()
        conv2 = F.conv2d(conv1, weights2, bias2, groups=ins_num).relu()
        return F.conv2d(conv2, weights3, bias3, groups=ins_num)

    def forward_for_mask(self, boxlists):
        """
        Predict a mask for every detection and store it as ``pred_masks``.

        Args:
            boxlists (list[Instances]): per-image detections carrying a
                ``controllers`` field.

        Returns:
            list[Instances]: the same objects; ``pred_masks`` has one
            single-channel sigmoid map per instance, cropped to the image size.
        """
        N = self.masks.shape[0]
        masks_feat = self._mask_feats_with_coords()

        for im in range(N):
            boxlist = boxlists[im]
            input_h, input_w = boxlist.image_size
            ins_num = boxlist.controllers.shape[0]

            if ins_num == 0:
                # A grouped convolution needs at least one group; emit an
                # empty mask tensor for images without detections.
                boxlist.pred_masks = self.masks.new_zeros((0, 1, input_h, input_w))
                continue

            masks_per_image = self._dynamic_mask_logits(
                masks_feat[None, im], boxlist.controllers
            )
            masks = aligned_bilinear(masks_per_image, self.strides[0]).sigmoid()
            masks = masks[:, :, :input_h, :input_w].permute(1, 0, 2, 3)
            boxlist.pred_masks = masks
        return boxlists

    def forward_for_single_feature_map(
            self, locations, box_cls,
            reg_pred, ctrness, controller, image_sizes
    ):
        """
        Decode the detections of one feature level.

        Args:
            locations (Tensor): coordinates of the H*W locations of the level.
            box_cls (Tensor): (N, C, H, W) classification logits.
            reg_pred (Tensor): (N, 4, H, W) distances to the box sides.
            ctrness (Tensor): (N, 1, H, W) centerness logits.
            controller (Tensor): (N, 169, H, W) mask head parameters.
            image_sizes (list): per-image sizes passed to ``Instances``.

        Returns:
            list[Instances]: one entry per image with ``pred_boxes``,
            ``scores``, ``pred_classes``, ``locations`` and ``controllers``.
        """
        N, C, H, W = box_cls.shape

        # put in the same format as locations
        box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1)
        box_cls = box_cls.reshape(N, -1, C).sigmoid()
        box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
        box_regression = box_regression.reshape(N, -1, 4)
        ctrness = ctrness.view(N, 1, H, W).permute(0, 2, 3, 1)
        ctrness = ctrness.reshape(N, -1).sigmoid()
        controller = controller.view(N, self._NUM_CONTROLLER_PARAMS, H, W).permute(0, 2, 3, 1)
        controller = controller.reshape(N, -1, self._NUM_CONTROLLER_PARAMS)

        # if self.thresh_with_ctr is True, we multiply the classification
        # scores with centerness scores before applying the threshold.
        if self.thresh_with_ctr:
            box_cls = box_cls * ctrness[:, :, None]
        candidate_inds = box_cls > self.pre_nms_thresh
        pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
        pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)

        if not self.thresh_with_ctr:
            box_cls = box_cls * ctrness[:, :, None]

        results = []
        for i in range(N):
            per_box_cls = box_cls[i]
            per_candidate_inds = candidate_inds[i]
            per_box_cls = per_box_cls[per_candidate_inds]

            per_candidate_nonzeros = per_candidate_inds.nonzero()
            per_box_loc = per_candidate_nonzeros[:, 0]
            per_class = per_candidate_nonzeros[:, 1]

            per_box_regression = box_regression[i]
            per_box_regression = per_box_regression[per_box_loc]
            per_locations = locations[per_box_loc]

            per_controller = controller[i]
            per_controller = per_controller[per_box_loc]

            per_pre_nms_top_n = pre_nms_top_n[i]

            if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
                per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)
                per_class = per_class[top_k_indices]
                per_box_regression = per_box_regression[top_k_indices]
                per_locations = per_locations[top_k_indices]
                per_controller = per_controller[top_k_indices]

            detections = torch.stack([
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ], dim=1)

            boxlist = Instances(image_sizes[i])
            boxlist.pred_boxes = Boxes(detections)
            boxlist.scores = torch.sqrt(per_box_cls)
            boxlist.pred_classes = per_class
            boxlist.locations = per_locations
            boxlist.controllers = per_controller

            results.append(boxlist)

        return results

    def select_over_all_levels(self, boxlists):
        """
        Apply multi-class NMS per image and keep at most
        ``fpn_post_nms_top_n`` detections (when that limit is positive).
        """
        num_images = len(boxlists)
        results = []
        for i in range(num_images):
            # multiclass nms
            result = ml_nms(boxlists[i], self.nms_thresh)
            number_of_detections = len(result)

            # Limit to max_per_image detections **over all classes**
            if number_of_detections > self.fpn_post_nms_top_n > 0:
                cls_scores = result.scores
                image_thresh, _ = torch.kthvalue(
                    cls_scores.cpu(),
                    number_of_detections - self.fpn_post_nms_top_n + 1
                )
                keep = cls_scores >= image_thresh.item()
                keep = torch.nonzero(keep).squeeze(1)
                result = result[keep]
            results.append(result)
        return results

    def prepare_masks(self, m_h, m_w, r_h, r_w, targets_masks):
        """
        Zero-pad every image's GT masks to a common ``(r_h, r_w)`` canvas.

        Args:
            m_h, m_w: unused; kept for interface compatibility.
            r_h, r_w (int): height and width of the padded masks.
            targets_masks (list[Tensor]): per-image (n, h, w) mask tensors.

        Returns:
            list[Tensor]: per-image (n, r_h, r_w) tensors; an empty tensor for
            images without masks.
        """
        masks = []
        for mask_t in targets_masks:
            if len(mask_t) == 0:
                masks.append(mask_t.new_tensor([]))
                continue
            n, h, w = mask_t.shape
            mask = mask_t.new_zeros((n, r_h, r_w))
            mask[:, :h, :w] = mask_t
            masks.append(mask)
        return masks

    def dice_loss(self, input, target):
        """
        Dice loss with squared terms in the denominator and smoothing of 1:
        ``1 - (2 * sum(i * t) + 1) / (sum(i^2) + sum(t^2) + 1)``.
        """
        smooth = 1.

        iflat = input.contiguous().view(-1)
        tflat = target.contiguous().view(-1)
        intersection = (iflat * tflat).sum()

        return 1 - ((2. * intersection + smooth) / ((iflat * iflat).sum() + (tflat * tflat).sum() + smooth))

    def fcos_losses(
            self,
            labels,
            reg_targets,
            logits_pred,
            reg_pred,
            ctrness_pred,
            controllers_pred,
            focal_loss_alpha,
            focal_loss_gamma,
            iou_loss,
            matched_idxes,
            im_idxes
    ):
        """
        Compute classification, box regression, centerness and mask losses.

        All tensor arguments are flattened over levels, images and locations
        and share the same first dimension.

        Returns:
            tuple: ``(losses, {})`` where ``losses`` maps ``loss_fcos_cls``,
            ``loss_fcos_loc``, ``loss_fcos_ctr`` and ``loss_mask`` to tensors.
        """
        num_classes = logits_pred.size(1)
        labels = labels.flatten()

        # Locations labelled ``num_classes`` are background.
        pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
        num_pos_local = pos_inds.numel()
        num_gpus = get_world_size()
        total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
        num_pos_avg = max(total_num_pos / num_gpus, 1.0)

        # prepare one_hot
        class_target = torch.zeros_like(logits_pred)
        class_target[pos_inds, labels[pos_inds]] = 1

        class_loss = sigmoid_focal_loss_jit(
            logits_pred,
            class_target,
            alpha=focal_loss_alpha,
            gamma=focal_loss_gamma,
            reduction="sum",
        ) / num_pos_avg

        reg_pred = reg_pred[pos_inds]
        reg_targets = reg_targets[pos_inds]
        ctrness_pred = ctrness_pred[pos_inds]
        controllers_pred = controllers_pred[pos_inds]
        matched_idxes = matched_idxes[pos_inds]
        im_idxes = im_idxes[pos_inds]

        ctrness_targets = compute_ctrness_targets(reg_targets)
        ctrness_targets_sum = ctrness_targets.sum()
        ctrness_norm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)

        reg_loss = iou_loss(
            reg_pred,
            reg_targets,
            ctrness_targets
        ) / ctrness_norm

        ctrness_loss = F.binary_cross_entropy_with_logits(
            ctrness_pred,
            ctrness_targets,
            reduction="sum"
        ) / num_pos_avg

        # for CondInst
        N, _, h, w = self.masks.shape
        masks_feat = self._mask_feats_with_coords()
        mask_stride = self.strides[0]
        r_h = int(h * mask_stride)
        r_w = int(w * mask_stride)
        targets_masks = [target_im.gt_masks.tensor for target_im in self.gt_instances]
        masks_t = self.prepare_masks(h, w, r_h, r_w, targets_masks)
        mask_loss = masks_feat[0].new_tensor(0.0)
        batch_ins = im_idxes.shape[0]

        # for each image
        for i in range(N):
            inds = (im_idxes == i).nonzero().flatten()
            ins_num = inds.shape[0]
            if ins_num == 0:
                continue
            masks_per_image = self._dynamic_mask_logits(
                masks_feat[None, i], controllers_pred[inds]
            )
            masks_per_image = aligned_bilinear(masks_per_image, mask_stride)[0].sigmoid()
            for j in range(ins_num):
                ind = inds[j]
                mask_gt = masks_t[i][matched_idxes[ind]].float()
                mask_pred = masks_per_image[j]
                mask_loss = mask_loss + self.dice_loss(mask_pred, mask_gt)

        if batch_ins > 0:
            mask_loss = mask_loss / batch_ins

        losses = {
            "loss_fcos_cls": class_loss,
            "loss_fcos_loc": reg_loss,
            "loss_fcos_ctr": ctrness_loss,
            "loss_mask": mask_loss
        }
        return losses, {}
================================================
FILE: fcos/modeling/one_stage_detector.py
================================================
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling import ProposalNetwork
@META_ARCH_REGISTRY.register()
class OneStageDetector(ProposalNetwork):
    """
    Same as :class:`detectron2.modeling.ProposalNetwork`, except that in
    inference mode each result is keyed by "instances" instead of "proposals".
    """

    def forward(self, batched_inputs):
        """
        Delegate to the parent ``forward``.

        In training mode the parent's return value is passed through
        unchanged. Otherwise every per-image result dict is rewritten to
        ``{"instances": <the value stored under "proposals">}``.
        """
        outputs = super().forward(batched_inputs)
        if self.training:
            return outputs
        return [{"instances": per_image["proposals"]} for per_image in outputs]
================================================
FILE: fcos/modeling/poolers.py
================================================
import sys
import torch
from detectron2.layers import cat
from detectron2.modeling.poolers import (
ROIPooler, convert_boxes_to_pooler_format, assign_boxes_to_levels
)
__all__ = ["TopPooler"]
def _box_max_size(boxes):
box = boxes.tensor
max_size = torch.max(box[:, 2] - box[:, 0], box[:, 3] - box[:, 1])
return max_size
def assign_boxes_to_levels_by_length(
        box_lists, min_level, max_level, canonical_box_size, canonical_level):
    """
    Map each box in `box_lists` to a feature map level index, using the
    longer side of the box as its size, and return the assignment vector.

    Args:
        box_lists (list[detectron2.structures.Boxes]): A list of N Boxes or N RotatedBoxes,
            where N is the number of images in the batch.
        min_level (int): Smallest feature map level index. The input is considered index 0,
            the output of stage 1 is index 1, and so on.
        max_level (int): Largest feature map level index.
        canonical_box_size (int): A canonical box size in pixels; it is
            compared against the longer side of each box.
        canonical_level (int): The feature map level index on which a canonically-sized box
            should be placed.

    Returns:
        A tensor of length M, where M is the total number of boxes aggregated over all
            N batch images. The memory layout corresponds to the concatenation of boxes
            from all images. Each element is the feature map index, as an offset from
            `min_level`, for the corresponding box (so value i means the box is at
            `min_level + i`).
    """
    # Tiny offset so that log2 never sees an exact zero.
    eps = sys.float_info.epsilon
    longest_sides = cat([_box_max_size(boxes) for boxes in box_lists])
    # Eqn.(1) in FPN paper
    relative_scale = torch.log2(longest_sides / canonical_box_size + eps)
    levels = torch.floor(canonical_level + relative_scale)
    levels = torch.clamp(levels, min=min_level, max=max_level)
    return levels.to(torch.int64) - min_level
class TopPooler(ROIPooler):
    """
    ROIPooler that can optionally assign boxes to levels by their longer
    side instead of the parent's default criterion. Used by top modules.
    """

    def __init__(self,
                 output_size,
                 scales,
                 sampling_ratio,
                 pooler_type,
                 canonical_box_size=224,
                 canonical_level=4,
                 assign_crit="area",):
        """
        Args:
            assign_crit (str): ``"length"`` selects
                :func:`assign_boxes_to_levels_by_length`; any other value
                selects detectron2's ``assign_boxes_to_levels``.
            All other arguments are forwarded to :class:`ROIPooler`.
        """
        super().__init__(
            output_size, scales, sampling_ratio, pooler_type,
            canonical_box_size=canonical_box_size,
            canonical_level=canonical_level,
        )
        self.assign_crit = assign_crit

    def forward(self, x, box_lists):
        """
        Args:
            x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
                used to construct this module.
            box_lists (list[Boxes] | list[RotatedBoxes]):
                A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
                The box coordinates are defined on the original image and
                will be scaled by the `scales` argument of :class:`ROIPooler`.

        Returns:
            Tensor:
                A tensor of shape (M, C, output_size, output_size) where M is the total number of
                boxes aggregated over all N batch images and C is the number of channels in `x`.
        """
        level_count = len(self.level_poolers)

        assert isinstance(x, list) and isinstance(
            box_lists, list
        ), "Arguments to pooler must be lists"
        assert (
            len(x) == level_count
        ), "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
            level_count, len(x)
        )
        assert len(box_lists) == x[0].size(
            0
        ), "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
            x[0].size(0), len(box_lists)
        )

        pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)

        # With a single level there is nothing to assign.
        if level_count == 1:
            return self.level_poolers[0](x[0], pooler_fmt_boxes)

        assign_method = (
            assign_boxes_to_levels_by_length
            if self.assign_crit == "length"
            else assign_boxes_to_levels
        )
        level_assignments = assign_method(
            box_lists, self.min_level, self.max_level,
            self.canonical_box_size, self.canonical_level)

        first_map = x[0]
        side = self.output_size[0]
        pooled = torch.zeros(
            (len(pooler_fmt_boxes), first_map.shape[1], side, side),
            dtype=first_map.dtype, device=first_map.device
        )

        # Pool each box on its assigned level and scatter it back into place.
        for level, (level_feature, level_pooler) in enumerate(zip(x, self.level_poolers)):
            box_inds = torch.nonzero(level_assignments == level).squeeze(1)
            pooled[box_inds] = level_pooler(level_feature, pooler_fmt_boxes[box_inds])

        return pooled
================================================
FILE: fcos/utils/comm.py
================================================
import torch.distributed as dist
from detectron2.utils.comm import get_world_size
def reduce_sum(tensor):
    """
    Sum ``tensor`` across all processes.

    With fewer than two processes the input object itself is returned.
    Otherwise a clone is all-reduced with ``SUM`` and returned, leaving the
    input untouched.
    """
    if get_world_size() >= 2:
        tensor = tensor.clone()
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    return tensor
================================================
FILE: fcos/utils/measures.py
================================================
# coding: utf-8
# Adapted from https://github.com/ShichenLiu/CondenseNet/blob/master/utils.py
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
import operator
from functools import reduce
def get_num_gen(gen):
    """Count the items produced by the iterable ``gen`` (consuming it)."""
    total = 0
    for _ in gen:
        total += 1
    return total
def is_pruned(layer):
    """Return True when ``layer`` exposes a ``mask`` attribute."""
    # hasattr only swallows AttributeError, the same exception the explicit
    # try/except form catches.
    return hasattr(layer, 'mask')
def is_leaf(model):
    """Return True when ``model.children()`` yields nothing."""
    child_count = get_num_gen(model.children())
    return not child_count
def get_layer_info(layer):
    """
    Return the type name of ``layer``: the text of ``str(layer)`` before the
    first "(", with surrounding whitespace removed.

    When the string contains no "(", the whole stripped string is returned.
    """
    # partition keeps the full text when "(" is absent, whereas slicing with
    # the -1 returned by str.find would drop the last character.
    type_name, _, _ = str(layer).partition('(')
    return type_name.strip()
def get_layer_param(model):
    """
    Return the total number of scalar entries over ``model.parameters()``,
    i.e. the sum of the products of each parameter's dimensions.
    """
    total = 0
    for param in model.parameters():
        total += reduce(operator.mul, param.size(), 1)
    return total
### The input batch size should be 1 to call this function
def _as_hw_pair(value):
    """Return ``value`` as a (height, width) pair; scalars are duplicated."""
    if isinstance(value, (tuple, list)):
        return (value[0], value[1]) if len(value) > 1 else (value[0], value[0])
    return (value, value)


def _window_out_size(in_size, kernel, stride, padding, dilation=1):
    """Output size along one axis of a sliding-window op (conv or pooling)."""
    effective_kernel = dilation * (kernel - 1) + 1
    return int((in_size + 2 * padding - effective_kernel) / stride + 1)


def measure_layer(layer, *args):
    """
    Add the operation and parameter counts of ``layer`` to the module-level
    ``count_ops`` / ``count_params`` counters, once per input in ``args``.

    The layer kind is derived from ``get_layer_info(layer)``.

    Args:
        layer: the layer being measured.
        *args: the inputs the layer is called with; each is expected to be an
            NCHW tensor for the spatial layer kinds.

    Raises:
        TypeError: if the layer kind is not handled.
    """
    global count_ops, count_params

    for x in args:
        delta_ops = 0
        delta_params = 0
        multi_add = 1
        type_name = get_layer_info(layer)

        ### ops_conv
        if type_name in ['Conv2d']:
            out_h = _window_out_size(x.size()[2], layer.kernel_size[0], layer.stride[0],
                                     layer.padding[0], layer.dilation[0])
            out_w = _window_out_size(x.size()[3], layer.kernel_size[1], layer.stride[1],
                                     layer.padding[1], layer.dilation[1])
            delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \
                layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
            delta_params = get_layer_param(layer)

        elif type_name in ['ConvTranspose2d']:
            _, _, in_h, in_w = x.size()
            out_h = int((in_h - 1) * layer.stride[0] - 2 * layer.padding[0]
                        + layer.kernel_size[0] + layer.output_padding[0])
            out_w = int((in_w - 1) * layer.stride[1] - 2 * layer.padding[1]
                        + layer.kernel_size[1] + layer.output_padding[1])
            delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \
                layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
            delta_params = get_layer_param(layer)

        ### ops_learned_conv
        elif type_name in ['LearnedGroupConv']:
            measure_layer(layer.relu, x)
            measure_layer(layer.norm, x)
            conv = layer.conv
            out_h = _window_out_size(x.size()[2], conv.kernel_size[0], conv.stride[0],
                                     conv.padding[0])
            out_w = _window_out_size(x.size()[3], conv.kernel_size[1], conv.stride[1],
                                     conv.padding[1])
            delta_ops = conv.in_channels * conv.out_channels * conv.kernel_size[0] * \
                conv.kernel_size[1] * out_h * out_w / layer.condense_factor * multi_add
            delta_params = get_layer_param(conv) / layer.condense_factor

        ### ops_nonlinearity
        elif type_name in ['ReLU', 'ReLU6']:
            delta_ops = x.numel()
            delta_params = get_layer_param(layer)

        ### ops_pooling
        elif type_name in ['AvgPool2d', 'MaxPool2d']:
            # kernel_size / stride / padding may be ints or (h, w) pairs, and
            # the input need not be square, so both axes are handled
            # separately.
            kernel_h, kernel_w = _as_hw_pair(layer.kernel_size)
            stride_h, stride_w = _as_hw_pair(layer.stride)
            pad_h, pad_w = _as_hw_pair(layer.padding)
            kernel_ops = kernel_h * kernel_w
            out_h = _window_out_size(x.size()[2], kernel_h, stride_h, pad_h)
            out_w = _window_out_size(x.size()[3], kernel_w, stride_w, pad_w)
            delta_ops = x.size()[0] * x.size()[1] * out_w * out_h * kernel_ops
            delta_params = get_layer_param(layer)

        elif type_name in ['LastLevelMaxPool']:
            pass

        elif type_name in ['AdaptiveAvgPool2d']:
            delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3]
            delta_params = get_layer_param(layer)

        elif type_name in ['ZeroPad2d', 'RetinaNetPostProcessor']:
            pass

        ### ops_linear
        elif type_name in ['Linear']:
            weight_ops = layer.weight.numel() * multi_add
            # A Linear layer built with bias=False has no bias tensor.
            bias_ops = layer.bias.numel() if layer.bias is not None else 0
            delta_ops = x.size()[0] * (weight_ops + bias_ops)
            delta_params = get_layer_param(layer)

        ### ops_nothing
        elif type_name in ['BatchNorm2d', 'Dropout2d', 'DropChannel', 'Dropout',
                           'FrozenBatchNorm2d', 'GroupNorm']:
            delta_params = get_layer_param(layer)

        elif type_name in ['SumTwo']:
            delta_ops = x.numel()

        elif type_name in ['AggregateCell']:
            if not layer.pre_transform:
                delta_ops = 2 * x.numel()  # twice for each input
            else:
                measure_layer(layer.branch_1, x)
                measure_layer(layer.branch_2, x)
            delta_params = get_layer_param(layer)

        elif type_name in ['Identity', 'Zero']:
            pass

        elif type_name in ['Scale']:
            delta_params = get_layer_param(layer)
            delta_ops = x.numel()

        elif type_name in ['FCOSPostProcessor', 'RPNPostProcessor', 'KeypointPostProcessor',
                           'ROIAlign', 'PostProcessor', 'KeypointRCNNPredictor',
                           'NaiveSyncBatchNorm', 'Upsample', 'Sequential']:
            pass

        elif type_name in ['DeformConv']:
            # don't count bilinear
            offset_conv = list(layer.parameters())[0]
            delta_ops = reduce(operator.mul, offset_conv.size(), x.size()[2] * x.size()[3])
            out_h = _window_out_size(x.size()[2], layer.kernel_size[0], layer.stride[0],
                                     layer.padding[0], layer.dilation[0])
            out_w = _window_out_size(x.size()[3], layer.kernel_size[1], layer.stride[1],
                                     layer.padding[1], layer.dilation[1])
            delta_ops += layer.in_channels * layer.out_channels * layer.kernel_size[0] * \
                layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
            delta_params = get_layer_param(layer)

        ### unknown layer type
        else:
            raise TypeError('unknown layer type: %s' % type_name)

        count_ops += delta_ops
        count_params += delta_params
    return
def measure_model(model, x):
    """
    Run ``model.forward(x)`` while counting operations and parameters.

    The ``forward`` of every measured sub-module (a leaf, or a module with a
    ``mask`` attribute) is temporarily wrapped so that each call first goes
    through :func:`measure_layer`. The original forwards are put back even if
    the forward pass raises.

    Args:
        model: module whose children are instrumented.
        x: input passed to ``model.forward``.

    Returns:
        tuple: ``(out, count_ops, count_params)``.
    """
    global count_ops, count_params
    count_ops = 0
    count_params = 0

    def should_measure(module):
        return is_leaf(module) or is_pruned(module)

    def make_measured_forward(module):
        # Bind ``module`` now so each wrapper refers to its own layer.
        def measured_forward(*args):
            measure_layer(module, *args)
            return module.old_forward(*args)
        return measured_forward

    def modify_forward(module):
        for child in module.children():
            if should_measure(child):
                child.old_forward = child.forward
                child.forward = make_measured_forward(child)
            else:
                modify_forward(child)

    def restore_forward(module):
        # Walk the tree with the same predicate used for patching, so every
        # patched module (including non-leaf pruned ones) is restored.
        for child in module.children():
            if should_measure(child):
                if getattr(child, 'old_forward', None) is not None:
                    child.forward = child.old_forward
                    child.old_forward = None
            else:
                restore_forward(child)

    modify_forward(model)
    try:
        out = model.forward(x)
    finally:
        restore_forward(model)

    return out, count_ops, count_params
================================================
FILE: postprocessing.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from torch.nn import functional as F
from detectron2.layers import paste_masks_in_image
from detectron2.structures import Instances
def detector_postprocess(results, output_height, output_width, mask_threshold=0.5):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    Boxes are scaled and clipped to the output size and empty boxes are
    dropped. Predicted masks are resized bilinearly to the output size and
    binarized with ``mask_threshold``. Keypoint x/y are scaled.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.
        mask_threshold (float): values of the resized masks above this
            threshold become foreground.

    Returns:
        Instances: the resized output from the model, based on the output resolution

    Raises:
        ValueError: if ``results`` has neither ``pred_boxes`` nor
            ``proposal_boxes``.
    """
    scale_x = output_width / results.image_size[1]
    scale_y = output_height / results.image_size[0]
    results = Instances((output_height, output_width), **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        raise ValueError("results must contain 'pred_boxes' or 'proposal_boxes'")

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    results = results[output_boxes.nonempty()]

    if results.has("pred_masks"):
        # Interpolating an empty batch is skipped; the field is left as is.
        if results.pred_masks.shape[0]:
            resized = F.interpolate(
                input=results.pred_masks, size=results.image_size,
                mode="bilinear", align_corners=False,
            )
            results.pred_masks = resized.gt(mask_threshold).squeeze(1)

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results
def sem_seg_postprocess(result, img_size, output_height, output_width):
    """
    Return semantic segmentation predictions in the original resolution.

    The input images are often resized when entering the semantic segmentor.
    Moreover, in some cases, they are also padded inside the segmentor to be
    divisible by the maximum network stride. As a result, we often need the
    predictions of the segmentor in a different resolution from its inputs.

    Args:
        result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
            where C is the number of classes, and H, W are the height and width of the prediction.
        img_size (tuple): image size that segmentor is taking as input.
        output_height, output_width: the desired output resolution.

    Returns:
        semantic segmentation prediction (Tensor): A tensor of the shape
            (C, output_height, output_width) that contains per-pixel soft predictions.
    """
    valid_height, valid_width = img_size[0], img_size[1]
    # Keep only the top-left `img_size` region of the prediction.
    cropped = result[:, :valid_height, :valid_width]
    # F.interpolate works on batched input, so add a leading axis of size 1.
    batched = cropped.expand(1, -1, -1, -1)
    resized = F.interpolate(
        batched,
        size=(output_height, output_width),
        mode="bilinear",
        align_corners=False,
    )
    # Remove the batch axis again.
    return resized[0]
================================================
FILE: tools/compute_flops.py
================================================
import torch
from detectron2.engine import default_argument_parser, default_setup
from adet.config import get_cfg
from adet.utils.measures import measure_model
from train_net import Trainer
def setup(args):
    """
    Build the frozen config for this run and perform the default setup.

    Values from `args.config_file` are merged first, then the overrides in
    `args.opts`; the config is frozen before `default_setup` is called.
    """
    config = get_cfg()
    for merge, source in (
        (config.merge_from_file, args.config_file),
        (config.merge_from_list, args.opts),
    ):
        merge(source)
    config.freeze()
    default_setup(config, args)
    return config
def main(args):
    """
    Build the model from the config and print its op and parameter counts.

    The counts come from `measure_model` run on a single all-zero 3x512x512
    image. They are printed divided by 2**30 (labelled "G") and 2**20
    (labelled "M") respectively.

    Args:
        args: parsed command-line arguments, forwarded to `setup`.
    """
    # NOTE(review): this tool imports `get_cfg`/`measure_model` from `adet`,
    # while train_net.py imports from `fcos` -- confirm the package name.
    cfg = setup(args)
    model = Trainer.build_model(cfg)
    model.eval().cuda()

    input_size = (3, 512, 512)
    image = torch.zeros(*input_size)
    batched_input = {"image": image}

    # Take the last two returned values as (ops, params). This works whether
    # measure_model returns only the two counts or also prepends the model
    # output, as the `return out, count_ops, count_params` at the end of
    # measures.py suggests.
    # NOTE(review): confirm which return shape is actually in use.
    *_, ops, params = measure_model(model, [batched_input])
    print('ops: {:.2f}G\tparams: {:.2f}M'.format(ops / 2**30, params / 2**20))
if __name__ == "__main__":
    # Parse the standard detectron2 command-line flags, echo them, then run
    # the measurement.
    cli_args = default_argument_parser().parse_args()
    print("Command Line Args:", cli_args)
    main(cli_args)
================================================
FILE: tools/convert_fcos_weight.py
================================================
import argparse
from collections import OrderedDict
import torch
def get_parser():
    """
    Build the command-line parser for the weight converter.

    Returns:
        argparse.ArgumentParser: a parser with two options, `--model` (the
        checkpoint to read) and `--output` (where the converted weights are
        written), each with a default under `weights/`.
    """
    parser = argparse.ArgumentParser(description="FCOS Detectron2 Converter")
    parser.add_argument(
        "--model",
        default="weights/fcos_R_50_1x_official.pth",
        metavar="FILE",
        help="path to model weights",
    )
    parser.add_argument(
        "--output",
        default="weights/fcos_R_50_1x_converted.pth",
        metavar="FILE",
        # Previously a copy of the --model help text, which hid the fact that
        # this is the destination file.
        help="path to save the converted weights",
    )
    return parser
def rename_resnet_param_names(ckpt_state_dict):
    """
    Return a copy of a state dict with its parameter names rewritten.

    Every key is passed through a fixed sequence of substring replacements;
    values are carried over untouched and the original key order is kept.

    Args:
        ckpt_state_dict: mapping from parameter name (str) to its value.

    Returns:
        OrderedDict: the same values under their renamed keys.
    """
    # Applied top to bottom. Each rule sees the output of the previous ones,
    # so the order is significant (e.g. "fpn_inner2" must be renamed before
    # the "fpn." prefix is stripped).
    substitutions = (
        ("module.", ""),
        ("body", "bottom_up"),
        # The leading "." keeps these rules from matching "fpn_layerN" names.
        (".layer1", ".res2"),
        (".layer2", ".res3"),
        (".layer3", ".res4"),
        (".layer4", ".res5"),
        ("downsample.0", "shortcut"),
        ("downsample.1", "shortcut.norm"),
        ("bn1", "conv1.norm"),
        ("bn2", "conv2.norm"),
        ("bn3", "conv3.norm"),
        ("fpn_inner2", "fpn_lateral3"),
        ("fpn_inner3", "fpn_lateral4"),
        ("fpn_inner4", "fpn_lateral5"),
        ("fpn_layer2", "fpn_output3"),
        ("fpn_layer3", "fpn_output4"),
        ("fpn_layer4", "fpn_output5"),
        ("top_blocks", "top_block"),
        ("fpn.", ""),
        ("rpn", "proposal_generator"),
        ("head", "fcos_head"),
    )

    converted_state_dict = OrderedDict()
    for old_name, value in ckpt_state_dict.items():
        new_name = old_name
        for pattern, replacement in substitutions:
            new_name = new_name.replace(pattern, replacement)
        converted_state_dict[new_name] = value
    return converted_state_dict
if __name__ == "__main__":
    # Load the checkpoint, rename the entries stored under its "model" key,
    # and save only that renamed mapping to the output path.
    cli_args = get_parser().parse_args()
    checkpoint = torch.load(cli_args.model)
    renamed_weights = rename_resnet_param_names(checkpoint["model"])
    torch.save(renamed_weights, cli_args.output)
================================================
FILE: tools/remove_optim_from_ckpt.py
================================================
import argparse
import torch
def get_parser():
    """
    Build the command-line parser for the checkpoint-stripping tool.

    Returns:
        argparse.ArgumentParser: a parser with `--path` and `--name` options,
        both of which have defaults.
    """
    cli = argparse.ArgumentParser(description="Keep only model in ckpt")
    # (flag, default, help) for each option, registered in this order.
    option_specs = (
        ("--path", "output/person/blendmask/R_50_1x/", "path to model weights"),
        ("--name", "R_50_1x.pth", "name of output file"),
    )
    for flag, default_value, help_text in option_specs:
        cli.add_argument(flag, default=default_value, help=help_text)
    return cli
if __name__ == "__main__":
    import os  # local import: only this entry point needs it

    # Read <path>/model_final.pth, keep only the value stored under its
    # "model" key, and write that next to it under the requested name.
    args = get_parser().parse_args()
    # os.path.join supplies the separator when --path is given without a
    # trailing slash; plain string concatenation silently produced a wrong
    # file name in that case.
    ckpt = torch.load(os.path.join(args.path, "model_final.pth"))
    model = ckpt["model"]
    torch.save(model, os.path.join(args.path, args.name))
================================================
FILE: train_net.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Detection Training Script.
This scripts reads a given config file and runs the training or evaluation.
It is an entry point that is made to train standard models in detectron2.
In order to let one script support training of many models,
this script contains logic that are specific to these built-in models and therefore
may not be suitable for your own project.
For example, your research project perhaps only needs a single "evaluator".
Therefore, we recommend you to use detectron2 as an library and take
this file as an example of how to use the library.
You may want to write your own script with your datasets and other customizations.
"""
import logging
import os
from collections import OrderedDict
import torch
from torch.nn.parallel import DistributedDataParallel
import detectron2.utils.comm as comm
from detectron2.data import MetadataCatalog, build_detection_train_loader
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
from detectron2.utils.events import EventStorage
from detectron2.evaluation import (
CityscapesEvaluator,
COCOEvaluator,
COCOPanopticEvaluator,
DatasetEvaluators,
LVISEvaluator,
PascalVOCDetectionEvaluator,
SemSegEvaluator,
verify_results,
)
from detectron2.modeling import GeneralizedRCNNWithTTA
from detectron2.data.dataset_mapper import DatasetMapper
from fcos.config import get_cfg
from fcos.checkpoint import AdetCheckpointer
class Trainer(DefaultTrainer):
    """
    DefaultTrainer subclass used by this script.

    It overrides construction (checkpointing goes through `AdetCheckpointer`),
    the training loop and the train loader, and adds evaluator selection and
    a test-time-augmentation evaluation helper.
    """

    def __init__(self, cfg):
        """
        Args:
            cfg (CfgNode): config used to build the model, optimizer, data
                loader, LR scheduler, checkpointer and hooks.
        """
        # Assume these objects must be constructed in this order.
        net = self.build_model(cfg)
        optim = self.build_optimizer(cfg, net)
        loader = self.build_train_loader(cfg)

        # For training, wrap with DDP. But don't need this for inference.
        if comm.get_world_size() > 1:
            local_rank = comm.get_local_rank()
            net = DistributedDataParallel(
                net, device_ids=[local_rank], broadcast_buffers=False
            )

        # The lookup deliberately starts *after* DefaultTrainer in the MRO,
        # so DefaultTrainer.__init__ itself is not executed.
        super(DefaultTrainer, self).__init__(net, loader, optim)

        self.scheduler = self.build_lr_scheduler(cfg, optim)

        # Assume no other objects need to be checkpointed.
        # Checkpoints are written to cfg.OUTPUT_DIR.
        self.checkpointer = AdetCheckpointer(
            net,
            cfg.OUTPUT_DIR,
            optimizer=optim,
            scheduler=self.scheduler,
        )

        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER
        self.cfg = cfg

        # Hooks are built last because build_hooks is called on the fully
        # initialised trainer.
        self.register_hooks(self.build_hooks())

    def train_loop(self, start_iter: int, max_iter: int):
        """
        Run iterations `start_iter` .. `max_iter - 1`, calling the
        before/after hooks around training and around every step.

        Args:
            start_iter, max_iter (int): first iteration to run and the
                (exclusive) iteration to stop at.
        """
        log = logging.getLogger(__name__)
        log.info("Starting training from iteration {}".format(start_iter))

        self.iter = start_iter
        self.start_iter = start_iter
        self.max_iter = max_iter

        # The storage is bound to self.storage for the duration of training.
        with EventStorage(start_iter) as self.storage:
            self.before_train()
            # The loop variable is the attribute itself, so self.iter always
            # holds the current iteration.
            for self.iter in range(start_iter, max_iter):
                self.before_step()
                self.run_step()
                self.after_step()
            self.after_train()

    def train(self):
        """
        Run training from `self.start_iter` to `self.max_iter`.

        Returns:
            The last evaluation results, verified against the config, when
            they exist and this is the main process. Otherwise None.
        """
        self.train_loop(self.start_iter, self.max_iter)

        has_results = hasattr(self, "_last_eval_results")
        if not (has_results and comm.is_main_process()):
            return None

        verify_results(self.cfg, self._last_eval_results)
        return self._last_eval_results

    @classmethod
    def build_train_loader(cls, cfg):
        """
        Returns:
            iterable

        It calls :func:`detectron2.data.build_detection_train_loader` with a
        `DatasetMapper` constructed as `DatasetMapper(cfg, True)`.
        """
        train_mapper = DatasetMapper(cfg, True)
        return build_detection_train_loader(cfg, train_mapper)

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create evaluator(s) for a given dataset.

        The choice is driven by the "evaluator_type" metadata of the dataset.
        When `output_folder` is None, `<cfg.OUTPUT_DIR>/inference` is used.

        Returns:
            A single evaluator, or a `DatasetEvaluators` wrapping several.

        Raises:
            NotImplementedError: if no evaluator matches the dataset's type.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")

        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        collected = []

        # These three types can combine (panoptic uses all of them), so they
        # are accumulated rather than returned immediately.
        if evaluator_type in ("sem_seg", "coco_panoptic_seg"):
            sem_seg_evaluator = SemSegEvaluator(
                dataset_name,
                distributed=True,
                num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
                ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
                output_dir=output_folder,
            )
            collected.append(sem_seg_evaluator)
        if evaluator_type in ("coco", "coco_panoptic_seg"):
            collected.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
        if evaluator_type == "coco_panoptic_seg":
            collected.append(COCOPanopticEvaluator(dataset_name, output_folder))

        # The remaining types map to exactly one evaluator each.
        if evaluator_type == "cityscapes":
            assert (
                torch.cuda.device_count() >= comm.get_rank()
            ), "CityscapesEvaluator currently do not work with multiple machines."
            return CityscapesEvaluator(dataset_name)
        if evaluator_type == "pascal_voc":
            return PascalVOCDetectionEvaluator(dataset_name)
        if evaluator_type == "lvis":
            return LVISEvaluator(dataset_name, cfg, True, output_folder)

        if not collected:
            raise NotImplementedError(
                "no Evaluator for the dataset {} with the type {}".format(
                    dataset_name, evaluator_type
                )
            )
        if len(collected) == 1:
            return collected[0]
        return DatasetEvaluators(collected)

    @classmethod
    def test_with_TTA(cls, cfg, model):
        """
        Evaluate `model` wrapped in `GeneralizedRCNNWithTTA` on every dataset
        in `cfg.DATASETS.TEST`.

        Returns:
            OrderedDict: the test results with "_TTA" appended to each key.
        """
        log = logging.getLogger("detectron2.trainer")
        # In the end of training, run an evaluation with TTA
        # Only support some R-CNN models.
        log.info("Running inference with test-time augmentation ...")

        tta_model = GeneralizedRCNNWithTTA(cfg, model)
        tta_folder = os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
        evaluators = []
        for name in cfg.DATASETS.TEST:
            evaluators.append(cls.build_evaluator(cfg, name, output_folder=tta_folder))

        raw_results = cls.test(cfg, tta_model, evaluators)
        return OrderedDict((key + "_TTA", value) for key, value in raw_results.items())
def setup(args):
    """
    Create the config for this run and perform the basic setup.

    The config file is merged first and the command-line overrides
    (`args.opts`) second; the result is frozen before `default_setup` runs.
    """
    config = get_cfg()
    # File values first, then command-line overrides on top of them.
    config.merge_from_file(args.config_file)
    config.merge_from_list(args.opts)
    config.freeze()

    default_setup(config, args)
    return config
def main(args):
    """
    Entry point run in each process: evaluate only, or train.

    With `args.eval_only`, the model is built, weights are loaded from
    `cfg.MODEL.WEIGHTS` and the test results are returned (extended with TTA
    results when `cfg.TEST.AUG.ENABLED`). Otherwise a `Trainer` is created,
    optionally resumed, and the result of `trainer.train()` is returned.
    """
    cfg = setup(args)

    if args.eval_only:
        model = Trainer.build_model(cfg)
        checkpointer = AdetCheckpointer(model, save_dir=cfg.OUTPUT_DIR)
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume)

        results = Trainer.test(cfg, model)
        # Verification happens on the plain results, before any TTA entries
        # are merged in.
        if comm.is_main_process():
            verify_results(cfg, results)
        if cfg.TEST.AUG.ENABLED:
            results.update(Trainer.test_with_TTA(cfg, model))
        return results

    # If you'd like to do anything fancier than the standard training logic,
    # consider writing your own training loop or subclassing the trainer.
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)

    if cfg.TEST.AUG.ENABLED:

        def run_tta_eval():
            # Looks up trainer.model at call time, not at registration time.
            return trainer.test_with_TTA(cfg, trainer.model)

        trainer.register_hooks([hooks.EvalHook(0, run_tta_eval)])

    return trainer.train()
if __name__ == "__main__":
    # Parse the standard detectron2 command-line flags, echo them, then hand
    # `main` to `launch` together with the distributed settings.
    cli_args = default_argument_parser().parse_args()
    print("Command Line Args:", cli_args)

    launch_options = dict(
        num_machines=cli_args.num_machines,
        machine_rank=cli_args.machine_rank,
        dist_url=cli_args.dist_url,
        args=(cli_args,),
    )
    launch(main, cli_args.num_gpus, **launch_options)