Repository: bagh2178/SG-Nav
Branch: main
Commit: d56863c96dea
Files: 618
Total size: 16.2 MB
Directory structure:
gitextract_t68xqa9y/
├── .gitignore
├── GLIP/
│ ├── CODE_OF_CONDUCT.md
│ ├── DATA.md
│ ├── LICENSE
│ ├── README.md
│ ├── SECURITY.md
│ ├── SUPPORT.md
│ ├── configs/
│ │ ├── flickr/
│ │ │ ├── test.yaml
│ │ │ └── val.yaml
│ │ ├── lvis/
│ │ │ ├── minival.yaml
│ │ │ └── val.yaml
│ │ ├── odinw/
│ │ │ └── Aquarium_Aquarium_Combined.v2-raw-1024.coco.yaml
│ │ └── pretrain/
│ │ ├── glip_Swin_L.yaml
│ │ └── glip_Swin_T_O365_GoldG.yaml
│ ├── maskrcnn_benchmark/
│ │ ├── __init__.py
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ ├── defaults.py
│ │ │ └── paths_catalog.py
│ │ ├── csrc/
│ │ │ ├── ROIAlign.h
│ │ │ ├── ROIPool.h
│ │ │ ├── SigmoidFocalLoss.h
│ │ │ ├── cpu/
│ │ │ │ ├── ROIAlign_cpu.cpp
│ │ │ │ ├── nms_cpu.cpp
│ │ │ │ ├── soft_nms.cpp
│ │ │ │ └── vision.h
│ │ │ ├── cuda/
│ │ │ │ ├── ROIAlign_cuda.cu
│ │ │ │ ├── ROIPool_cuda.cu
│ │ │ │ ├── SigmoidFocalLoss_cuda.cu
│ │ │ │ ├── deform_conv_cuda.cu
│ │ │ │ ├── deform_conv_kernel_cuda.cu
│ │ │ │ ├── deform_pool_cuda.cu
│ │ │ │ ├── deform_pool_kernel_cuda.cu
│ │ │ │ ├── ml_nms.cu
│ │ │ │ ├── nms.cu
│ │ │ │ └── vision.h
│ │ │ ├── deform_conv.h
│ │ │ ├── deform_pool.h
│ │ │ ├── ml_nms.h
│ │ │ ├── nms.h
│ │ │ └── vision.cpp
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── build.py
│ │ │ ├── collate_batch.py
│ │ │ ├── datasets/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── background.py
│ │ │ │ ├── box_label_loader.py
│ │ │ │ ├── caption.py
│ │ │ │ ├── coco.py
│ │ │ │ ├── coco_dt.py
│ │ │ │ ├── concat_dataset.py
│ │ │ │ ├── custom_distributed_sampler.py
│ │ │ │ ├── duplicate_dataset.py
│ │ │ │ ├── evaluation/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── box_aug.py
│ │ │ │ │ ├── coco/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── coco_eval.py
│ │ │ │ │ ├── flickr/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── flickr_eval.py
│ │ │ │ │ ├── lvis/
│ │ │ │ │ │ ├── _change_lvis_annotation.py
│ │ │ │ │ │ ├── lvis.py
│ │ │ │ │ │ └── lvis_eval.py
│ │ │ │ │ ├── od_eval.py
│ │ │ │ │ ├── od_to_grounding/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── od_eval.py
│ │ │ │ │ ├── vg/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── vg_eval.py
│ │ │ │ │ └── voc/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── voc_eval.py
│ │ │ │ ├── flickr.py
│ │ │ │ ├── gqa.py
│ │ │ │ ├── imagenet.py
│ │ │ │ ├── list_dataset.py
│ │ │ │ ├── lvis.py
│ │ │ │ ├── mixed.py
│ │ │ │ ├── mixup.py
│ │ │ │ ├── modulated_coco.py
│ │ │ │ ├── object365.py
│ │ │ │ ├── od_to_grounding.py
│ │ │ │ ├── phrasecut.py
│ │ │ │ ├── pseudo_data.py
│ │ │ │ ├── refexp.py
│ │ │ │ ├── tsv.py
│ │ │ │ ├── vg.py
│ │ │ │ └── voc.py
│ │ │ ├── samplers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── distributed.py
│ │ │ │ ├── grouped_batch_sampler.py
│ │ │ │ └── iteration_based_batch_sampler.py
│ │ │ └── transforms/
│ │ │ ├── __init__.py
│ │ │ ├── build.py
│ │ │ └── transforms.py
│ │ ├── engine/
│ │ │ ├── __init__.py
│ │ │ ├── alter_trainer.py
│ │ │ ├── evolution.py
│ │ │ ├── inference.py
│ │ │ ├── predictor.py
│ │ │ ├── predictor_glip.py
│ │ │ ├── singlepath_trainer.py
│ │ │ ├── stage_trainer.py
│ │ │ └── trainer.py
│ │ ├── layers/
│ │ │ ├── __init__.py
│ │ │ ├── batch_norm.py
│ │ │ ├── deform_conv.py
│ │ │ ├── deform_pool.py
│ │ │ ├── dropblock.py
│ │ │ ├── dyhead.py
│ │ │ ├── dyrelu.py
│ │ │ ├── evonorm.py
│ │ │ ├── iou_loss.py
│ │ │ ├── misc.py
│ │ │ ├── nms.py
│ │ │ ├── roi_align.py
│ │ │ ├── roi_pool.py
│ │ │ ├── se.py
│ │ │ ├── set_loss.py
│ │ │ ├── sigmoid_focal_loss.py
│ │ │ └── smooth_l1_loss.py
│ │ ├── modeling/
│ │ │ ├── __init__.py
│ │ │ ├── backbone/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bifpn.py
│ │ │ │ ├── blocks.py
│ │ │ │ ├── efficientdet.py
│ │ │ │ ├── efficientnet.py
│ │ │ │ ├── fbnet.py
│ │ │ │ ├── fpn.py
│ │ │ │ ├── mixer.py
│ │ │ │ ├── ops.py
│ │ │ │ ├── resnet.py
│ │ │ │ ├── swint.py
│ │ │ │ ├── swint_v2.py
│ │ │ │ ├── swint_v2_vl.py
│ │ │ │ └── swint_vl.py
│ │ │ ├── balanced_positive_negative_sampler.py
│ │ │ ├── box_coder.py
│ │ │ ├── detector/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── generalized_rcnn.py
│ │ │ │ └── generalized_vl_rcnn.py
│ │ │ ├── language_backbone/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbone.py
│ │ │ │ ├── bert_model.py
│ │ │ │ ├── build.py
│ │ │ │ ├── clip_model.py
│ │ │ │ ├── hfpt_tokenizer.py
│ │ │ │ ├── rnn_model.py
│ │ │ │ ├── simple_tokenizer.py
│ │ │ │ ├── test_clip_tokenizer.py
│ │ │ │ └── word_utils.py
│ │ │ ├── make_layers.py
│ │ │ ├── matcher.py
│ │ │ ├── poolers.py
│ │ │ ├── registry.py
│ │ │ ├── roi_heads/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── box_head/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── box_head.py
│ │ │ │ │ ├── inference.py
│ │ │ │ │ ├── loss.py
│ │ │ │ │ ├── roi_box_feature_extractors.py
│ │ │ │ │ └── roi_box_predictors.py
│ │ │ │ ├── keypoint_head/
│ │ │ │ │ ├── inference.py
│ │ │ │ │ ├── keypoint_head.py
│ │ │ │ │ ├── loss.py
│ │ │ │ │ ├── roi_keypoint_feature_extractors.py
│ │ │ │ │ └── roi_keypoint_predictors.py
│ │ │ │ └── mask_head/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── hourglass.py
│ │ │ │ ├── inference.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── mask_head.py
│ │ │ │ ├── roi_mask_feature_extractors.py
│ │ │ │ └── roi_mask_predictors.py
│ │ │ ├── rpn/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── anchor_generator.py
│ │ │ │ ├── atss.py
│ │ │ │ ├── dyhead.py
│ │ │ │ ├── fcos.py
│ │ │ │ ├── inference.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── modeling_bert.py
│ │ │ │ ├── retina.py
│ │ │ │ ├── rpn.py
│ │ │ │ ├── transformer.py
│ │ │ │ └── vldyhead.py
│ │ │ └── utils.py
│ │ ├── solver/
│ │ │ ├── __init__.py
│ │ │ ├── build.py
│ │ │ └── lr_scheduler.py
│ │ ├── structures/
│ │ │ ├── __init__.py
│ │ │ ├── bounding_box.py
│ │ │ ├── boxlist_ops.py
│ │ │ ├── image_list.py
│ │ │ ├── keypoint.py
│ │ │ └── segmentation_mask.py
│ │ └── utils/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── amp.py
│ │ ├── big_model_loading.py
│ │ ├── c2_model_loading.py
│ │ ├── checkpoint.py
│ │ ├── collect_env.py
│ │ ├── comm.py
│ │ ├── cv2_util.py
│ │ ├── dist.py
│ │ ├── ema.py
│ │ ├── env.py
│ │ ├── flops.py
│ │ ├── fuse_helper.py
│ │ ├── imports.py
│ │ ├── logger.py
│ │ ├── mdetr_dist.py
│ │ ├── metric_logger.py
│ │ ├── miscellaneous.py
│ │ ├── model_serialization.py
│ │ ├── model_zoo.py
│ │ ├── pretrain_model_loading.py
│ │ ├── registry.py
│ │ ├── shallow_contrastive_loss_helper.py
│ │ └── stats.py
│ ├── setup.py
│ └── tools/
│ ├── cityscapes/
│ │ ├── convert_cityscapes_to_coco.py
│ │ └── instances2dict_with_polygons.py
│ ├── eval_all.py
│ ├── finetune.py
│ ├── test_grounding_net.py
│ ├── test_net.py
│ └── train_net.py
├── GroundingDINO/
│ ├── LICENSE
│ ├── README.md
│ ├── demo/
│ │ ├── gradio_app.py
│ │ └── inference_on_a_image.py
│ ├── groundingdino/
│ │ ├── __init__.py
│ │ ├── config/
│ │ │ ├── GroundingDINO_SwinB.py
│ │ │ └── GroundingDINO_SwinT_OGC.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ └── transforms.py
│ │ ├── models/
│ │ │ ├── GroundingDINO/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── backbone/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── backbone.py
│ │ │ │ │ ├── position_encoding.py
│ │ │ │ │ └── swin_transformer.py
│ │ │ │ ├── bertwarper.py
│ │ │ │ ├── csrc/
│ │ │ │ │ ├── MsDeformAttn/
│ │ │ │ │ │ ├── ms_deform_attn.h
│ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ │ ├── ms_deform_attn_cpu.h
│ │ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ │ ├── cuda_version.cu
│ │ │ │ │ └── vision.cpp
│ │ │ │ ├── fuse_modules.py
│ │ │ │ ├── groundingdino.py
│ │ │ │ ├── ms_deform_attn.py
│ │ │ │ ├── transformer.py
│ │ │ │ ├── transformer_vanilla.py
│ │ │ │ └── utils.py
│ │ │ ├── __init__.py
│ │ │ └── registry.py
│ │ ├── util/
│ │ │ ├── __init__.py
│ │ │ ├── box_ops.py
│ │ │ ├── get_tokenlizer.py
│ │ │ ├── inference.py
│ │ │ ├── logger.py
│ │ │ ├── misc.py
│ │ │ ├── slconfig.py
│ │ │ ├── slio.py
│ │ │ ├── time_counter.py
│ │ │ ├── utils.py
│ │ │ ├── visualizer.py
│ │ │ └── vl_utils.py
│ │ └── version.py
│ ├── pyproject.toml
│ ├── requirements.txt
│ └── setup.py
├── LICENSE
├── README.md
├── SG_Nav.py
├── configs/
│ ├── challenge_objectnav2021.local.rgbd.yaml
│ ├── challenge_objectnav2022.local.rgbd.yaml
│ ├── challenge_pointnav2021.local.rgbd.yaml
│ ├── challenge_pointnav2021.local.rgbd_test_scene.yaml
│ ├── ddppo_objectnav.yaml
│ ├── ddppo_pointnav.yaml
│ └── ddppo_pointnav_.yaml
├── habitat-lab/
│ ├── .circleci/
│ │ └── config.yml
│ ├── .editorconfig
│ ├── .github/
│ │ ├── ISSUE_TEMPLATE/
│ │ │ ├── bug-report.md
│ │ │ ├── feature-request.md
│ │ │ └── questions-help-support.md
│ │ └── PULL_REQUEST_TEMPLATE.md
│ ├── .gitignore
│ ├── .pre-commit-config.yaml
│ ├── CODE_OF_CONDUCT.md
│ ├── CONTRIBUTING.md
│ ├── Dockerfile
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── README.md
│ ├── configs/
│ │ ├── baselines/
│ │ │ └── ppo.yaml
│ │ ├── datasets/
│ │ │ ├── eqa/
│ │ │ │ └── mp3d.yaml
│ │ │ ├── imagenav/
│ │ │ │ ├── gibson.yaml
│ │ │ │ └── mp3d.yaml
│ │ │ ├── objectnav/
│ │ │ │ ├── hm3d.yaml
│ │ │ │ └── mp3d.yaml
│ │ │ ├── pointnav/
│ │ │ │ ├── gibson.yaml
│ │ │ │ ├── gibson_0_plus.yaml
│ │ │ │ ├── gibson_v2.yaml
│ │ │ │ ├── habitat_test.yaml
│ │ │ │ ├── hm3d.yaml
│ │ │ │ └── mp3d.yaml
│ │ │ ├── rearrangepick/
│ │ │ │ └── replica_cad.yaml
│ │ │ ├── single_episode.yaml
│ │ │ └── vln/
│ │ │ └── mp3d_r2r.yaml
│ │ ├── tasks/
│ │ │ ├── eqa_mp3d.yaml
│ │ │ ├── imagenav.yaml
│ │ │ ├── imagenav_gibson.yaml
│ │ │ ├── objectnav_hm3d.yaml
│ │ │ ├── objectnav_mp3d.yaml
│ │ │ ├── pointnav.yaml
│ │ │ ├── pointnav_gibson.yaml
│ │ │ ├── pointnav_hm3d.yaml
│ │ │ ├── pointnav_mp3d.yaml
│ │ │ ├── pointnav_rgbd.yaml
│ │ │ ├── rearrange/
│ │ │ │ ├── pick.yaml
│ │ │ │ ├── pick_spa.yaml
│ │ │ │ ├── pick_state.yaml
│ │ │ │ └── play.yaml
│ │ │ └── vln_r2r.yaml
│ │ └── test/
│ │ ├── habitat_all_sensors_test.yaml
│ │ ├── habitat_mp3d_eqa_test.yaml
│ │ ├── habitat_mp3d_object_nav_test.yaml
│ │ ├── habitat_r2r_vln_test.yaml
│ │ └── new_keys_test.yaml
│ ├── docs/
│ │ ├── .gitignore
│ │ ├── build-public.sh
│ │ ├── build.sh
│ │ ├── conf-public.py
│ │ ├── conf.py
│ │ ├── docs.rst
│ │ └── pages/
│ │ ├── habitat-lab-demo.rst
│ │ ├── habitat-sim-demo.rst
│ │ ├── index.rst
│ │ ├── quickstart.rst
│ │ └── view-transform-warp.rst
│ ├── examples/
│ │ ├── __init__.py
│ │ ├── benchmark.py
│ │ ├── example.py
│ │ ├── example_pointnav.py
│ │ ├── interactive_play.py
│ │ ├── new_actions.py
│ │ ├── register_new_sensors_and_measures.py
│ │ ├── shortest_path_follower_example.py
│ │ ├── tutorials/
│ │ │ ├── colabs/
│ │ │ │ └── Habitat_Lab.ipynb
│ │ │ └── nb_python/
│ │ │ └── Habitat_Lab.py
│ │ ├── visualization_examples.py
│ │ ├── vln_benchmark.py
│ │ └── vln_reference_path_follower_example.py
│ ├── habitat/
│ │ ├── __init__.py
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ └── default.py
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── agent.py
│ │ │ ├── benchmark.py
│ │ │ ├── challenge.py
│ │ │ ├── dataset.py
│ │ │ ├── embodied_task.py
│ │ │ ├── env.py
│ │ │ ├── environments.py
│ │ │ ├── logging.py
│ │ │ ├── registry.py
│ │ │ ├── simulator.py
│ │ │ ├── spaces.py
│ │ │ ├── utils.py
│ │ │ └── vector_env.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── eqa/
│ │ │ │ ├── __init__.py
│ │ │ │ └── mp3d_eqa_dataset.py
│ │ │ ├── object_nav/
│ │ │ │ ├── __init__.py
│ │ │ │ └── object_nav_dataset.py
│ │ │ ├── pointnav/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pointnav_dataset.py
│ │ │ │ └── pointnav_generator.py
│ │ │ ├── rearrange/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configs/
│ │ │ │ │ ├── pick/
│ │ │ │ │ │ └── counter.yaml
│ │ │ │ │ └── test_config.yaml
│ │ │ │ ├── generate_episode_inits.py
│ │ │ │ ├── rearrange_dataset.py
│ │ │ │ ├── rearrange_generator.py
│ │ │ │ ├── receptacle.py
│ │ │ │ └── samplers.py
│ │ │ ├── registration.py
│ │ │ ├── utils.py
│ │ │ └── vln/
│ │ │ ├── __init__.py
│ │ │ └── r2r_vln_dataset.py
│ │ ├── py.typed
│ │ ├── sims/
│ │ │ ├── __init__.py
│ │ │ ├── habitat_simulator/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── actions.py
│ │ │ │ ├── debug_visualizer.py
│ │ │ │ ├── habitat_simulator.py
│ │ │ │ └── sim_utilities.py
│ │ │ ├── pyrobot/
│ │ │ │ ├── __init__.py
│ │ │ │ └── pyrobot.py
│ │ │ └── registration.py
│ │ ├── tasks/
│ │ │ ├── __init__.py
│ │ │ ├── eqa/
│ │ │ │ ├── __init__.py
│ │ │ │ └── eqa.py
│ │ │ ├── nav/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── nav.py
│ │ │ │ ├── object_nav_task.py
│ │ │ │ └── shortest_path_follower.py
│ │ │ ├── rearrange/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── actions.py
│ │ │ │ ├── grip_actions.py
│ │ │ │ ├── marker_info.py
│ │ │ │ ├── policy_modules.py
│ │ │ │ ├── rearrange_grasp_manager.py
│ │ │ │ ├── rearrange_sensors.py
│ │ │ │ ├── rearrange_sim.py
│ │ │ │ ├── rearrange_task.py
│ │ │ │ ├── sub_tasks/
│ │ │ │ │ ├── pick_sensors.py
│ │ │ │ │ └── pick_task.py
│ │ │ │ └── utils.py
│ │ │ ├── registration.py
│ │ │ ├── utils.py
│ │ │ └── vln/
│ │ │ ├── __init__.py
│ │ │ └── vln.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── geometry_utils.py
│ │ │ ├── pickle5_multiprocessing.py
│ │ │ ├── profiling_wrapper.py
│ │ │ ├── test_utils.py
│ │ │ └── visualizations/
│ │ │ ├── __init__.py
│ │ │ ├── fog_of_war.py
│ │ │ ├── maps.py
│ │ │ └── utils.py
│ │ └── version.py
│ ├── habitat_baselines/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── agents/
│ │ │ ├── __init__.py
│ │ │ ├── benchmark_gym.py
│ │ │ ├── mp_agents.py
│ │ │ ├── ppo_agents.py
│ │ │ ├── simple_agents.py
│ │ │ └── slam_agents.py
│ │ ├── common/
│ │ │ ├── __init__.py
│ │ │ ├── base_il_trainer.py
│ │ │ ├── base_trainer.py
│ │ │ ├── baseline_registry.py
│ │ │ ├── environments.py
│ │ │ ├── obs_transformers.py
│ │ │ ├── rollout_storage.py
│ │ │ ├── tensor_dict.py
│ │ │ └── tensorboard_utils.py
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ ├── default.py
│ │ │ ├── eqa/
│ │ │ │ ├── il_eqa_cnn_pretrain.yaml
│ │ │ │ ├── il_pacman_nav.yaml
│ │ │ │ └── il_vqa.yaml
│ │ │ ├── imagenav/
│ │ │ │ ├── ddppo_imagenav_example.yaml
│ │ │ │ ├── ddppo_imagenav_gibson.yaml
│ │ │ │ └── ppo_imagenav_example.yaml
│ │ │ ├── objectnav/
│ │ │ │ ├── ddppo_objectnav.yaml
│ │ │ │ └── ddppo_objectnav_hm3d.yaml
│ │ │ ├── pointnav/
│ │ │ │ ├── ddppo_pointnav.yaml
│ │ │ │ ├── ppo_pointnav.yaml
│ │ │ │ ├── ppo_pointnav_example.yaml
│ │ │ │ └── ppo_pointnav_habitat_iccv19.yaml
│ │ │ ├── rearrange/
│ │ │ │ ├── rl_pick.yaml
│ │ │ │ ├── rl_pick_state.yaml
│ │ │ │ └── spap_pick.yaml
│ │ │ └── test/
│ │ │ ├── ddppo_imagenav_test.yaml
│ │ │ ├── ddppo_pointnav_test.yaml
│ │ │ ├── ppo_imagenav_test.yaml
│ │ │ └── ppo_pointnav_test.yaml
│ │ ├── il/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── metrics.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ └── models.py
│ │ │ ├── requirements.txt
│ │ │ └── trainers/
│ │ │ ├── __init__.py
│ │ │ ├── eqa_cnn_pretrain_trainer.py
│ │ │ ├── pacman_trainer.py
│ │ │ └── vqa_trainer.py
│ │ ├── motion_planning/
│ │ │ ├── __init__.py
│ │ │ ├── grasp_generator.py
│ │ │ ├── motion_plan.py
│ │ │ ├── mp_sim.py
│ │ │ ├── mp_spaces.py
│ │ │ └── robot_target.py
│ │ ├── py.typed
│ │ ├── rl/
│ │ │ ├── __init__.py
│ │ │ ├── ddppo/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── algo/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── ddppo.py
│ │ │ │ ├── data_generation/
│ │ │ │ │ ├── create_gibson_large_dataset.py
│ │ │ │ │ └── gibson_dset_with_qual.json
│ │ │ │ ├── ddp_utils.py
│ │ │ │ ├── multi_node_slurm.sh
│ │ │ │ ├── policy/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── resnet.py
│ │ │ │ │ ├── resnet_policy.py
│ │ │ │ │ └── running_mean_and_var.py
│ │ │ │ ├── requirements.txt
│ │ │ │ └── single_node.sh
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── rnn_state_encoder.py
│ │ │ │ └── simple_cnn.py
│ │ │ ├── ppo/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── policy.py
│ │ │ │ ├── ppo.py
│ │ │ │ └── ppo_trainer.py
│ │ │ └── requirements.txt
│ │ ├── run.py
│ │ ├── slambased/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── install_deps.sh
│ │ │ ├── mappers.py
│ │ │ ├── monodepth.py
│ │ │ ├── path_planners.py
│ │ │ ├── reprojection.py
│ │ │ ├── requirements.txt
│ │ │ └── utils.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── common.py
│ │ ├── env_utils.py
│ │ ├── gym_adapter.py
│ │ ├── gym_definitions.py
│ │ ├── render_wrapper.py
│ │ └── visualizations/
│ │ ├── __init__.py
│ │ └── utils.py
│ ├── mypy.ini
│ ├── pyproject.toml
│ ├── requirements.txt
│ ├── scripts/
│ │ └── generate_profile_shell_scripts.py
│ ├── setup.cfg
│ ├── setup.py
│ └── test/
│ ├── test_baseline_agents.py
│ ├── test_baseline_trainers.py
│ ├── test_config.py
│ ├── test_dataset.py
│ ├── test_ddppo_reduce.py
│ ├── test_demo_notebook.py
│ ├── test_examples.py
│ ├── test_gym_wrapper.py
│ ├── test_habitat_env.py
│ ├── test_habitat_example.py
│ ├── test_habitat_sim.py
│ ├── test_habitat_task.py
│ ├── test_install.py
│ ├── test_motion_plan.py
│ ├── test_mp3d_eqa.py
│ ├── test_object_nav_task.py
│ ├── test_pointnav_dataset.py
│ ├── test_pyrobot.py
│ ├── test_r2r_vln.py
│ ├── test_rearrange_task.py
│ ├── test_rnn_state_encoder.py
│ ├── test_sensors.py
│ ├── test_spaces.py
│ ├── test_tensor_dict.py
│ └── test_visual_utils.py
├── requirements.txt
├── scenegraph.py
├── segment_anything/
│ ├── .flake8
│ ├── CODE_OF_CONDUCT.md
│ ├── CONTRIBUTING.md
│ ├── LICENSE
│ ├── README.md
│ ├── linter.sh
│ ├── notebooks/
│ │ ├── automatic_mask_generator_example.ipynb
│ │ ├── onnx_model_example.ipynb
│ │ └── predictor_example.ipynb
│ ├── scripts/
│ │ ├── amg.py
│ │ └── export_onnx_model.py
│ ├── segment_anything/
│ │ ├── __init__.py
│ │ ├── automatic_mask_generator.py
│ │ ├── build_sam.py
│ │ ├── build_sam_hq.py
│ │ ├── modeling/
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── image_encoder.py
│ │ │ ├── mask_decoder.py
│ │ │ ├── mask_decoder_hq.py
│ │ │ ├── prompt_encoder.py
│ │ │ ├── sam.py
│ │ │ └── transformer.py
│ │ ├── predictor.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── amg.py
│ │ ├── onnx.py
│ │ └── transforms.py
│ ├── setup.cfg
│ └── setup.py
├── tools/
│ ├── agent.py
│ ├── download_mp.py
│ ├── matterport_category_mappings.tsv
│ ├── obj.npy
│ ├── replica.yaml
│ └── room.npy
└── utils/
├── __init__.py
├── image_process.py
├── utils_fmm/
│ ├── __init__.py
│ ├── control_helper.py
│ ├── depth_utils.py
│ ├── fmm_planner.py
│ ├── mapping.py
│ ├── model.py
│ ├── pose_utils.py
│ └── rotation_utils.py
├── utils_glip.py
└── utils_scenegraph/
├── __init__.py
├── grounded_sam_demo.py
├── iou.py
├── mapping.py
├── slam_classes.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
**/__pycache__/
glip_large_model.pth
/data
/GLIP/build
/GLIP/maskrcnn_benchmark.egg-info
/segment_anything/segment_anything.egg-info/
*.so
segment_anything/sam_vit_*
GroundingDINO/groundingdino_swint_ogc.pth
.vscode
start.py
start_multiprocess.py
================================================
FILE: GLIP/CODE_OF_CONDUCT.md
================================================
# Microsoft Open Source Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
Resources:
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
================================================
FILE: GLIP/DATA.md
================================================
We provide guidance for preparing the data used by GLIP. Note that not all data are needed for a specific experiments. Please check the `` Required Data`` fields in [README](README.md) to download necessary data. All data should by placed under the ``DATASET`` folder.
#### ``COCO``
Download the original [COCO](https://cocodataset.org/#download) data into ``DATASET/coco`` folder. The contents should be organized as follows:
###### train2017
DATASET/coco/train2017
DATASET/coco/annotations/instances_train2017.json
###### val2017
DATASET/coco/val2017
DATASET/coco/annotations/instances_val2017.json
###### test2017
DATASET/coco/test2017
DATASET/coco/annotations/image_info_test-dev2017.json
###### train2014
DATASET/coco/train2014
#### ``LVIS``
LVIS use the same images as COCO. Thus prepare the COCO images first.
DATASET/coco
Download the following annotation files:
"wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/coco/annotations/lvis_v1_minival_inserted_image_name.json -O DATASET/coco/annotations/lvis_v1_minival_inserted_image_name.json"
"wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/coco/annotations/lvis_od_val.json -O coco/annotations/lvis_od_val.json"
#### ``Object Detection in the Wild (ODinW)``
Please see the "ODinW / Custom Dataset Evaluation" section in [README.md](README.md) for preparing the Aquarium dataset. We will release all the data in ODinW in the next update.
#### ``Objects365``
We store Objects365 data in the TSV format. Please see [link](https://github.com/microsoft/scene_graph_benchmark/tree/main/tools/mini_tsv) for a description of the TSV format. We provide the annotation files:
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/Objects365/objects365_train_vgoiv6.cas2000.yaml -O DATASET/Objects365/objects365_train_vgoiv6.cas2000.yaml
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/Objects365/train.label.tsv -O DATASET/Objects365/train.label.tsv
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/Objects365/train.label.linelist.cas.2000.tsv -O DATASET/Objects365/train.label.linelist.cas.2000.tsv
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/Objects365/train.label.lineidx -O DATASET/Objects365/train.label.lineidx
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/Objects365/train.hw.tsv -O DATASET/Objects365/train.hw.tsv
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/Objects365/train.hw.lineidx -O DATASET/Objects365/train.hw.lineidx
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/Objects365/object365_vgoiv6_class2ind.json -O DATASET/Objects365/object365_vgoiv6_class2ind.json
We cannot host the image data. Please download the original image data and organize them into ``DATASET/Objects365/images.tsv`` and ``DATASET/Objects365/images.lineidx``.
#### ``Flickr30K``
Download the Flickr30K images from [Link](http://shannon.cs.illinois.edu/DenotationGraph/) and put them under ``DATASET/flickr30k/flickr30k_images/``. Download the [MDETR annotations](https://zenodo.org/record/4729015/files/mdetr_annotations.tar.gz?download=1) and put them under ``DATASET/mdetr_annotations/``. The dataset structure should look like:
DATASET/flickr30k/flickr30k_images/
DATASET/mdetr_annotations/final_flickr_separateGT_*
#### ``MixedGrounding``
This is the grounding dataset curated by [MDETR](https://github.com/ashkamath/mdetr/blob/main/.github/pretrain.md).
Please prepare the COCO train2014 data and put them under ``DATASET/coco/train2014``.
Prepare the [GQA images](https://nlp.stanford.edu/data/gqa/images.zip) and put them under ``DATASET/gqa/images/``.
Then download the annotation files. The original MDETR annotation file contains COCO images; we provide a version without COCO images: ``wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/mdetr_annotations/final_mixed_train_no_coco.json -O DATASET/mdetr_annotations/final_mixed_train_no_coco.json``.
The dataset structure should look like:
"DATASET/coco/train2014"
"DATASET/gqa/images"
"DATASET/mdetr_annotations/final_mixed_train_no_coco.json",
#### ``GCC``
Goolge conceptual captions with pseudo-grounding annotations.
To be released in the next update.
================================================
FILE: GLIP/LICENSE
================================================
MIT License
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
================================================
FILE: GLIP/README.md
================================================
# GLIP: Grounded Language-Image Pre-training
## Updates
04/30/2022: Updated [Demo](https://colab.research.google.com/drive/12x7v-_miN7-SRiziK3Cx4ffJzstBJNqb?usp=sharing)!
04/14/2022: GLIP has been accepted to CVPR 2022 as an oral presentation! First version of code and pre-trained models are released!
12/06/2021: GLIP paper on arxiv https://arxiv.org/abs/2112.03857.
11/23/2021: Project page built.
## Introduction
This repository is the project page for [GLIP](https://arxiv.org/abs/2112.03857). GLIP demonstrate strong zero-shot and few-shot transferability to various object-level recognition tasks.
1. When directly evaluated on COCO and LVIS (without seeing any images in COCO), GLIP achieves 49.8 AP and 26.9 AP, respectively, surpassing many supervised baselines.
2. After fine-tuned on COCO, GLIP achieves 60.8 AP on val and 61.5 AP on test-dev, surpassing prior SoTA.
3. When transferred to 13 downstream object detection tasks, a few-shot GLIP rivals with a fully-supervised Dynamic Head.
We provide code for:
1. **pre-training** GLIP on detection and grounding data;
2. **zero-shot evaluating** GLIP on standard benchmarks (COCO, LVIS, Flickr30K) and custom COCO-formated datasets;
3. **fine-tuning** GLIP on standard benchmarks (COCO) and custom COCO-formated datasets;
4. **a Colab demo**.
Please see respective sections for instructions.
## Demo
Please see a Colab demo at [link](https://colab.research.google.com/drive/12x7v-_miN7-SRiziK3Cx4ffJzstBJNqb?usp=sharing)!
## Installation and Setup
***Environment***
This repo requires Pytorch>=1.9 and torchvision. We recommand using docker to setup the environment. You can use this pre-built docker image ``docker pull pengchuanzhang/maskrcnn:ubuntu18-py3.7-cuda10.2-pytorch1.9`` or this one ``docker pull pengchuanzhang/pytorch:ubuntu20.04_torch1.9-cuda11.3-nccl2.9.9`` depending on your GPU.
Then install the following packages:
```
pip install einops shapely timm yacs tensorboardX ftfy prettytable pymongo
pip install transformers
python setup.py build develop --user
```
***Backbone Checkpoints.*** Download the ImageNet pre-trained backbone checkpoints into the ``MODEL`` folder.
```
mkdir MODEL
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/swin_tiny_patch4_window7_224.pth -O MODEL/swin_tiny_patch4_window7_224.pth
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/swin_large_patch4_window12_384_22k.pth -O MODEL/swin_large_patch4_window12_384_22k.pth
```
## Model Zoo
Model | COCO [1] | LVIS [2] | LVIS [3] | ODinW [4] | Pre-Train Data | Config | Weight
-- | -- | -- | -- | -- | -- | -- | --
GLIP-T (C) | 46.7 / 55.1 | 14.3 | [17.7](https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_tiny_model_o365_goldg_lvisbest.pth) | 44.4 | Objects365,GoldG | [config](configs/pretrain/glip_Swin_T_O365_GoldG.yaml) | [weight](https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_tiny_model_o365_goldg.pth)
GLIP-T [5] | 46.6 / 55.2 | 17.6 | [20.1](https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_tiny_model_o365_goldg_cc_sbu_lvisbest.pth) | 42.7 | Objects365,GoldG,CC3M,SBU | [config](configs/pretrain/glip_Swin_T_O365_GoldG.yaml) [6] | [weight](https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_tiny_model_o365_goldg_cc_sbu.pth)
GLIP-L [7] | 51.4 / 61.7 [8] | 29.3 | [30.1](https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_large_model_lvisbest.pth) | 51.2 | FourODs,GoldG,CC3M+12M,SBU | [config](configs/pretrain/glip_Swin_L.yaml) [9] | [weight](https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_large_model.pth)
[1] Zero-shot and fine-tuning performance on COCO val2017.
[2] Zero-shot performance on LVIS minival (APr) with the last pre-trained checkpoint.
[3] On LVIS, the model could overfit slightly during the pre-training course. Thus we reported two numbers on LVIS: the performance of the last checkpoint (LVIS[2]) and the performance of the best checkpoint during the pre-training course (LVIS[3]).
[4] Zero-shot performance on the 13 ODinW datasets.
[5] GLIP-T released in this repo is pre-trained on Conceptual Captions 3M and SBU captions. It is referred in paper in Table 1 and in Appendix C.3. It differs slightly from the GLIP-T in the main paper in terms of downstream performance. We will release the pre-training support for using CC3M and SBU captions data in the next update.
[6] This config is only intended for zero-shot evaluation and fine-tuning. Pre-training config with support for using CC3M and SBU captions data will be updated.
[7] GLIP-L released in this repo is pre-trained on Conceptual Captions 3M+12M and SBU captions. It slightly outperforms the GLIP-L in the main paper because the model used to annotate the caption data are improved compared to the main paper. We will release the pre-training support for using CC3M+12M and SBU captions data in the next update.
[8] Multi-scale testing used.
[9] This config is only intended for zero-shot evaluation and fine-tuning. Pre-training config with support for using CC3M+12M and SBU captions data to be updated.
## Pre-Training
***Required Data.*** Prepare ``Objects365``, ``Flickr30K``, and ``MixedGrounding`` data as in [DATA.md](DATA.md). Support for training using caption data (Conceptual Captions and SBU captions) will be released soon.
***Command.***
Perform pre-training with the following command (please change the config-file accordingly; checkout model zoo for the corresponding config; change the ``{output_dir}`` to your desired output directory):
```
python -m torch.distributed.launch --nnodes 2 --nproc_per_node=16 tools/train_net.py \
--config-file configs/pretrain/glip_Swin_T_O365_GoldG.yaml \
--skip-test --use-tensorboard --override_output_dir {output_dir}
```
For training GLIP-T models, we used `nnodes = 2`, `nproc_per_node=16` on 32GB V100 machines. For training GLIP-L models, we used `nnodes = 4`, `nproc_per_node=16` on 32GB V100 machines. Please setup the environment accordingly based on your local machine.
## (Zero-Shot) Evaluation
### COCO Evaluation
Prepare ``COCO/val2017`` data as in [DATA.md](DATA.md). Set ``{config_file}``, ``{model_checkpoint}`` according to the ``Model Zoo``; set ``{output_dir}`` to a folder where the evaluation results will be stored.
```
python tools/test_grounding_net.py --config-file {config_file} --weight {model_checkpoint} \
TEST.IMS_PER_BATCH 1 \
MODEL.DYHEAD.SCORE_AGG "MEAN" \
TEST.EVAL_TASK detection \
MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS False \
OUTPUT_DIR {output_dir}
```
### LVIS Evaluation
We follow MDETR to evaluate with the [FixedAP](https://arxiv.org/pdf/2102.01066.pdf) criterion. Set ``{config_file}``, ``{model_checkpoint}`` according to the ``Model Zoo``. Prepare ``COCO/val2017`` data as in [DATA.md](DATA.md).
```
python -m torch.distributed.launch --nproc_per_node=4 \
tools/test_grounding_net.py \
--config-file {config_file} \
--task_config configs/lvis/minival.yaml \
--weight {model_checkpoint} \
TEST.EVAL_TASK detection OUTPUT_DIR {output_dir}
TEST.CHUNKED_EVALUATION 40 TEST.IMS_PER_BATCH 4 SOLVER.IMS_PER_BATCH 4 TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 3000 MODEL.RETINANET.DETECTIONS_PER_IMG 300 MODEL.FCOS.DETECTIONS_PER_IMG 300 MODEL.ATSS.DETECTIONS_PER_IMG 300 MODEL.ROI_HEADS.DETECTIONS_PER_IMG 300
```
If you wish to evaluate on Val 1.0, set ``--task_config`` to ``configs/lvis/val.yaml``.
### ODinW / Custom Dataset Evaluation
GLIP supports easy evaluation on a custom dataset. Currently, the code supports evaluation on [COCO-formatted](https://cocodataset.org/#format-data) dataset.
We will use the [Aquarium](https://public.roboflow.com/object-detection/aquarium) dataset from ODinW as an example to show how to evaluate on a custom COCO-formatted dataset.
1. Download the raw dataset from RoboFlow in the COCO format into ``DATASET/odinw/Aquarium``. Each train/val/test split has a corresponding ``annotation`` file and a ``image`` folder.
2. Remove the background class from the annotation file. This can be as simple as open "_annotations.coco.json" and remove the entry with "id:0" from "categories". For convenience, we provide the modified annotation files for Aquarium:
```
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/odinw/Aquarium/Aquarium%20Combined.v2-raw-1024.coco/test/annotations_without_background.json -O DATASET/odinw/Aquarium/Aquarium\ Combined.v2-raw-1024.coco/test/annotations_without_background.json
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/odinw/Aquarium/Aquarium%20Combined.v2-raw-1024.coco/train/annotations_without_background.json -O DATASET/odinw/Aquarium/Aquarium\ Combined.v2-raw-1024.coco/train/annotations_without_background.json
wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/odinw/Aquarium/Aquarium%20Combined.v2-raw-1024.coco/valid/annotations_without_background.json -O DATASET/odinw/Aquarium/Aquarium\ Combined.v2-raw-1024.coco/valid/annotations_without_background.json
```
4. Then create a yaml file as in ``configs/odinw/Aquarium_Aquarium_Combined.v2-raw-1024.coco.yaml``. A few fields to be noted in the yamls:
DATASET.CAPTION_PROMPT allows manually changing the prompt (the default prompt is simply concatnating all the categories);
MODELS.\*.NUM_CLASSES need to be set to the number of categories in the dataset (including the background class). E.g., Aquarium has 7 non-background categories thus MODELS.\*.NUM_CLASSES is set to 8;
4. Run the following command to evaluate on the dataset. Set ``{config_file}``, ``{model_checkpoint}`` according to the ``Model Zoo``. Set {odinw_configs} to the path of the task yaml file we just prepared.
```
python tools/test_grounding_net.py --config-file {config_file} --weight {model_checkpoint} \
--task_config {odinw_configs} \
TEST.IMS_PER_BATCH 1 SOLVER.IMS_PER_BATCH 1 \
TEST.EVAL_TASK detection \
DATASETS.TRAIN_DATASETNAME_SUFFIX _grounding \
DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE False \
DATASETS.USE_OVERRIDE_CATEGORY True \
DATASETS.USE_CAPTION_PROMPT True
```
### Flickr30K Evaluation
Prepare ``Flickr30K`` data as in [DATA.md](DATA.md). Set ``{config_file}``, ``{model_checkpoint}`` according to the ``Model Zoo``.
```
python tools/test_grounding_net.py \
--config-file {config_file} \
--task_config configs/flickr/test.yaml,configs/flickr/val.yaml \
--weight {model_checkpoint} \
OUTPUT_DIR {output_dir} TEST.IMS_PER_BATCH 1 SOLVER.IMS_PER_BATCH 1 TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100 TEST.EVAL_TASK grounding MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS False
```
## Fine-Tuning
### COCO Fine-Tuning
Prepare the ``COCO`` data as in [DATA.md](DATA.md). Set ``{config_file}``, ``{model_checkpoint}`` according to the ``Model Zoo``.
Below is the fine-tuning script for tuning the Tiny models:
```
python -m torch.distributed.launch --nproc_per_node=16 tools/train_net.py \
--config-file {config_file} \
--skip-test \
MODEL.WEIGHT {model_checkpoint} \
DATASETS.TRAIN '("coco_grounding_train", )' \
MODEL.BACKBONE.FREEZE_CONV_BODY_AT -1 SOLVER.IMS_PER_BATCH 32 SOLVER.USE_AMP True SOLVER.MAX_EPOCH 24 TEST.DURING_TRAINING False TEST.IMS_PER_BATCH 16 SOLVER.FIND_UNUSED_PARAMETERS False SOLVER.BASE_LR 0.00001 SOLVER.LANG_LR 0.00001 SOLVER.STEPS \(0.67,0.89\) DATASETS.DISABLE_SHUFFLE True MODEL.DYHEAD.SCORE_AGG "MEAN" TEST.EVAL_TASK detection
```
For evaluation, please follow the instructions in ``COCO Evaluation``. Scripts for tuning the Large model will be released soon.
### ODinW / Custom Dataset Fine-Tuning
Prepare the dataset as in ``ODinW / Custom Dataset Evaluation``.
#### Full Model Fine-Tuning
For tuning with 1/3/5/10-shot, set {custom_shot_and_epoch_and_general_copy} to "1_200_8", "3_200_4", "5_200_2", "10_200_1", respectively.
For tuning with all the data, set {custom_shot_and_epoch_and_general_copy} to "0_200_1"; set SOLVER.STEP_PATIENCE to 2; set SOLVER.AUTO_TERMINATE_PATIENCE to 4.
```
python -m torch.distributed.launch --nproc_per_node=4 tools/finetune.py \
--config-file {config_file} --ft-tasks {configs} --skip-test \
--custom_shot_and_epoch_and_general_copy {custom_shot_and_epoch_and_general_copy} \
--evaluate_only_best_on_test --push_both_val_and_test \
MODEL.WEIGHT {model_checkpoint} \
SOLVER.USE_AMP True TEST.DURING_TRAINING True TEST.IMS_PER_BATCH 4 SOLVER.IMS_PER_BATCH 4 SOLVER.WEIGHT_DECAY 0.05 TEST.EVAL_TASK detection DATASETS.TRAIN_DATASETNAME_SUFFIX _grounding MODEL.BACKBONE.FREEZE_CONV_BODY_AT 2 MODEL.DYHEAD.USE_CHECKPOINT True SOLVER.FIND_UNUSED_PARAMETERS False SOLVER.TEST_WITH_INFERENCE True SOLVER.USE_AUTOSTEP True DATASETS.USE_OVERRIDE_CATEGORY True SOLVER.SEED 10 DATASETS.SHUFFLE_SEED 3 DATASETS.USE_CAPTION_PROMPT True DATASETS.DISABLE_SHUFFLE True \
SOLVER.STEP_PATIENCE 3 SOLVER.CHECKPOINT_PER_EPOCH 1.0 SOLVER.AUTO_TERMINATE_PATIENCE 8 SOLVER.MODEL_EMA 0.0 SOLVER.TUNING_HIGHLEVEL_OVERRIDE full
```
#### Prompt Tuning
Follow the command as in ``Full Model Fine-Tuning``. But set the following hyper-parameters:
```
SOLVER.WEIGHT_DECAY 0.25 \
SOLVER.BASE_LR 0.05 \
SOLVER.TUNING_HIGHLEVEL_OVERRIDE language_prompt_v2
```
## Citations
Please consider citing this paper if you use the code:
```
@inproceedings{li2021grounded,
title={Grounded Language-Image Pre-training},
author={Liunian Harold Li* and Pengchuan Zhang* and Haotian Zhang* and Jianwei Yang and Chunyuan Li and Yiwu Zhong and Lijuan Wang and Lu Yuan and Lei Zhang and Jenq-Neng Hwang and Kai-Wei Chang and Jianfeng Gao},
year={2022},
booktitle={CVPR},
}
```
================================================
FILE: GLIP/SECURITY.md
================================================
## Security
Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
## Reporting Security Issues
**Please do not report security vulnerabilities through public GitHub issues.**
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue
This information will help us triage your report more quickly.
If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
## Preferred Languages
We prefer all communications to be in English.
## Policy
Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
================================================
FILE: GLIP/SUPPORT.md
================================================
# TODO: The maintainer of this repo has not yet edited this file
**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
- **No CSS support:** Fill out this template with information about how to file issues and get help.
- **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport).
- **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide.
*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
# Support
## How to file issues and get help
This project uses GitHub Issues to track bugs and feature requests. Please search the existing
issues before filing new issues to avoid duplicates. For new issues, file your bug or
feature request as a new Issue.
For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
## Microsoft Support Policy
Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
================================================
FILE: GLIP/configs/flickr/test.yaml
================================================
MODEL:
ATSS:
NUM_CLASSES: 8 # Placeholder
FCOS:
NUM_CLASSES: 8 # Placeholder
ROI_BOX_HEAD:
NUM_CLASSES: 8 # Placeholder
DYHEAD:
NUM_CLASSES: 8 # Placeholder
DATASETS:
TRAIN: ("flickr30k_test", )
TEST: ("flickr30k_test", )
FLICKR_GT_TYPE: "separate"
INPUT:
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
SIZE_DIVISIBILITY: 32
ASPECT_RATIO_GROUPING: False
================================================
FILE: GLIP/configs/flickr/val.yaml
================================================
MODEL:
ATSS:
NUM_CLASSES: 8 # Placeholder
FCOS:
NUM_CLASSES: 8 # Placeholder
ROI_BOX_HEAD:
NUM_CLASSES: 8 # Placeholder
DYHEAD:
NUM_CLASSES: 8 # Placeholder
DATASETS:
TRAIN: ("flickr30k_val", )
TEST: ("flickr30k_val", )
FLICKR_GT_TYPE: "separate"
INPUT:
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
SIZE_DIVISIBILITY: 32
ASPECT_RATIO_GROUPING: False
SOLVER:
WARMUP_ITERS: 0
MAX_EPOCH: 12
CHECKPOINT_PERIOD: 100
TEST:
IMS_PER_BATCH: 8
================================================
FILE: GLIP/configs/lvis/minival.yaml
================================================
MODEL:
ATSS:
NUM_CLASSES: 8 # these fields are not used; just a placeholder
FCOS:
NUM_CLASSES: 8
ROI_BOX_HEAD:
NUM_CLASSES: 8
DYHEAD:
NUM_CLASSES: 8
DATASETS:
REGISTER:
lvis_evaluation_mini_val:
img_dir: "coco"
ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json"
lvis_evaluation_val:
img_dir: "coco"
ann_file: "coco/annotations/lvis_od_val.json"
TRAIN: ("lvis_evaluation_mini_val",)
TEST: ("lvis_evaluation_mini_val",)
INPUT:
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
SIZE_DIVISIBILITY: 32
ASPECT_RATIO_GROUPING: False
TEST:
IMS_PER_BATCH: 8
================================================
FILE: GLIP/configs/lvis/val.yaml
================================================
MODEL:
ATSS:
NUM_CLASSES: 8 # these fields are not used; just a placeholder
FCOS:
NUM_CLASSES: 8
ROI_BOX_HEAD:
NUM_CLASSES: 8
DYHEAD:
NUM_CLASSES: 8
DATASETS:
REGISTER:
lvis_evaluation_mini_val:
img_dir: "coco"
ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json"
lvis_evaluation_val:
img_dir: "coco"
ann_file: "coco/annotations/lvis_od_val.json"
TRAIN: ("lvis_evaluation_val",)
TEST: ("lvis_evaluation_val",)
INPUT:
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
SIZE_DIVISIBILITY: 32
ASPECT_RATIO_GROUPING: False
TEST:
IMS_PER_BATCH: 8
================================================
FILE: GLIP/configs/odinw/Aquarium_Aquarium_Combined.v2-raw-1024.coco.yaml
================================================
DATALOADER:
ASPECT_RATIO_GROUPING: false
SIZE_DIVISIBILITY: 32
DATASETS:
GENERAL_COPY: 16
CAPTION_PROMPT: '[{"prefix": " ", "name": "fish", "suffix": ""}, {"prefix": "", "name": "jellyfish", "suffix": ""}, {"prefix": "", "name": "penguin", "suffix": " , which is black and white"}, {"prefix": "", "name": "puffin", "suffix": " with orange beaks "}, {"prefix": "", "name": "shark", "suffix": ""}, {"prefix": "", "name": "starfish", "suffix": ""}, {"prefix": "", "name": "stingray", "suffix": " which is flat and round"}, ]'
REGISTER:
test:
ann_file: odinw/Aquarium/Aquarium Combined.v2-raw-1024.coco/test/annotations_without_background.json
img_dir: odinw/Aquarium/Aquarium Combined.v2-raw-1024.coco/test
train:
ann_file: odinw/Aquarium/Aquarium Combined.v2-raw-1024.coco/train/annotations_without_background.json
img_dir: odinw/Aquarium/Aquarium Combined.v2-raw-1024.coco/train
val:
ann_file: odinw/Aquarium/Aquarium Combined.v2-raw-1024.coco/valid/annotations_without_background.json
img_dir: odinw/Aquarium/Aquarium Combined.v2-raw-1024.coco/valid
TEST: ("val",)
TRAIN: ("train",)
INPUT:
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN: 800
MODEL:
ATSS:
NUM_CLASSES: 8
DYHEAD:
NUM_CLASSES: 8
FCOS:
NUM_CLASSES: 8
ROI_BOX_HEAD:
NUM_CLASSES: 8
SOLVER:
CHECKPOINT_PERIOD: 100
MAX_EPOCH: 12
WARMUP_ITERS: 0
TEST:
IMS_PER_BATCH: 8
================================================
FILE: GLIP/configs/pretrain/glip_Swin_L.yaml
================================================
MODEL:
META_ARCHITECTURE: "GeneralizedVLRCNN"
WEIGHT: "swin_large_patch4_window12_384_22k.pth"
RPN_ONLY: True
RPN_ARCHITECTURE: "VLDYHEAD"
BACKBONE:
CONV_BODY: "SWINT-FPN-RETINANET"
OUT_CHANNELS: 256
SWINT:
EMBED_DIM: 192
DEPTHS: (2, 2, 18, 2)
NUM_HEADS: (6, 12, 24, 48)
WINDOW_SIZE: 12
OUT_CHANNELS: (192, 384, 768, 1536)
DROP_PATH_RATE: 0.4
LANGUAGE_BACKBONE:
FREEZE: False
MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
MASK_SPECIAL: False
RPN:
USE_FPN: True
ANCHOR_SIZES: (64, 128, 256, 512, 1024)
ANCHOR_STRIDE: (8, 16, 32, 64, 128)
ASPECT_RATIOS: (1.0,)
SCALES_PER_OCTAVE: 1
DYHEAD:
CHANNELS: 256
NUM_CONVS: 8
USE_GN: True
USE_DYRELU: True
USE_DFCONV: True
USE_DYFUSE: True
TOPK: 9 # topk for selecting candidate positive samples from each level
SCORE_AGG: "MEAN"
LOG_SCALE: 0.0
USE_CHECKPOINT: True
FUSE_CONFIG:
USE_FUSED_FEATURES_DOT_PRODUCT: True
EARLY_FUSE_ON: True
TYPE: "MHA-B"
USE_CLASSIFICATION_LOSS: False
USE_TOKEN_LOSS: False
USE_CONTRASTIVE_ALIGN_LOSS: False
CONTRASTIVE_HIDDEN_DIM: 64
USE_DOT_PRODUCT_TOKEN_LOSS: True
USE_LAYER_SCALE: True
CLAMP_MIN_FOR_UNDERFLOW: True
CLAMP_MAX_FOR_OVERFLOW: True
CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
CLAMP_DOT_PRODUCT: True
DATASETS:
TRAIN: ("mixed_train_no_coco",) # Place holder dataset for now. To be updated in the next version
TEST: ("coco_2017_val", )
ONE_HOT: False
FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
VG_COPY: 3 # 0.4 * 3 = ~1.2M
IN_COPY: 2 # 0.67 * 2 = ~1.33M
OI_COPY: 1 # 2M * 1 = 2M
DISABLE_SHUFFLE: False
ADD_DET_PROMPT: False
RANDOM_SAMPLE_NEG: 85
CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
FURTHER_SCREEN: True
CAPTION_CONF: 0.5
CAPTION_NMS: -1.0
CAPTION_MIN_BOX: 1
SEPARATION_TOKENS: ". "
PACK_RANDOM_CAPTION_NUMBER: 20
NO_RANDOM_PACK_PROBABILITY: 0.4
RANDOM_PACK_PROB: 0.5
CAPTION_FORMAT_VERSION: "v2"
INPUT:
PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
PIXEL_STD: [ 57.375, 57.120, 58.395 ]
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
AUGMENT:
MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
DATALOADER:
SIZE_DIVISIBILITY: 32
SOLVER:
OPTIMIZER: ADAMW
BASE_LR: 0.0001
LANG_LR: 0.00001
WEIGHT_DECAY: 0.01
WEIGHT_DECAY_SCHEDULE: True
STEPS: (0.67, 0.89)
MAX_ITER: 1000000
IMS_PER_BATCH: 64
WARMUP_ITERS: 2000
WARMUP_FACTOR: 0.001
FIND_UNUSED_PARAMETERS: False
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
================================================
FILE: GLIP/configs/pretrain/glip_Swin_T_O365_GoldG.yaml
================================================
MODEL:
META_ARCHITECTURE: "GeneralizedVLRCNN"
WEIGHT: "swin_tiny_patch4_window7_224.pth"
RPN_ONLY: True
RPN_ARCHITECTURE: "VLDYHEAD"
BACKBONE:
CONV_BODY: "SWINT-FPN-RETINANET"
OUT_CHANNELS: 256
FREEZE_CONV_BODY_AT: -1
LANGUAGE_BACKBONE:
FREEZE: False
MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
MASK_SPECIAL: False
RPN:
USE_FPN: True
ANCHOR_SIZES: (64, 128, 256, 512, 1024)
ANCHOR_STRIDE: (8, 16, 32, 64, 128)
ASPECT_RATIOS: (1.0,)
SCALES_PER_OCTAVE: 1
DYHEAD:
CHANNELS: 256
NUM_CONVS: 6
USE_GN: True
USE_DYRELU: True
USE_DFCONV: True
USE_DYFUSE: True
TOPK: 9 # topk for selecting candidate positive samples from each level
SCORE_AGG: "MEAN"
LOG_SCALE: 0.0
FUSE_CONFIG:
EARLY_FUSE_ON: True
TYPE: "MHA-B"
USE_CLASSIFICATION_LOSS: False
USE_TOKEN_LOSS: False
USE_CONTRASTIVE_ALIGN_LOSS: False
CONTRASTIVE_HIDDEN_DIM: 64
USE_DOT_PRODUCT_TOKEN_LOSS: True
USE_FUSED_FEATURES_DOT_PRODUCT: True
USE_LAYER_SCALE: True
CLAMP_MIN_FOR_UNDERFLOW: True
CLAMP_MAX_FOR_OVERFLOW: True
CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
CLAMP_DOT_PRODUCT: True
USE_CHECKPOINT: True
TEST:
DURING_TRAINING: False
IMS_PER_BATCH: 64
# use for grounding model
DATASETS:
TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
TEST: ("coco_2017_val", )
DISABLE_SHUFFLE: False
ADD_DET_PROMPT: False
RANDOM_SAMPLE_NEG: 85
CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
SEPARATION_TOKENS: ". "
INPUT:
PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
PIXEL_STD: [ 57.375, 57.120, 58.395 ]
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
AUGMENT:
MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
DATALOADER:
SIZE_DIVISIBILITY: 32
SOLVER:
OPTIMIZER: ADAMW
BASE_LR: 0.0001
LANG_LR: 0.00001
WEIGHT_DECAY: 0.0001
STEPS: (0.67, 0.89)
MAX_EPOCH: 30
IMS_PER_BATCH: 64
WARMUP_ITERS: 2000
WARMUP_FACTOR: 0.001
USE_AMP: True
MODEL_EMA: 0.999
FIND_UNUSED_PARAMETERS: False
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
================================================
FILE: GLIP/maskrcnn_benchmark/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
================================================
FILE: GLIP/maskrcnn_benchmark/config/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from .defaults import _C as cfg
from .paths_catalog import try_to_find
================================================
FILE: GLIP/maskrcnn_benchmark/config/defaults.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import os
from yacs.config import CfgNode as CN
# -----------------------------------------------------------------------------
# Convention about Training / Test specific parameters
# -----------------------------------------------------------------------------
# Whenever an argument can be either used for training or for testing, the
# corresponding name will be post-fixed by a _TRAIN for a training parameter,
# or _TEST for a test-specific parameter.
# For example, the number of images during training will be
# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
# IMAGES_PER_BATCH_TEST
# -----------------------------------------------------------------------------
# Config definition
# -----------------------------------------------------------------------------
_C = CN()
_C.MODEL = CN()
_C.MODEL.RPN_ONLY = False
_C.MODEL.BOX_ON = True
_C.MODEL.MASK_ON = False
_C.MODEL.KEYPOINT_ON = False
_C.MODEL.DEVICE = "cuda"
_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
_C.MODEL.RPN_ARCHITECTURE = "RPN"
_C.MODEL.DEBUG = False # add debug flag
_C.MODEL.ONNX = False # add onnx flag
# If the WEIGHT starts with a catalog://, like :R-50, the code will look for
# the path in paths_catalog. Else, it will use it as the specified absolute
# path
_C.MODEL.WEIGHT = ""
_C.MODEL.PRETRAIN_NAME = ""
# If LINEAR_PROB = True, only the last linear layers in rpn and roi_head are trainable
_C.MODEL.LINEAR_PROB = False
# -----------------------------------------------------------------------------
# Multitask Training / Test specific parameters
# -----------------------------------------------------------------------------
_C.MODEL.MULTITASK = CN(new_allowed=True)
# -----------------------------------------------------------------------------
# INPUT
# -----------------------------------------------------------------------------
_C.INPUT = CN()
# Size of the smallest side of the image during training
_C.INPUT.MIN_SIZE_TRAIN = 800 # (800,)
# Maximum size of the side of the image during training
_C.INPUT.MAX_SIZE_TRAIN = 1333
# Size of the smallest side of the image during testing
_C.INPUT.MIN_SIZE_TEST = 800
# Maximum size of the side of the image during testing
_C.INPUT.MAX_SIZE_TEST = 1333
# Values to be used for image normalization
_C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
# Values to be used for image normalization
_C.INPUT.PIXEL_STD = [1., 1., 1.]
# Convert image to BGR format (for Caffe2 models), in range 0-255
_C.INPUT.TO_BGR255 = True
_C.INPUT.FORMAT = ''
_C.INPUT.FIX_RES = False
# -----------------------------------------------------------------------------
# Augmentation
# -----------------------------------------------------------------------------
_C.AUGMENT = CN()
_C.AUGMENT.USE_RA = 0
_C.AUGMENT.FLIP_PROB_TRAIN = 0.5
_C.AUGMENT.VERTICAL_FLIP_PROB_TRAIN = 0.0
_C.AUGMENT.MULT_MIN_SIZE_TRAIN = ()
_C.AUGMENT.BRIGHTNESS = 0.0
_C.AUGMENT.CONTRAST = 0.0
_C.AUGMENT.SATURATION = 0.0
_C.AUGMENT.HUE = 0.0
_C.AUGMENT.CROP_PROB = 0.5
_C.AUGMENT.CROP_MIN_IOUS = (0.1, 0.3, 0.5, 0.7, 0.9)
_C.AUGMENT.CROP_MIN_SIZE = 0.3
# -----------------------------------------------------------------------------
# Dataset
# -----------------------------------------------------------------------------
_C.DATASETS = CN()
# List of the dataset names for training, as present in paths_catalog.py
_C.DATASETS.TRAIN = ()
# List of the dataset names for testing, as present in paths_catalog.py
_C.DATASETS.TEST = ()
# Use is_crowd label
_C.DATASETS.USE_CROWD = False
_C.DATASETS.CLASS_AGNOSTIC = False
_C.DATASETS.CLASS_CONCAT = False
_C.DATASETS.MAX_BOX = -1
_C.DATASETS.SAMPLE_RATIO = 0.0
_C.DATASETS.FEW_SHOT = 0
# SHUFFLE_SEED != 0 means shuffle the dataset in the few shot setting
_C.DATASETS.SHUFFLE_SEED = 0
_C.DATASETS.PREDEFINED_TEXT = ''
_C.DATASETS.ALTERNATIVE_TRAINING = False
_C.DATASETS.MULTISTAGE_TRAINING = False
_C.DATASETS.REGISTER = CN(new_allowed=True)
_C.DATASETS.BOX_THRESHOLD = 0.1
# Duplicate Dataset
_C.DATASETS.COCO_COPY = 1
_C.DATASETS.LVIS_COPY = 1
_C.DATASETS.FLICKR_COPY = 1
_C.DATASETS.MIXED_COPY = 1
_C.DATASETS.OBJECT365_COPY = 1
_C.DATASETS.VG_COPY = 1
_C.DATASETS.OI_COPY = 1
_C.DATASETS.IN_COPY = 1
# Duplicate Dataset
_C.DATASETS.COCO_COPY = 1
_C.DATASETS.FLICKR_COPY = 1
_C.DATASETS.MIXED_COPY = 1
_C.DATASETS.OBJECT365_COPY = 1
_C.DATASETS.VG_COPY = 1
_C.DATASETS.OI_COPY = 1
_C.DATASETS.IN_COPY = 1
_C.DATASETS.GENERAL_COPY = -1
_C.DATASETS.GENERAL_COPY_TEST = -1
# OD to Grounding
_C.DATASETS.RANDOM_SAMPLE_NEG = -1
_C.DATASETS.ADD_DET_PROMPT = False
_C.DATASETS.ADD_DET_PROMPT_ADVANCED = False
_C.DATASETS.USE_OD_AUG = False
_C.DATASETS.USE_COCO_FORMAT = False
_C.DATASETS.CONTROL_PROB = ()
_C.DATASETS.DISABLE_SHUFFLE = False
_C.DATASETS.PROMPT_VERSION = ""
_C.DATASETS.PROMPT_LIMIT_NEG = -1
_C.DATASETS.POS_QUESTION_PROB = 0.6
_C.DATASETS.NEG_QUESTION_PROB = 0.8
_C.DATASETS.FULL_QUESTION_PROB = 0.5
_C.DATASETS.ONE_HOT = False
_C.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT = False
_C.DATASETS.DISABLE_CLIP_TO_IMAGE = False
_C.DATASETS.SEPARATION_TOKENS = " "
# LVIS
_C.DATASETS.LVIS_USE_NORMAL_AP = False
_C.DATASETS.SPECIAL_SAFEGUARD_FOR_COCO_GROUNDING = False
# Caption
_C.DATASETS.BING_INDEX_LIST = []
_C.DATASETS.CAPTION_MIN_BOX = 1
_C.DATASETS.REPLACE_CLEAN_LABEL = False
_C.DATASETS.FURTHER_SCREEN = False
_C.DATASETS.CAPTION_CONF = 0.9
_C.DATASETS.CAPTION_NMS = 0.9
_C.DATASETS.PACK_RANDOM_CAPTION_NUMBER = 0
_C.DATASETS.INFERENCE_CAPTION = False
_C.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA = -1.0
_C.DATASETS.RANDOM_PACK_PROB = -1.0
_C.DATASETS.NO_RANDOM_PACK_PROBABILITY = 0.0
_C.DATASETS.SAFEGUARD_POSITIVE_CAPTION = True
_C.DATASETS.CAPTION_FORMAT_VERSION = "v1"
_C.DATASETS.LOCAL_DEBUG = False
# Od in the wild
_C.DATASETS.PREDEFINED_TEXT = None
_C.DATASETS.TRAIN_DATASETNAME_SUFFIX = ""
_C.DATASETS.TEST_DATASETNAME_SUFFIX = ""
_C.DATASETS.OVERRIDE_CATEGORY = None
_C.DATASETS.USE_OVERRIDE_CATEGORY = False
_C.DATASETS.SUPRESS_QUERY = None
_C.DATASETS.USE_SUPRESS_QUERY = False
_C.DATASETS.USE_CAPTION_PROMPT = False
_C.DATASETS.CAPTION_PROMPT = None
_C.DATASETS.FLICKR_GT_TYPE = "separate"
# VQA
_C.DATASETS.DIVER_BOX_FOR_VQA = False
# -----------------------------------------------------------------------------
# DataLoader
# -----------------------------------------------------------------------------
_C.DATALOADER = CN()
# Number of data loading threads
_C.DATALOADER.NUM_WORKERS = 4
# If > 0, this enforces that each collated batch should have a size divisible
# by SIZE_DIVISIBILITY
_C.DATALOADER.SIZE_DIVISIBILITY = 0
# If True, each batch should contain only images for which the aspect ratio
# is compatible. This groups portrait images together, and landscape images
# are not batched with portrait images.
_C.DATALOADER.ASPECT_RATIO_GROUPING = True
# Define min number of keypoints required from GT, for example 10 out of 17
_C.DATALOADER.MIN_KPS_PER_IMS = 0
# Use random sampler during training
_C.DATALOADER.USE_RANDOM_SEED = False
_C.DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE = False
# ---------------------------------------------------------------------------- #
# Backbone options
# ---------------------------------------------------------------------------- #
_C.MODEL.BACKBONE = CN()
# The backbone conv body to use
# The string must match a function that is imported in modeling.model_builder
# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN
# backbone)
_C.MODEL.BACKBONE.CONV_BODY = "R-50-C4"
# Add StopGrad at a specified stage so the bottom layers are frozen
_C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2
_C.MODEL.BACKBONE.FREEZE = False
_C.MODEL.BACKBONE.GROUP = 1
_C.MODEL.BACKBONE.OUT_CHANNELS = 256 * 4
# Option to reset bn running statics
_C.MODEL.BACKBONE.RESET_BN = False
# Backbone Normalization Level
_C.MODEL.BACKBONE.NORM_LEVEL = 3
# BN for backbone
_C.MODEL.BACKBONE.USE_BN = False
# Sync BN for backbone
_C.MODEL.BACKBONE.USE_SYNCBN = False
_C.MODEL.BACKBONE.USE_NSYNCBN = False
# GN for backbone
_C.MODEL.BACKBONE.USE_GN = False
# Evo Norm for backbone
_C.MODEL.BACKBONE.USE_EN = False
# Layers for backbone
_C.MODEL.BACKBONE.USE_DFCONV = False
_C.MODEL.BACKBONE.USE_DYRELU = False
_C.MODEL.BACKBONE.USE_SE = False
_C.MODEL.BACKBONE.LAYER_SETUP = (3, 4, 6, 3)
_C.MODEL.BACKBONE.LAYER_SEARCH = CN(new_allowed=True)
_C.MODEL.BACKBONE.OUT_FEATURES = ("stage2", "stage3", "stage4", "stage5")
_C.MODEL.BACKBONE.FPN_LAYER = ()
_C.MODEL.BACKBONE.USE_CHECKPOINT = False
# Add JF efficient det cfgs
_C.MODEL.BACKBONE.EFFICIENT_DET_START_FROM = 3
_C.MODEL.BACKBONE.EFFICIENT_DET_COMPOUND = 0
_C.MODEL.BACKBONE.EFFICIENT_DET_BIFPN_VERSION = 0
_C.MODEL.LANGUAGE_BACKBONE = CN()
_C.MODEL.LANGUAGE_BACKBONE.WEIGHT = ""
_C.MODEL.LANGUAGE_BACKBONE.FREEZE = False
_C.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT = False
_C.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE = "bert-base-uncased"
_C.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "bert-base-uncased"
_C.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 768
_C.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN = 256
_C.MODEL.LANGUAGE_BACKBONE.N_LAYERS = 1
_C.MODEL.LANGUAGE_BACKBONE.UNUSED_TOKEN = 106
_C.MODEL.LANGUAGE_BACKBONE.MASK_SPECIAL = False
_C.MODEL.LANGUAGE_BACKBONE.RNN_TYPE = "lstm"
_C.MODEL.LANGUAGE_BACKBONE.VARIABLE_LENGTH = True
_C.MODEL.LANGUAGE_BACKBONE.WORD_EMBEDDING_SIZE = 512
_C.MODEL.LANGUAGE_BACKBONE.WORD_VEC_SIZE = 512
_C.MODEL.LANGUAGE_BACKBONE.HIDDEN_SIZE = 512
_C.MODEL.LANGUAGE_BACKBONE.BIDIRECTIONAL = True
_C.MODEL.LANGUAGE_BACKBONE.INPUT_DROPOUT_P = 0.5
_C.MODEL.LANGUAGE_BACKBONE.DROPOUT_P = 0.2
_C.MODEL.LANGUAGE_BACKBONE.CORPUS_PATH = ""
_C.MODEL.LANGUAGE_BACKBONE.VOCAB_SIZE = 0
_C.MODEL.LANGUAGE_BACKBONE.PAD_MAX = True
# ---------------------------------------------------------------------------- #
# FPN options
# ---------------------------------------------------------------------------- #
_C.MODEL.FPN = CN()
_C.MODEL.FPN.FREEZE = False
_C.MODEL.FPN.USE_GN = False
_C.MODEL.FPN.USE_RELU = False
_C.MODEL.FPN.USE_DYRELU = False
_C.MODEL.FPN.DROP_BLOCK = True
_C.MODEL.FPN.DROP_PROB = 0.3
_C.MODEL.FPN.DROP_SIZE = 3
_C.MODEL.FPN.USE_SPP = False
_C.MODEL.FPN.USE_PAN = False
_C.MODEL.FPN.USE_DYHEAD = False
_C.MODEL.FPN.RETURN_SWINT_FEATURE_BEFORE_FUSION = False
# ---------------------------------------------------------------------------- #
# BIFPN options
# ---------------------------------------------------------------------------- #
_C.MODEL.BIFPN = CN()
_C.MODEL.BIFPN.NUM_REPEATS = 1
_C.MODEL.BIFPN.USE_ATTENTION = True
# ---------------------------------------------------------------------------- #
# Group Norm options
# ---------------------------------------------------------------------------- #
_C.MODEL.GROUP_NORM = CN()
# Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS)
_C.MODEL.GROUP_NORM.DIM_PER_GP = -1
# Number of groups in GroupNorm (-1 if using DIM_PER_GP)
_C.MODEL.GROUP_NORM.NUM_GROUPS = 16
# GroupNorm's small constant in the denominator
_C.MODEL.GROUP_NORM.EPSILON = 1e-5
# ---------------------------------------------------------------------------- #
# Evo Norm options
# ---------------------------------------------------------------------------- #
_C.MODEL.EVO_NORM = CN()
# Number of groups in EvoNorm (-1 if using DIM_PER_GP)
_C.MODEL.EVO_NORM.NUM_GROUPS = 8
# EvoNorm's small constant in the denominator
_C.MODEL.EVO_NORM.EPSILON = 1e-5
# ---------------------------------------------------------------------------- #
# RetinaNet Options (Follow the Detectron version)
# ---------------------------------------------------------------------------- #
_C.MODEL.RETINANET = CN()
# This is the number of foreground classes and background.
_C.MODEL.RETINANET.NUM_CLASSES = 81
# Convolutions to use in the cls and bbox tower
# NOTE: this doesn't include the last conv for logits
_C.MODEL.RETINANET.NUM_CONVS = 4
# During inference, #locs to select based on cls score before NMS is performed
# per FPN level
_C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000
# Prior prob for the positives at the beginning of training. This is used to set
# the bias init for the logits layer
_C.MODEL.RETINANET.PRIOR_PROB = 0.01
# Inference cls score threshold, anchors with score > INFERENCE_TH are
# considered for inference
_C.MODEL.RETINANET.INFERENCE_TH = 0.05
# NMS threshold used in RetinaNet
_C.MODEL.RETINANET.NMS_TH = 0.4
_C.MODEL.RETINANET.DETECTIONS_PER_IMG = 100
# ---------------------------------------------------------------------------- #
# Focal Loss Options (Follow the Detectron version)
# ---------------------------------------------------------------------------- #
_C.MODEL.FOCAL = CN()
# Weight for bbox_regression loss
_C.MODEL.FOCAL.BBOX_REG_WEIGHT = 4.0
# Smooth L1 loss beta for bbox regression
_C.MODEL.FOCAL.BBOX_REG_BETA = 0.11
# IoU overlap ratio for labeling an anchor as positive
# Anchors with >= iou overlap are labeled positive
_C.MODEL.FOCAL.FG_IOU_THRESHOLD = 0.5
# IoU overlap ratio for labeling an anchor as negative
# Anchors with < iou overlap are labeled negative
_C.MODEL.FOCAL.BG_IOU_THRESHOLD = 0.4
# Focal loss parameter: alpha
_C.MODEL.FOCAL.LOSS_ALPHA = 0.25
# Focal loss parameter: gamma
_C.MODEL.FOCAL.LOSS_GAMMA = 2.0
# ---------------------------------------------------------------------------- #
# FCOS Options
# ---------------------------------------------------------------------------- #
_C.MODEL.FCOS = CN()
_C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background
_C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
_C.MODEL.FCOS.PRIOR_PROB = 0.01
_C.MODEL.FCOS.INFERENCE_TH = 0.05
_C.MODEL.FCOS.NMS_TH = 0.6
_C.MODEL.FCOS.PRE_NMS_TOP_N = 1000
# the number of convolutions used in the cls and bbox tower
_C.MODEL.FCOS.NUM_CONVS = 4
# if use deformable conv to align features
_C.MODEL.FCOS.USE_DFCONV = False
# if CENTER_SAMPLING_RADIUS <= 0, it will disable center sampling
_C.MODEL.FCOS.CENTER_SAMPLING_RADIUS = 0.0
# IOU_LOSS_TYPE can be "iou", "linear_iou" or "giou"
_C.MODEL.FCOS.IOU_LOSS_TYPE = "iou"
_C.MODEL.FCOS.NORM_REG_TARGETS = False
_C.MODEL.FCOS.CENTERNESS_ON_REG = False
_C.MODEL.FCOS.USE_GT_CENTER = False
_C.MODEL.FCOS.DETECTIONS_PER_IMG = 100
_C.MODEL.FCOS.USE_GN = False
_C.MODEL.FCOS.USE_BN = False
_C.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.0
_C.MODEL.FCOS.PRE_NMS_TOP_N_TRAIN = 3000
_C.MODEL.FCOS.POST_NMS_TOP_N_TRAIN = 1000
# ---------------------------------------------------------------------------- #
# ATSS Options
# ---------------------------------------------------------------------------- #
_C.MODEL.ATSS = CN()
_C.MODEL.ATSS.NUM_CLASSES = 81 # the number of classes including background
_C.MODEL.ATSS.PRIOR_PROB = 0.01
_C.MODEL.ATSS.INFERENCE_TH = 0.05
_C.MODEL.ATSS.NMS_TH = 0.6
_C.MODEL.ATSS.PRE_NMS_TOP_N = 1000
# the number of convolutions used in the cls and bbox tower
_C.MODEL.ATSS.NUM_CONVS = 4
# the channels of convolutions used in the cls and bbox tower
_C.MODEL.ATSS.CHANNELS = 128
# if use deformable conv to align features
_C.MODEL.ATSS.USE_DFCONV = False
# topk for selecting candidate positive samples from each level
_C.MODEL.ATSS.TOPK = 9
# Weight for bbox_regression loss
_C.MODEL.ATSS.REG_LOSS_WEIGHT = 2.0
_C.MODEL.ATSS.DETECTIONS_PER_IMG = 100
_C.MODEL.ATSS.USE_GN = False
_C.MODEL.ATSS.USE_BN = False
_C.MODEL.ATSS.USE_DYRELU = False
_C.MODEL.ATSS.USE_SE = False
_C.MODEL.ATSS.INFERENCE_TH_TRAIN = 0.0
_C.MODEL.ATSS.PRE_NMS_TOP_N_TRAIN = 3000
_C.MODEL.ATSS.POST_NMS_TOP_N_TRAIN = 1000
# ---------------------------------------------------------------------------- #
# DYHEAD Options
# ---------------------------------------------------------------------------- #
_C.MODEL.DYHEAD = CN()
_C.MODEL.DYHEAD.NUM_CLASSES = 81 # the number of classes including background
_C.MODEL.DYHEAD.PRIOR_PROB = 0.01
# the number of convolutions used in the cls and bbox tower
_C.MODEL.DYHEAD.NUM_CONVS = 4
# the channels of convolutions used in the cls and bbox tower
_C.MODEL.DYHEAD.CHANNELS = 128
_C.MODEL.DYHEAD.GROUPS = 1
# if use deformable conv to align features
_C.MODEL.DYHEAD.USE_DFCONV = False
# topk for selecting candidate positive samples from each level
_C.MODEL.DYHEAD.TOPK = 9
_C.MODEL.DYHEAD.SCORE_AGG = "MEAN" # MEAN or MAX, for binary focal loss score aggregation
_C.MODEL.DYHEAD.LOG_SCALE = 0.0 # temperature (dot product)
_C.MODEL.DYHEAD.SHALLOW_LOG_SCALE = 0.0 # # temperature (shallow contrastive)
_C.MODEL.DYHEAD.USE_GN = False
_C.MODEL.DYHEAD.USE_NSYNCBN = False
_C.MODEL.DYHEAD.USE_SYNCBN = False
_C.MODEL.DYHEAD.USE_DYFUSE = False
_C.MODEL.DYHEAD.USE_DYRELU = False
_C.MODEL.DYHEAD.CONV_FUNC = ''
# CosineSimOutputLayers: https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/modeling/roi_heads/fast_rcnn.py#L448-L464
_C.MODEL.DYHEAD.COSINE_SCALE = -1.0
_C.MODEL.DYHEAD.FUSE_CONFIG = CN()
_C.MODEL.DYHEAD.FUSE_CONFIG.EARLY_FUSE_ON = False
_C.MODEL.DYHEAD.FUSE_CONFIG.TYPE = ""
_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_SIZE = 256
_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_OUT_SIZE = 256
_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_DROPOUT = 0.1
_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_MLP_LAYERS = 2
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_CLASSIFICATION_LOSS = False
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_TOKEN_LOSS = False
_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_LOSS_WEIGHT = 1.0
_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_GAMMA = 2.0
_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_ALPHA = 0.25
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS = False
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_CONTRASTIVE_ALIGN_LOSS = False
_C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_HIDDEN_DIM = 64
_C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_ALIGN_LOSS_WEIGHT = 1.0
_C.MODEL.DYHEAD.FUSE_CONFIG.DOT_PRODUCT_TOKEN_LOSS_WEIGHT = 1.0
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_LAYER_SCALE = True
_C.MODEL.DYHEAD.FUSE_CONFIG.SEPARATE_BIDIRECTIONAL = False
_C.MODEL.DYHEAD.FUSE_CONFIG.STABLE_SOFTMAX_2D = False
_C.MODEL.DYHEAD.FUSE_CONFIG.DO_LANG_PROJ_OUTSIDE_CHECKPOINT = False
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_FUSED_FEATURES_DOT_PRODUCT = False
# Controls for
_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MIN_FOR_UNDERFLOW = False
_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MAX_FOR_OVERFLOW = False
_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MIN_FOR_UNDERFLOW = False
_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MAX_FOR_OVERFLOW = False
_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_DOT_PRODUCT = False
# MLM Loss
_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS = False
_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_FOR_ONLY_POSITIVES = True
_C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_OD = False
_C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_GOLD = False
_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_COEF = 1.0
_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_OBJ_FOR_ONLY_POSITIVE = False
# Shallow Contrastive Loss (FPN)
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_CONTRASTIVE_LOSS = False
_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_MAX_POSITIVE_ANCHORS = 100
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_ZERO_PADS = False
_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_HIDDEN_DIM = 64
_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_LOSS_WEIGHT = 1.0
# Shallow Contrastive Loss (BACKBONE)
_C.MODEL.DYHEAD.FUSE_CONFIG.USE_BACKBONE_SHALLOW_CONTRASTIVE_LOSS = False
_C.MODEL.DYHEAD.FUSE_CONFIG.ADD_LINEAR_LAYER = False
# use checkpoint to save memory
_C.MODEL.DYHEAD.USE_CHECKPOINT = False
# ---------------------------------------------------------------------------- #
# RPN options
# ---------------------------------------------------------------------------- #
_C.MODEL.RPN = CN()
_C.MODEL.RPN.USE_FPN = False
# Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input
_C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512)
# Stride of the feature map that RPN is attached.
# For FPN, number of strides should match number of scales
_C.MODEL.RPN.ANCHOR_STRIDE = (16,)
# RPN anchor aspect ratios
_C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0)
# Anchor shift away ration from the center for r,t,l,d
_C.MODEL.RPN.ANCHOR_SHIFT = (0.0, 0.0, 0.0, 0.0)
# Use center to decide anchor size
_C.MODEL.RPN.USE_RELATIVE_SIZE = False
# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
_C.MODEL.RPN.STRADDLE_THRESH = 0
# Anchor scales per octave for complex anchors
_C.MODEL.RPN.OCTAVE = 2.0
_C.MODEL.RPN.SCALES_PER_OCTAVE = 3
# Minimum overlap required between an anchor and ground-truth box for the
# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
# ==> positive RPN example)
_C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7
# Maximum overlap allowed between an anchor and ground-truth box for the
# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
# ==> negative RPN example)
_C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3
# Total number of RPN examples per image
_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
# Target fraction of foreground (positive) examples per RPN minibatch
_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
# Number of top scoring RPN proposals to keep before applying NMS
# When FPN is used, this is *per FPN level* (not total)
_C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000
_C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000
# Number of top scoring RPN proposals to keep after applying NMS
_C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000
_C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000
# NMS threshold used on RPN proposals
_C.MODEL.RPN.NMS_THRESH = 0.7
# Proposal height and width both need to be greater than RPN_MIN_SIZE
# (a the scale used during training or inference)
_C.MODEL.RPN.MIN_SIZE = 0
# Number of top scoring RPN proposals to keep after combining proposals from
# all FPN levels
_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000
_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
# Custom rpn head, empty to use default conv or separable conv
_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
_C.MODEL.RPN.FREEZE = False
_C.MODEL.RPN.FORCE_BOXES = False
_C.MODEL.RPN.RETURN_FUSED_FEATURES = False
# ---------------------------------------------------------------------------- #
# ROI HEADS options
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_HEADS = CN()
_C.MODEL.ROI_HEADS.USE_FPN = False
# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
_C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5
# Overlap threshold for an RoI to be considered background
# (class = 0 if overlap in [0, BG_IOU_THRESHOLD))
_C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5
# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
# These are empirically chosen to approximately lead to unit variance targets
_C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.)
# RoI minibatch size *per image* (number of regions of interest [ROIs])
# Total number of RoIs per training minibatch =
# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH * NUM_GPUS
# E.g., a common configuration is: 512 * 2 * 8 = 8192
_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
# Only used on test mode
# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
# balance obtaining high recall with not having too many low precision
# detections that will slow down inference post processing steps (like NMS)
_C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05
# Overlap threshold used for non-maximum suppression (suppress boxes with
# IoU >= this threshold)
_C.MODEL.ROI_HEADS.NMS = 0.5
# Maximum number of detections to return per image (100 is based on the limit
# established for the COCO dataset)
_C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100
_C.MODEL.ROI_BOX_HEAD = CN()
_C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
_C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor"
_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
_C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,)
_C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
# Hidden layer dimension when using an MLP for the RoI box head
_C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024
# GN
_C.MODEL.ROI_BOX_HEAD.USE_GN = False
# Dilation
_C.MODEL.ROI_BOX_HEAD.DILATION = 1
_C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256
_C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
# Use D2 style ROIAlignV2
_C.MODEL.ROI_BOX_HEAD.POOLER_ALIGNED = False
_C.MODEL.ROI_MASK_HEAD = CN()
_C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
_C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor"
_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
_C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,)
_C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024
_C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256)
_C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14
_C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
# Whether or not resize and translate masks to the input image.
_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False
_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5
# Dilation
_C.MODEL.ROI_MASK_HEAD.DILATION = 1
# GN
_C.MODEL.ROI_MASK_HEAD.USE_GN = False
# HG
_C.MODEL.ROI_MASK_HEAD.HG_SCALE = 1
_C.MODEL.ROI_KEYPOINT_HEAD = CN()
_C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor"
_C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor"
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,)
_C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024
_C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8))
_C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14
_C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17
_C.MODEL.ROI_KEYPOINT_HEAD.KEYPOINT_NAME = () # If left empty, use default names
_C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
# ---------------------------------------------------------------------------- #
# ResNe[X]t options (ResNets = {ResNet, ResNeXt}
# Note that parts of a resnet may be used for both the backbone and the head
# These options apply to both
# ---------------------------------------------------------------------------- #
_C.MODEL.RESNETS = CN()
_C.MODEL.RESNETS.USE_STEM3X3 = False
_C.MODEL.RESNETS.WITH_SE = False
_C.MODEL.RESNETS.USE_AVG_DOWN = False
# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
_C.MODEL.RESNETS.NUM_GROUPS = 1
# Baseline width of each group
_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
# Place the stride 2 conv on the 1x1 filter
# Use True only for the original MSRA ResNet; use False for C2 and Torch models
_C.MODEL.RESNETS.STRIDE_IN_1X1 = True
# Residual transformation function
_C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm"
# ResNet's stem function (conv1 and pool1)
_C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
# Apply dilation in stage "res5"
_C.MODEL.RESNETS.RES5_DILATION = 1
_C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4
_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
_C.MODEL.RESNETS.REVISION = "resnet_light"
# Deformable convolutions
_C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
_C.MODEL.RESNETS.WITH_MODULATED_DCN = False
_C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1
# ---------------------------------------------------------------------------- #
# Swin Transformer
# ---------------------------------------------------------------------------- #
_C.MODEL.SWINT = CN()
_C.MODEL.SWINT.EMBED_DIM = 96
_C.MODEL.SWINT.OUT_CHANNELS = (96, 192, 384, 768)
_C.MODEL.SWINT.DEPTHS = (2, 2, 6, 2)
_C.MODEL.SWINT.NUM_HEADS = (3, 6, 12, 24)
_C.MODEL.SWINT.WINDOW_SIZE = 7
_C.MODEL.SWINT.MLP_RATIO = 4
_C.MODEL.SWINT.DROP_PATH_RATE = 0.2
_C.MODEL.SWINT.APE = False
_C.MODEL.SWINT.VERSION = "v1"
_C.MODEL.SWINT.OUT_NORM = True
_C.MODEL.SWINT.LAYER_SCALE = 0
# ---------------------------------------------------------------------------- #
# CVT SPEC
# ---------------------------------------------------------------------------- #
_C.MODEL.SPEC = CN(new_allowed=True)
# ---------------------------------------------------------------------------- #
# CLIP SPEC
# ---------------------------------------------------------------------------- #
_C.MODEL.CLIP = CN()
_C.MODEL.CLIP.CONTEXT_LENGTH = 256 # default 77
_C.MODEL.CLIP.WIDTH = 512
_C.MODEL.CLIP.LAYERS = 12
_C.MODEL.CLIP.HEADS = 8
_C.MODEL.CLIP.DROP_PATH = 0.0
_C.MODEL.CLIP.TOKENIZER = "clip"
_C.MODEL.CLIP.VOCAB_SIZE = 49408
# ---------------------------------------------------------------------------- #
# SEARCH
# ---------------------------------------------------------------------------- #
_C.SEARCH = CN()
_C.SEARCH.MAX_EPOCH = 20
_C.SEARCH.SELECT_NUM = 20
_C.SEARCH.POPULATION_NUM = 64
_C.SEARCH.MUTATION_NUM = 24
_C.SEARCH.CROSSOVER_NUM = 24
_C.SEARCH.MUTATION_PROB = 0.1
# ---------------------------------------------------------------------------- #
# Solver
# ---------------------------------------------------------------------------- #
_C.SOLVER = CN()
_C.SOLVER.USE_AMP = False
_C.SOLVER.MAX_ITER = 40000
_C.SOLVER.MULTI_MAX_ITER = () # set different max epoch for different stage
_C.SOLVER.MAX_EPOCH = 0 # any epoch number>0 will overwrite max_iter
_C.SOLVER.MULTI_MAX_EPOCH = () # set different max epoch for different stage
_C.SOLVER.OPTIMIZER = "SGD" # "ADAMW"
_C.SOLVER.BASE_LR = 0.001
_C.SOLVER.LANG_LR = 0.00001
_C.SOLVER.BACKBONE_BODY_LR_FACTOR = 1.0
_C.SOLVER.BIAS_LR_FACTOR = 2
_C.SOLVER.GRAD_CLIP = 0.0
# D2 gradient clip
_C.SOLVER.CLIP_GRADIENTS = CN()
_C.SOLVER.CLIP_GRADIENTS.ENABLED = False
_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.0
_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
_C.SOLVER.MODEL_EMA = 0.0
_C.SOLVER.MOMENTUM = 0.9
_C.SOLVER.WEIGHT_DECAY = 0.0005
_C.SOLVER.WEIGHT_DECAY_BIAS = 0.0
_C.SOLVER.WEIGHT_DECAY_NORM_FACTOR = 1.0
# use cosine lr to replace default multistage
_C.SOLVER.USE_COSINE = False
_C.SOLVER.MIN_LR = 0.000001
_C.SOLVER.GAMMA = 0.1
_C.SOLVER.STEPS = (30000,)
_C.SOLVER.USE_AUTOSTEP = False
_C.SOLVER.STEP_PATIENCE = 5
_C.SOLVER.WARMUP_FACTOR = 1.0 / 3
_C.SOLVER.WARMUP_ITERS = 500
_C.SOLVER.WARMUP_METHOD = "linear"
_C.SOLVER.CHECKPOINT_PERIOD = 2500
_C.SOLVER.CHECKPOINT_PER_EPOCH = -1.0
_C.SOLVER.TEST_WITH_INFERENCE = False
_C.SOLVER.AUTO_TERMINATE_PATIENCE = -1
# Number of images per batch
# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
# see 2 images per batch
_C.SOLVER.IMS_PER_BATCH = 16
# This is the max negative ratio allowed per batch
_C.SOLVER.MAX_NEG_PER_BATCH = 0.1
_C.SOLVER.SEED = 0
_C.SOLVER.DISABLE_OUTPUT_DISTRIBUTED = False
_C.SOLVER.PROMPT_PROBING_LEVEL = -1.0
# -1 means tuning the whole model;
# 1 means tuning the whole language model; 1.5 means tuning the box head as well
_C.SOLVER.FIND_UNUSED_PARAMETERS = True
_C.SOLVER.DATASET_LENGTH = -1 # Just for logging purpose
_C.SOLVER.TUNING_HIGHLEVEL_OVERRIDE = None
_C.SOLVER.USE_EMA_FOR_MONITOR = False
_C.SOLVER.WEIGHT_DECAY_SCHEDULE = False
_C.SOLVER.WEIGHT_DECAY_SCHEDULE_RATIO = 0.667
# ---------------------------------------------------------------------------- #
# Specific test options
# ---------------------------------------------------------------------------- #
_C.TEST = CN()
_C.TEST.EXPECTED_RESULTS = []
_C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4
_C.TEST.DURING_TRAINING = False
# Number of images per batch
# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
# see 2 images per batch
_C.TEST.IMS_PER_BATCH = 16
# Special Test Configuration
_C.TEST.USE_MULTISCALE = False
# _C.TEST.SCALES = (400, 600, 800, 1000, 1200, 1400)
# _C.TEST.RANGES = ((96, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 192))
_C.TEST.SCALES = (400, 500, 600, 640, 700, 900, 1000, 1100, 1200, 1300, 1400, 1800)
_C.TEST.RANGES = ((96, 10000), (96, 10000), (64, 10000), (64, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 256), (0, 192), (0, 192), (0, 96))
_C.TEST.MAX_SIZE = 2500
_C.TEST.FLIP = True
_C.TEST.SPECIAL_NMS = 'none' # ('none', 'soft-nms', 'vote', 'soft-vote')
_C.TEST.TH = 0.6 # threshold for nms or vote
_C.TEST.PRE_NMS_TOP_N = 1000
_C.TEST.NUM_CLASSES = 81
_C.TEST.SELECT_CLASSES = ()
_C.TEST.EVAL_TASK = ""
_C.TEST.SUBSET = -1
_C.TEST.CHUNKED_EVALUATION = -1
_C.TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM = -1
# ---------------------------------------------------------------------------- #
# Misc options
# ---------------------------------------------------------------------------- #
_C.OUTPUT_DIR = "OUTPUT"
_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
# TensorBoard experiment location
_C.TENSORBOARD_EXP = "OUTPUT"
================================================
FILE: GLIP/maskrcnn_benchmark/config/paths_catalog.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""Centralized catalog of paths."""
import os
def try_to_find(file, return_dir=False, search_path=['./DATASET', './OUTPUT', './data', './MODEL']):
if not file:
return file
if file.startswith('catalog://'):
return file
DATASET_PATH = ['./']
if 'DATASET' in os.environ:
DATASET_PATH.append(os.environ['DATASET'])
DATASET_PATH += search_path
for path in DATASET_PATH:
if os.path.exists(os.path.join(path, file)):
if return_dir:
return path
else:
return os.path.join(path, file)
print('Cannot find {} in {}'.format(file, DATASET_PATH))
exit(1)
class DatasetCatalog(object):
DATASETS = {
# pretrained grounding dataset
# mixed vg and coco
"mixed_train": {
"coco_img_dir": "coco/train2014",
"vg_img_dir": "gqa/images",
"ann_file": "mdetr_annotations/final_mixed_train.json",
},
"mixed_train_no_coco": {
"coco_img_dir": "coco/train2014",
"vg_img_dir": "gqa/images",
"ann_file": "mdetr_annotations/final_mixed_train_no_coco.json",
},
# flickr30k
"flickr30k_train": {
"img_folder": "flickr30k/flickr30k_images/train",
"ann_file": "mdetr_annotations/final_flickr_separateGT_train.json",
"is_train": True
},
"flickr30k_val": {
"img_folder": "flickr30k/flickr30k_images/val",
"ann_file": "mdetr_annotations/final_flickr_separateGT_val.json",
"is_train": False
},
"flickr30k_test": {
"img_folder": "flickr30k/flickr30k_images/test",
"ann_file": "mdetr_annotations/final_flickr_separateGT_test.json",
"is_train": False
},
# refcoco
"refexp_all_val": {
"img_dir": "refcoco/train2014",
"ann_file": "mdetr_annotations/final_refexp_val.json",
"is_train": False
},
# gqa
"gqa_val": {
"img_dir": "gqa/images",
"ann_file": "mdetr_annotations/final_gqa_val.json",
"is_train": False
},
# phrasecut
"phrasecut_train": {
"img_dir": "gqa/images",
"ann_file": "mdetr_annotations/finetune_phrasecut_train.json",
"is_train": True
},
# od to grounding
# coco tsv
"coco_dt_train": {
"dataset_file": "coco_dt",
"yaml_path": "coco_tsv/coco_obj.yaml",
"is_train": True,
},
"COCO_odinw_train_8copy_dt_train": {
"dataset_file": "coco_odinw_dt",
"yaml_path": "coco_tsv/COCO_odinw_train_8copy.yaml",
"is_train": True,
},
"COCO_odinw_val_dt_train": {
"dataset_file": "coco_odinw_dt",
"yaml_path": "coco_tsv/COCO_odinw_val.yaml",
"is_train": False,
},
# lvis tsv
"lvisv1_dt_train": {
"dataset_file": "lvisv1_dt",
"yaml_path": "coco_tsv/LVIS_v1_train.yaml",
"is_train": True,
},
"LVIS_odinw_train_8copy_dt_train": {
"dataset_file": "coco_odinw_dt",
"yaml_path": "coco_tsv/LVIS_odinw_train_8copy.yaml",
"is_train": True,
},
# object365 tsv
"object365_dt_train": {
"dataset_file": "object365_dt",
"yaml_path": "Objects365/objects365_train_vgoiv6.cas2000.yaml",
"is_train": True,
},
"object365_odinw_2copy_dt_train": {
"dataset_file": "object365_odinw_dt",
"yaml_path": "Objects365/objects365_train_odinw.cas2000_2copy.yaml",
"is_train": True,
},
"objects365_odtsv_train": {
"dataset_file": "objects365_odtsv",
"yaml_path": "Objects365/train.cas2000.yaml",
"is_train": True,
},
"objects365_odtsv_val": {
"dataset_file": "objects365_odtsv",
"yaml_path": "Objects365/val.yaml",
"is_train": False,
},
# ImagetNet OD
"imagenetod_train_odinw_2copy_dt": {
"dataset_file": "imagenetod_odinw_dt",
"yaml_path": "imagenet_od/imagenetod_train_odinw_2copy.yaml",
"is_train": True,
},
# OpenImage OD
"oi_train_odinw_dt": {
"dataset_file": "oi_odinw_dt",
"yaml_path": "openimages_v5c/oi_train_odinw.cas.2000.yaml",
"is_train": True,
},
# vg tsv
"vg_dt_train": {
"dataset_file": "vg_dt",
"yaml_path": "visualgenome/train_vgoi6_clipped.yaml",
"is_train": True,
},
"vg_odinw_clipped_8copy_dt_train": {
"dataset_file": "vg_odinw_clipped_8copy_dt",
"yaml_path": "visualgenome/train_odinw_clipped_8copy.yaml",
"is_train": True,
},
"vg_vgoi6_clipped_8copy_dt_train": {
"dataset_file": "vg_vgoi6_clipped_8copy_dt",
"yaml_path": "visualgenome/train_vgoi6_clipped_8copy.yaml",
"is_train": True,
},
# coco json
"coco_grounding_train": {
"img_dir": "coco/train2017",
"ann_file": "coco/annotations/instances_train2017.json",
"is_train": True,
},
"lvis_grounding_train": {
"img_dir": "coco",
"ann_file": "coco/annotations/lvis_od_train.json"
},
"lvis_val": {
"img_dir": "coco",
"ann_file": "coco/annotations/lvis_od_val.json"
},
"coco_2017_train": {
"img_dir": "coco/train2017",
"ann_file": "coco/annotations/instances_train2017.json"
},
"coco_2017_val": {
"img_dir": "coco/val2017",
"ann_file": "coco/annotations/instances_val2017.json"
},
"coco_2017_test": {
"img_dir": "coco/test2017",
"ann_file": "coco/annotations/image_info_test-dev2017.json"
},
"coco_2014_train": {
"img_dir": "coco/train2014",
"ann_file": "coco/annotations/instances_train2014.json"
},
"coco_2014_val": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/instances_val2014.json"
},
"coco_2014_minival": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/instances_minival2014.json"
},
}
@staticmethod
def set(name, info):
DatasetCatalog.DATASETS.update({name: info})
@staticmethod
def get(name):
if name.endswith('_bg'):
attrs = DatasetCatalog.DATASETS[name]
data_dir = try_to_find(attrs["ann_file"], return_dir=True)
args = dict(
root=os.path.join(data_dir, attrs["img_dir"]),
ann_file=os.path.join(data_dir, attrs["ann_file"]),
)
return dict(
factory="Background",
args=args,
)
else:
if "bing" in name.split("_"):
attrs = DatasetCatalog.DATASETS["bing_caption_train"]
else:
attrs = DatasetCatalog.DATASETS[name]
if "voc" in name and 'split' in attrs:
data_dir = try_to_find(attrs["data_dir"], return_dir=True)
args = dict(
data_dir=os.path.join(data_dir, attrs["data_dir"]),
split=attrs["split"],
)
return dict(
factory="PascalVOCDataset",
args=args,
)
elif "mixed" in name:
vg_img_dir = try_to_find(attrs["vg_img_dir"], return_dir=True)
coco_img_dir = try_to_find(attrs["coco_img_dir"], return_dir=True)
ann_file = try_to_find(attrs["ann_file"], return_dir=True)
args = dict(
img_folder_coco=os.path.join(coco_img_dir, attrs["coco_img_dir"]),
img_folder_vg=os.path.join(vg_img_dir, attrs["vg_img_dir"]),
ann_file=os.path.join(ann_file, attrs["ann_file"])
)
return dict(
factory="MixedDataset",
args=args,
)
elif "flickr" in name:
img_dir = try_to_find(attrs["img_folder"], return_dir=True)
ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
args = dict(
img_folder=os.path.join(img_dir, attrs["img_folder"]),
ann_file=os.path.join(ann_dir, attrs["ann_file"]),
is_train=attrs["is_train"]
)
return dict(
factory="FlickrDataset",
args=args,
)
elif "refexp" in name:
img_dir = try_to_find(attrs["img_dir"], return_dir=True)
ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
args = dict(
img_folder=os.path.join(img_dir, attrs["img_dir"]),
ann_file=os.path.join(ann_dir, attrs["ann_file"]),
)
return dict(
factory="RefExpDataset",
args=args,
)
elif "gqa" in name:
img_dir = try_to_find(attrs["img_dir"], return_dir=True)
ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
args = dict(
img_folder=os.path.join(img_dir, attrs["img_dir"]),
ann_file=os.path.join(ann_dir, attrs["ann_file"]),
)
return dict(
factory="GQADataset",
args=args,
)
elif "phrasecut" in name:
img_dir = try_to_find(attrs["img_dir"], return_dir=True)
ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
args = dict(
img_folder=os.path.join(img_dir, attrs["img_dir"]),
ann_file=os.path.join(ann_dir, attrs["ann_file"]),
)
return dict(
factory="PhrasecutDetection",
args=args,
)
elif "_caption" in name:
yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
if "no_coco" in name:
yaml_name = attrs["yaml_name_no_coco"]
else:
yaml_name = attrs["yaml_name"]
yaml_file_name = "{}.{}.yaml".format(yaml_name, name.split("_")[2])
args = dict(
yaml_file=os.path.join(yaml_path, attrs["yaml_path"], yaml_file_name)
)
return dict(
factory="CaptionTSV",
args=args,
)
elif "inferencecap" in name:
yaml_file_name = try_to_find(attrs["yaml_path"])
args = dict(
yaml_file=yaml_file_name)
return dict(
factory="CaptionTSV",
args=args,
)
elif "pseudo_data" in name:
args = dict(
yaml_file=try_to_find(attrs["yaml_path"])
)
return dict(
factory="PseudoData",
args=args,
)
elif "_dt" in name:
dataset_file = attrs["dataset_file"]
yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
args = dict(
name=dataset_file,
yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
)
return dict(
factory="CocoDetectionTSV",
args=args,
)
elif "_odtsv" in name:
dataset_file = attrs["dataset_file"]
yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
args = dict(
name=dataset_file,
yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
)
return dict(
factory="ODTSVDataset",
args=args,
)
elif "_grounding" in name:
img_dir = try_to_find(attrs["img_dir"], return_dir=True)
ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
args = dict(
img_folder=os.path.join(img_dir, attrs["img_dir"]),
ann_file=os.path.join(ann_dir, attrs["ann_file"]),
)
return dict(
factory="CocoGrounding",
args=args,
)
elif "lvis_evaluation" in name:
img_dir = try_to_find(attrs["img_dir"], return_dir=True)
ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
args = dict(
img_folder=os.path.join(img_dir, attrs["img_dir"]),
ann_file=os.path.join(ann_dir, attrs["ann_file"]),
)
return dict(
factory="LvisDetection",
args=args,
)
else:
ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
img_dir = try_to_find(attrs["img_dir"], return_dir=True)
args = dict(
root=os.path.join(img_dir, attrs["img_dir"]),
ann_file=os.path.join(ann_dir, attrs["ann_file"]),
)
for k, v in attrs.items():
args.update({k: os.path.join(ann_dir, v)})
return dict(
factory="COCODataset",
args=args,
)
raise RuntimeError("Dataset not available: {}".format(name))
class ModelCatalog(object):
S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
C2_IMAGENET_MODELS = {
"MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
"MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
"MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
"MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
"FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
"FAIR/20171220/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
}
C2_DETECTRON_SUFFIX = "output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl"
C2_DETECTRON_MODELS = {
"35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
"35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
"35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
"36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
"35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
"35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
"35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
"36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
}
@staticmethod
def get(name):
if name.startswith("Caffe2Detectron/COCO"):
return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
if name.startswith("ImageNetPretrained"):
return ModelCatalog.get_c2_imagenet_pretrained(name)
raise RuntimeError("model not present in the catalog {}".format(name))
@staticmethod
def get_c2_imagenet_pretrained(name):
prefix = ModelCatalog.S3_C2_DETECTRON_URL
name = name[len("ImageNetPretrained/"):]
name = ModelCatalog.C2_IMAGENET_MODELS[name]
url = "/".join([prefix, name])
return url
@staticmethod
def get_c2_detectron_12_2017_baselines(name):
# Detectron C2 models are stored following the structure
# prefix//2012_2017_baselines/.yaml./suffix
# we use as identifiers in the catalog Caffe2Detectron/COCO//
prefix = ModelCatalog.S3_C2_DETECTRON_URL
suffix = ModelCatalog.C2_DETECTRON_SUFFIX
# remove identification prefix
name = name[len("Caffe2Detectron/COCO/"):]
# split in and
model_id, model_name = name.split("/")
# parsing to make it match the url address from the Caffe2 models
model_name = "{}.yaml".format(model_name)
signature = ModelCatalog.C2_DETECTRON_MODELS[name]
unique_name = ".".join([model_name, signature])
url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix])
return url
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/ROIAlign.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif
// Interface for Python
at::Tensor ROIAlign_forward(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
}
at::Tensor ROIAlign_backward(const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
if (grad.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/ROIPool.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif
std::tuple ROIPool_forward(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
at::Tensor ROIPool_backward(const at::Tensor& grad,
const at::Tensor& input,
const at::Tensor& rois,
const at::Tensor& argmax,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width) {
if (grad.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h
================================================
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
#endif
// Interface for Python
at::Tensor SigmoidFocalLoss_forward(
const at::Tensor& logits,
const at::Tensor& targets,
const int num_classes,
const float gamma,
const float alpha) {
if (logits.device().is_cuda()) {
#ifdef WITH_CUDA
return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
at::Tensor SigmoidFocalLoss_backward(
const at::Tensor& logits,
const at::Tensor& targets,
const at::Tensor& d_losses,
const int num_classes,
const float gamma,
const float alpha) {
if (logits.device().is_cuda()) {
#ifdef WITH_CUDA
return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "cpu/vision.h"
// implementation taken from Caffe2
template
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
template
void pre_calc_for_bilinear_interpolate(
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int iy_upper,
const int ix_upper,
T roi_start_h,
T roi_start_w,
T bin_size_h,
T bin_size_w,
int roi_bin_grid_h,
int roi_bin_grid_w,
std::vector>& pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast(iy + .5f) * bin_size_h /
static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast(ix + .5f) * bin_size_w /
static_cast(roi_bin_grid_w);
T x = xx;
T y = yy;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indeces
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
template
void ROIAlignForward_cpu_kernel(
const int nthreads,
const T* bottom_data,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* bottom_rois,
//int roi_cols,
T* top_data) {
//AT_ASSERT(roi_cols == 4 || roi_cols == 5);
int roi_cols = 5;
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
// roi could have 4 or 5 columns
const T* offset_bottom_rois = bottom_rois + n * roi_cols;
int roi_batch_ind = 0;
if (roi_cols == 5) {
roi_batch_ind = offset_bottom_rois[0];
offset_bottom_rois++;
}
// Do not using rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[0] * spatial_scale;
T roi_start_h = offset_bottom_rois[1] * spatial_scale;
T roi_end_w = offset_bottom_rois[2] * spatial_scale;
T roi_end_h = offset_bottom_rois[3] * spatial_scale;
// T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
// T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
// T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
// T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
// Force malformed ROIs to be 1x1
T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast(roi_height) / static_cast(pooled_height);
T bin_size_w = static_cast(roi_width) / static_cast(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
// we want to precalculate indeces and weights shared by all chanels,
// this is the key point of optimiation
std::vector> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
bin_size_w,
roi_bin_grid_h,
roi_bin_grid_w,
pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_bottom_data[pc.pos1] +
pc.w2 * offset_bottom_data[pc.pos2] +
pc.w3 * offset_bottom_data[pc.pos3] +
pc.w4 * offset_bottom_data[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
top_data[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
if (output.numel() == 0) {
return output;
}
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignForward_cpu_kernel(
output_size,
input.data_ptr(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois.data_ptr(),
output.data_ptr());
});
return output;
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "cpu/vision.h"
template
at::Tensor nms_cpu_kernel(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold) {
AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
}
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr();
auto order = order_t.data_ptr();
auto x1 = x1_t.data_ptr();
auto y1 = y1_t.data_ptr();
auto x2 = x2_t.data_ptr();
auto y2 = y2_t.data_ptr();
auto areas = areas_t.data_ptr();
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1)
continue;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1)
continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast(0), xx2 - xx1 + 1);
auto h = std::max(static_cast(0), yy2 - yy1 + 1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= threshold)
suppressed[j] = 1;
}
}
return at::nonzero(suppressed_t == 0).squeeze(1);
}
at::Tensor nms_cpu(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold) {
at::Tensor result;
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
result = nms_cpu_kernel(dets, scores, threshold);
});
return result;
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cpu/soft_nms.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "cpu/vision.h"
template
std::pair soft_nms_cpu_kernel(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold,
const float sigma) {
AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
if (dets.numel() == 0) {
return std::make_pair(at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)),
at::empty({0}, scores.options().dtype(at::kFloat).device(at::kCPU)));
}
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores_t = scores.clone();
at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
auto ndets = dets.size(0);
auto inds_t = at::arange(ndets, dets.options().dtype(at::kLong).device(at::kCPU));
auto x1 = x1_t.data_ptr();
auto y1 = y1_t.data_ptr();
auto x2 = x2_t.data_ptr();
auto y2 = y2_t.data_ptr();
auto s = scores_t.data_ptr();
auto inds = inds_t.data_ptr();
auto areas = areas_t.data_ptr();
for (int64_t i = 0; i < ndets; i++) {
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto is = s[i];
auto ii = inds[i];
auto iarea = areas[i];
auto maxpos = scores_t.slice(0, i, ndets).argmax().item() + i;
// add max box as a detection
x1[i] = x1[maxpos];
y1[i] = y1[maxpos];
x2[i] = x2[maxpos];
y2[i] = y2[maxpos];
s[i] = s[maxpos];
inds[i] = inds[maxpos];
areas[i] = areas[maxpos];
// swap ith box with position of max box
x1[maxpos] = ix1;
y1[maxpos] = iy1;
x2[maxpos] = ix2;
y2[maxpos] = iy2;
s[maxpos] = is;
inds[maxpos] = ii;
areas[maxpos] = iarea;
ix1 = x1[i];
iy1 = y1[i];
ix2 = x2[i];
iy2 = y2[i];
iarea = areas[i];
// NMS iterations, note that ndets changes if detection boxes
// fall below threshold
for (int64_t j = i + 1; j < ndets; j++) {
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast(0), xx2 - xx1 + 1);
auto h = std::max(static_cast(0), yy2 - yy1 + 1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
s[j] = s[j] * std::exp(- std::pow(ovr, 2.0) / sigma);
// if box score falls below threshold, discard the box by
// swapping with last box update ndets
if (s[j] < threshold) {
x1[j] = x1[ndets - 1];
y1[j] = y1[ndets - 1];
x2[j] = x2[ndets - 1];
y2[j] = y2[ndets - 1];
s[j] = s[ndets - 1];
inds[j] = inds[ndets - 1];
areas[j] = areas[ndets - 1];
j--;
ndets--;
}
}
}
return std::make_pair(inds_t.slice(0, 0, ndets), scores_t.slice(0, 0, ndets));
}
std::pair soft_nms_cpu(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold,
const float sigma) {
std::pair result;
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "soft_nms", [&] {
result = soft_nms_cpu_kernel(dets, scores, threshold, sigma);
});
return result;
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cpu/vision.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include
at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio);
at::Tensor nms_cpu(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold);
std::pair soft_nms_cpu(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold,
const float sigma);
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include
#include
#include
#include
#include
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template
__device__ T bilinear_interpolate(const T* bottom_data,
const int height, const int width,
T y, T x,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
//empty
return 0;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int) y;
int x_low = (int) x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T) y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T) x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = bottom_data[y_low * width + x_low];
T v2 = bottom_data[y_low * width + x_high];
T v3 = bottom_data[y_high * width + x_low];
T v4 = bottom_data[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template
__global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
const T spatial_scale, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int sampling_ratio,
const T* bottom_rois, T* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not using rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[1] * spatial_scale;
T roi_start_h = offset_bottom_rois[2] * spatial_scale;
T roi_end_w = offset_bottom_rois[3] * spatial_scale;
T roi_end_h = offset_bottom_rois[4] * spatial_scale;
// T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
// T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
// T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
// T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
// Force malformed ROIs to be 1x1
T roi_width = max(roi_end_w - roi_start_w, (T)1.);
T roi_height = max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast(roi_height) / static_cast(pooled_height);
T bin_size_w = static_cast(roi_width) / static_cast(pooled_width);
const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
{
const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix ++)
{
const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w);
T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template
__device__ void bilinear_interpolate_gradient(
const int height, const int width,
T y, T x,
T & w1, T & w2, T & w3, T & w4,
int & x_low, int & x_high, int & y_low, int & y_high,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
//empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int) y;
x_low = (int) x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T) y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T) x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = bottom_data[y_low * width + x_low];
// T v2 = bottom_data[y_low * width + x_high];
// T v3 = bottom_data[y_high * width + x_low];
// T v4 = bottom_data[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template
__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
const int num_rois, const T spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* bottom_rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not using rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[1] * spatial_scale;
T roi_start_h = offset_bottom_rois[2] * spatial_scale;
T roi_end_w = offset_bottom_rois[3] * spatial_scale;
T roi_end_h = offset_bottom_rois[4] * spatial_scale;
// T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
// T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
// T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
// T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
// Force malformed ROIs to be 1x1
T roi_width = max(roi_end_w - roi_start_w, (T)1.);
T roi_height = max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast(roi_height) / static_cast(pooled_height);
T bin_size_w = static_cast(roi_width) / static_cast(pooled_width);
T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
{
const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix ++)
{
const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x,
w1, w2, w3, w4,
x_low, x_high, y_low, y_high,
index);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
{
atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1));
atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2));
atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3));
atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4));
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignBackward
at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
dim3 block(512);
if (output.numel() == 0) {
THCudaCheck(cudaGetLastError());
return output;
}
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
RoIAlignForward<<>>(
output_size,
input.contiguous().data_ptr(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois.contiguous().data_ptr(),
output.data_ptr());
});
THCudaCheck(cudaGetLastError());
return output;
}
// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
THCudaCheck(cudaGetLastError());
return grad_input;
}
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
RoIAlignBackwardFeature<<>>(
grad.numel(),
grad.contiguous().data_ptr(),
num_rois,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data_ptr(),
rois.contiguous().data_ptr());
});
THCudaCheck(cudaGetLastError());
return grad_input;
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include
#include
#include
#include
#include
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template
__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
const T spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const T* bottom_rois, T* top_data, int* argmax_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
T bin_size_h = static_cast(roi_height)
/ static_cast(pooled_height);
T bin_size_w = static_cast(roi_width)
/ static_cast(pooled_width);
int hstart = static_cast(floor(static_cast(ph)
* bin_size_h));
int wstart = static_cast(floor(static_cast(pw)
* bin_size_w));
int hend = static_cast(ceil(static_cast(ph + 1)
* bin_size_h));
int wend = static_cast(ceil(static_cast(pw + 1)
* bin_size_w));
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, 0), height);
hend = min(max(hend + roi_start_h, 0), height);
wstart = min(max(wstart + roi_start_w, 0), width);
wend = min(max(wend + roi_start_w, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
// Define an empty pooling region to be zero
T maxval = is_empty ? 0 : -FLT_MAX;
// If nothing is pooled, argmax = -1 causes nothing to be backprop'd
int maxidx = -1;
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int bottom_index = h * width + w;
if (offset_bottom_data[bottom_index] > maxval) {
maxval = offset_bottom_data[bottom_index];
maxidx = bottom_index;
}
}
}
top_data[index] = maxval;
argmax_data[index] = maxidx;
}
}
template
__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
const int* argmax_data, const int num_rois, const T spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, T* bottom_diff,
const T* bottom_rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
int bottom_offset = (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
T* offset_bottom_diff = bottom_diff + bottom_offset;
const int* offset_argmax_data = argmax_data + top_offset;
int argmax = offset_argmax_data[ph * pooled_width + pw];
if (argmax != -1) {
atomicAdd(
offset_bottom_diff + argmax,
static_cast(offset_top_diff[ph * pooled_width + pw]));
}
}
}
std::tuple ROIPool_forward_cuda(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width) {
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt));
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
dim3 block(512);
if (output.numel() == 0) {
THCudaCheck(cudaGetLastError());
return std::make_tuple(output, argmax);
}
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
RoIPoolFForward<<>>(
output_size,
input.contiguous().data_ptr(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
rois.contiguous().data_ptr(),
output.data_ptr(),
argmax.data_ptr());
});
THCudaCheck(cudaGetLastError());
return std::make_tuple(output, argmax);
}
// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
const at::Tensor& input,
const at::Tensor& rois,
const at::Tensor& argmax,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int batch_size,
const int channels,
const int height,
const int width) {
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
// TODO add more checks
auto num_rois = rois.size(0);
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
THCudaCheck(cudaGetLastError());
return grad_input;
}
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
RoIPoolFBackward<<>>(
grad.numel(),
grad.contiguous().data_ptr(),
argmax.data_ptr(),
num_rois,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
grad_input.data_ptr(),
rois.contiguous().data_ptr());
});
THCudaCheck(cudaGetLastError());
return grad_input;
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
// Cheng-Yang Fu
// cyfu@cs.unc.edu
#include
#include
#include
#include
#include
#include
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template
__global__ void SigmoidFocalLossForward(const int nthreads,
const T* logits,
const int* targets,
const int num_classes,
const float gamma,
const float alpha,
const int num,
T* losses) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
int n = i / num_classes;
int d = i % num_classes; // current class[0~79];
int t = targets[n]; // target class [1~80];
// Decide it is positive or negative case.
T c1 = (t == (d+1));
T c2 = (t>=0 & t != (d+1));
T zn = (1.0 - alpha);
T zp = (alpha);
// p = 1. / 1. + expf(-x); p = sigmoid(x)
T p = 1. / (1. + expf(-logits[i]));
// (1-p)**gamma * log(p) where
T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
// p**gamma * log(1-p)
T term2 = powf(p, gamma) *
(-1. * logits[i] * (logits[i] >= 0) -
logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
losses[i] = 0.0;
losses[i] += -c1 * term1 * zp;
losses[i] += -c2 * term2 * zn;
} // CUDA_1D_KERNEL_LOOP
} // SigmoidFocalLossForward
template
__global__ void SigmoidFocalLossBackward(const int nthreads,
const T* logits,
const int* targets,
const T* d_losses,
const int num_classes,
const float gamma,
const float alpha,
const int num,
T* d_logits) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
int n = i / num_classes;
int d = i % num_classes; // current class[0~79];
int t = targets[n]; // target class [1~80], 0 is background;
// Decide it is positive or negative case.
T c1 = (t == (d+1));
T c2 = (t>=0 & t != (d+1));
T zn = (1.0 - alpha);
T zp = (alpha);
// p = 1. / 1. + expf(-x); p = sigmoid(x)
T p = 1. / (1. + expf(-logits[i]));
// (1-p)**g * (1 - p - g*p*log(p)
T term1 = powf((1. - p), gamma) *
(1. - p - (p * gamma * logf(max(p, FLT_MIN))));
// (p**g) * (g*(1-p)*log(1-p) - p)
T term2 = powf(p, gamma) *
((-1. * logits[i] * (logits[i] >= 0) -
logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
(1. - p) * gamma - p);
d_logits[i] = 0.0;
d_logits[i] += -c1 * term1 * zp;
d_logits[i] += -c2 * term2 * zn;
d_logits[i] = d_logits[i] * d_losses[i];
} // CUDA_1D_KERNEL_LOOP
} // SigmoidFocalLossBackward
at::Tensor SigmoidFocalLoss_forward_cuda(
const at::Tensor& logits,
const at::Tensor& targets,
const int num_classes,
const float gamma,
const float alpha) {
AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
const int num_samples = logits.size(0);
auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
auto losses_size = num_samples * logits.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
dim3 block(512);
if (losses.numel() == 0) {
THCudaCheck(cudaGetLastError());
return losses;
}
AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
SigmoidFocalLossForward<<>>(
losses_size,
logits.contiguous().data_ptr(),
targets.contiguous().data_ptr(),
num_classes,
gamma,
alpha,
num_samples,
losses.data_ptr());
});
THCudaCheck(cudaGetLastError());
return losses;
}
at::Tensor SigmoidFocalLoss_backward_cuda(
const at::Tensor& logits,
const at::Tensor& targets,
const at::Tensor& d_losses,
const int num_classes,
const float gamma,
const float alpha) {
AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
const int num_samples = logits.size(0);
AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
auto d_logits_size = num_samples * logits.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
dim3 block(512);
if (d_logits.numel() == 0) {
THCudaCheck(cudaGetLastError());
return d_logits;
}
AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
SigmoidFocalLossBackward<<>>(
d_logits_size,
logits.contiguous().data_ptr(),
targets.contiguous().data_ptr(),
d_losses.contiguous().data_ptr(),
num_classes,
gamma,
alpha,
num_samples,
d_logits.data_ptr());
});
THCudaCheck(cudaGetLastError());
return d_logits;
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu
================================================
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
#include
#include
#include
#include
#include
#include
#include
void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset,
const int channels, const int height, const int width,
const int ksize_h, const int ksize_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
at::Tensor data_col);
void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset,
const int channels, const int height, const int width,
const int ksize_h, const int ksize_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
at::Tensor grad_im);
void deformable_col2im_coord(
const at::Tensor data_col, const at::Tensor data_im,
const at::Tensor data_offset, const int channels, const int height,
const int width, const int ksize_h, const int ksize_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, at::Tensor grad_offset);
void modulated_deformable_im2col_cuda(
const at::Tensor data_im, const at::Tensor data_offset,
const at::Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kenerl_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
at::Tensor data_col);
void modulated_deformable_col2im_cuda(
const at::Tensor data_col, const at::Tensor data_offset,
const at::Tensor data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kenerl_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
at::Tensor grad_im);
void modulated_deformable_col2im_coord_cuda(
const at::Tensor data_col, const at::Tensor data_im,
const at::Tensor data_offset, const at::Tensor data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, at::Tensor grad_offset,
at::Tensor grad_mask);
void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
at::Tensor weight, int kH, int kW, int dH, int dW, int padH,
int padW, int dilationH, int dilationW, int group,
int deformable_group)
{
TORCH_CHECK(weight.ndimension() == 4,
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
"but got: %s",
weight.ndimension());
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH,
kW);
TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
"kernel size should be consistent with weight, ",
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
kW, weight.size(2), weight.size(3));
TORCH_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
TORCH_CHECK(
dilationW > 0 && dilationH > 0,
"dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH, dilationW);
int ndim = input.ndimension();
int dimf = 0;
int dimh = 1;
int dimw = 2;
if (ndim == 4) {
dimf++;
dimh++;
dimw++;
}
TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
ndim);
long nInputPlane = weight.size(1) * group;
long inputHeight = input.size(dimh);
long inputWidth = input.size(dimw);
long nOutputPlane = weight.size(0);
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
TORCH_CHECK(nInputPlane % deformable_group == 0,
"input channels must divide deformable group size");
if (outputWidth < 1 || outputHeight < 1)
AT_ERROR(
"Given input size: (%ld x %ld x %ld). "
"Calculated output size: (%ld x %ld x %ld). Output size is too small",
nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
outputWidth);
TORCH_CHECK(input.size(1) == nInputPlane,
"invalid number of input planes, expected: %d, but got: %d",
nInputPlane, input.size(1));
TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
"input image is smaller than kernel");
TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
"invalid spatial size of offset, expected height: %d width: %d, but "
"got height: %d width: %d",
outputHeight, outputWidth, offset.size(2), offset.size(3));
TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
"invalid number of channels of offset");
if (gradOutput != NULL) {
TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
"invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane, gradOutput->size(dimf));
TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
gradOutput->size(dimw) == outputWidth),
"invalid size of gradOutput, expected height: %d width: %d , but "
"got height: %d width: %d",
outputHeight, outputWidth, gradOutput->size(dimh),
gradOutput->size(dimw));
}
}
int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
at::Tensor offset, at::Tensor output,
at::Tensor columns, at::Tensor ones, int kW,
int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group,
int deformable_group, int im2col_step)
{
// todo: resize columns to include im2col: done
// todo: add im2col_step as input
// todo: add new output buffer and transpose it to output (or directly
// transpose output) todo: possibly change data indexing because of
// parallel_imgs
shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW,
dilationH, dilationW, group, deformable_group);
input = input.contiguous();
offset = offset.contiguous();
weight = weight.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input.unsqueeze_(0);
offset.unsqueeze_(0);
}
// todo: assert batchsize dividable by im2col_step
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
outputHeight, outputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
ones = at::ones({outputHeight, outputWidth}, input.options());
}
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth});
offset =
offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
at::Tensor output_buffer =
at::zeros({batchSize / im2col_step, nOutputPlane,
im2col_step * outputHeight, outputWidth},
output.options());
output_buffer = output_buffer.view(
{output_buffer.size(0), group, output_buffer.size(1) / group,
output_buffer.size(2), output_buffer.size(3)});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.size(2), weight.size(3)});
for (int g = 0; g < group; g++) {
output_buffer[elt][g] = output_buffer[elt][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(output_buffer[elt][g]);
}
}
output_buffer = output_buffer.view(
{output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
output_buffer.size(3), output_buffer.size(4)});
output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
im2col_step, outputHeight, outputWidth});
output_buffer.transpose_(1, 2);
output.copy_(output_buffer);
output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
output = output.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
}
return 1;
}
int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
at::Tensor gradOutput, at::Tensor gradInput,
at::Tensor gradOffset, at::Tensor weight,
at::Tensor columns, int kW, int kH, int dW,
int dH, int padW, int padH, int dilationW,
int dilationH, int group,
int deformable_group, int im2col_step)
{
shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW,
dilationH, dilationW, group, deformable_group);
input = input.contiguous();
offset = offset.contiguous();
gradOutput = gradOutput.contiguous();
weight = weight.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input = input.view({1, input.size(0), input.size(1), input.size(2)});
offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
gradOutput = gradOutput.view(
{1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = weight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
// change order of grad output
gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
nOutputPlane, outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth});
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth});
gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight,
outputWidth});
offset =
offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
// divide into groups
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.size(2), weight.size(3)});
gradOutput = gradOutput.view(
{gradOutput.size(0), group, gradOutput.size(1) / group,
gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
for (int g = 0; g < group; g++) {
columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradOutput = gradOutput.view(
{gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
dilationH, dilationW, im2col_step, deformable_group,
gradOffset[elt]);
deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, gradInput[elt]);
}
gradOutput.transpose_(1, 2);
gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
gradOffset = gradOffset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
gradOffset =
gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
}
return 1;
}
int deform_conv_backward_parameters_cuda(
at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
at::Tensor gradWeight, // at::Tensor gradBias,
at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, float scale, int im2col_step)
{
// todo: transpose and reshape outGrad
// todo: reshape columns
// todo: add im2col_step as input
shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH,
padW, dilationH, dilationW, group, deformable_group);
input = input.contiguous();
offset = offset.contiguous();
gradOutput = gradOutput.contiguous();
int batch = 1;
if (input.ndimension() == 3) {
// Force batch
batch = 0;
input = input.view(
at::IntList({1, input.size(0), input.size(1), input.size(2)}));
gradOutput = gradOutput.view(
{1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
}
long batchSize = input.size(0);
long nInputPlane = input.size(1);
long inputHeight = input.size(2);
long inputWidth = input.size(3);
long nOutputPlane = gradWeight.size(0);
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
input.options());
gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
nOutputPlane, outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
gradOutputBuffer =
gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
outputHeight, outputWidth});
gradOutputBuffer.copy_(gradOutput);
gradOutputBuffer =
gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
im2col_step * outputHeight, outputWidth});
gradOutput.transpose_(1, 2);
gradOutput =
gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
inputHeight, inputWidth});
offset =
offset.view({batchSize / im2col_step, im2col_step,
deformable_group * 2 * kH * kW, outputHeight, outputWidth});
for (int elt = 0; elt < batchSize / im2col_step; elt++) {
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
dilationW, im2col_step, deformable_group, columns);
// divide into group
gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
gradWeight =
gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
gradWeight.size(2), gradWeight.size(3)});
for (int g = 0; g < group; g++) {
gradWeight[g] = gradWeight[g]
.flatten(1)
.addmm_(gradOutputBuffer[elt][g].flatten(1),
columns[g].transpose(1, 0), 1.0, scale)
.view_as(gradWeight[g]);
}
gradOutputBuffer = gradOutputBuffer.view(
{gradOutputBuffer.size(0),
gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
gradWeight.size(2), gradWeight.size(3),
gradWeight.size(4)});
}
input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
offset = offset.view(
{batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
if (batch == 0) {
gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
input = input.view({nInputPlane, inputHeight, inputWidth});
}
return 1;
}
void modulated_deform_conv_cuda_forward(
at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
int kernel_h, int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group, const int deformable_group,
const bool with_bias)
{
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_out = weight.size(0);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
kernel_h_, kernel_w, kernel_h_, kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
channels, channels_kernel * group);
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.options());
}
// resize output
output = output.view({batch, channels_out, height_out, width_out}).zero_();
// resize temporary columns
columns =
at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
input.options());
output = output.view({output.size(0), group, output.size(1) / group,
output.size(2), output.size(3)});
for (int b = 0; b < batch; b++) {
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
// divide into group
weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.size(2), weight.size(3)});
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
for (int g = 0; g < group; g++) {
output[b][g] = output[b][g]
.flatten(1)
.addmm_(weight[g].flatten(1), columns[g])
.view_as(output[b][g]);
}
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
}
output = output.view({output.size(0), output.size(1) * output.size(2),
output.size(3), output.size(4)});
if (with_bias) {
output += bias.view({1, bias.size(0), 1, 1});
}
}
void modulated_deform_conv_cuda_backward(
at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
at::Tensor offset, at::Tensor mask, at::Tensor columns,
at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias)
{
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
kernel_h_, kernel_w, kernel_h_, kernel_w_);
if (channels != channels_kernel * group)
AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
channels, channels_kernel * group);
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = at::ones({height_out, width_out}, input.options());
}
grad_input = grad_input.view({batch, channels, height, width});
columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
input.options());
grad_output =
grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
grad_output.size(2), grad_output.size(3)});
for (int b = 0; b < batch; b++) {
// divide int group
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.size(2), weight.size(3)});
for (int g = 0; g < group; g++) {
columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
grad_output[b][g].flatten(1), 0.0f, 1.0f);
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b]);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
grad_weight.size(1), grad_weight.size(2),
grad_weight.size(3)});
if (with_bias)
grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
for (int g = 0; g < group; g++) {
grad_weight[g] =
grad_weight[g]
.flatten(1)
.addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
.view_as(grad_weight[g]);
if (with_bias) {
grad_bias[g] =
grad_bias[g]
.view({-1, 1})
.addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
.view(-1);
}
}
columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
grad_weight.size(2), grad_weight.size(3),
grad_weight.size(4)});
if (with_bias)
grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
}
grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
grad_output.size(2), grad_output.size(3),
grad_output.size(4)});
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu
================================================
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer ********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#include
#include
#include
#include
#include
using namespace at;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
const int kMaxGridNum = 65535;
inline int GET_BLOCKS(const int N)
{
return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
}
/*
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N)
{
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}*/
template
__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
const int height, const int width, scalar_t h, scalar_t w)
{
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
scalar_t lh = h - h_low;
scalar_t lw = w - w_low;
scalar_t hh = 1 - lh, hw = 1 - lw;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
v1 = bottom_data[h_low * data_width + w_low];
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template
__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
const int h, const int w, const int height, const int width)
{
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
{
//empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template
__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
const int height, const int width, const scalar_t *im_data,
const int data_width, const int bp_dir)
{
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
{
//empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (bp_dir == 0)
{
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
}
else if (bp_dir == 1)
{
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template
__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int num_channels, const int deformable_group,
const int height_col, const int width_col,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
//const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i)
{
for (int j = 0; j < kernel_w; ++j)
{
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
scalar_t val = static_cast(0);
const scalar_t h_im = h_in + i * dilation_h + offset_h;
const scalar_t w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
{
//const scalar_t map_h = i * dilation_h + offset_h;
//const scalar_t map_w = j * dilation_w + offset_w;
//const int cur_height = height - h_in;
//const int cur_width = width - w_in;
//val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
void deformable_im2col(
const at::Tensor data_im, const at::Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int parallel_imgs,
const int deformable_group, at::Tensor data_col)
{
// num_axes should be smaller than block size
// todo: check parallel_imgs is correctly passed in
int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr();
const scalar_t *data_offset_ = data_offset.data_ptr();
scalar_t *data_col_ = data_col.data_ptr();
deformable_im2col_gpu_kernel<<>>(
num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, parallel_imgs, channels, deformable_group,
height_col, width_col, data_col_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
}
}
template
__global__ void deformable_col2im_gpu_kernel(
const int n, const scalar_t *data_col, const scalar_t *data_offset,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int deformable_group,
const int height_col, const int width_col,
scalar_t *grad_im)
{
CUDA_KERNEL_LOOP(index, n)
{
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) *
2 * kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
const scalar_t cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++)
{
for (int dx = -2; dx <= 2; dx++)
{
if (cur_h + dy >= 0 && cur_h + dy < height &&
cur_w + dx >= 0 && cur_w + dx < width &&
abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1)
{
int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
void deformable_col2im(
const at::Tensor data_col, const at::Tensor data_offset, const int channels,
const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group,
at::Tensor grad_im)
{
// todo: make sure parallel_imgs is passed in correctly
int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr();
const scalar_t *data_offset_ = data_offset.data_ptr();
scalar_t *grad_im_ = grad_im.data_ptr();
deformable_col2im_gpu_kernel<<>>(
num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group,
parallel_imgs, deformable_group, height_col, width_col, grad_im_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
}
}
template
__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col,
const scalar_t *data_im, const scalar_t *data_offset,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, scalar_t *grad_offset)
{
CUDA_KERNEL_LOOP(index, n)
{
scalar_t val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group *
batch_size * width_col * height_col;
const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w * height * width;
const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
{
const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
scalar_t inv_h = h_in + i * dilation_h + offset_h;
scalar_t inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
{
inv_h = inv_w = -2;
}
const scalar_t weight = get_coordinate_weight(
inv_h, inv_w,
height, width, data_im_ptr + cnt * height * width, width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
void deformable_col2im_coord(
const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset,
const int channels, const int height, const int width, const int ksize_h,
const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int parallel_imgs, const int deformable_group, at::Tensor grad_offset)
{
int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs;
int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr();
const scalar_t *data_im_ = data_im.data_ptr();
const scalar_t *data_offset_ = data_offset.data_ptr();
scalar_t *grad_offset_ = grad_offset.data_ptr();
deformable_col2im_coord_gpu_kernel<<>>(
num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group,
parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group,
height_col, width_col, grad_offset_);
}));
}
template
__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
const int height, const int width, scalar_t h, scalar_t w)
{
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
scalar_t lh = h - h_low;
scalar_t lw = w - w_low;
scalar_t hh = 1 - lh, hw = 1 - lw;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
v1 = bottom_data[h_low * data_width + w_low];
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = bottom_data[h_low * data_width + w_high];
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = bottom_data[h_high * data_width + w_low];
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = bottom_data[h_high * data_width + w_high];
scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template
__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
const int h, const int w, const int height, const int width)
{
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
{
//empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template
__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
const int height, const int width, const scalar_t *im_data,
const int data_width, const int bp_dir)
{
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
{
//empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
scalar_t weight = 0;
if (bp_dir == 0)
{
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
}
else if (bp_dir == 1)
{
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template
__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int num_channels, const int deformable_group,
const int height_col, const int width_col,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
// index index of output matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
//const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i)
{
for (int j = 0; j < kernel_w; ++j)
{
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
scalar_t val = static_cast(0);
const scalar_t h_im = h_in + i * dilation_h + offset_h;
const scalar_t w_im = w_in + j * dilation_w + offset_w;
//if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
{
//const float map_h = i * dilation_h + offset_h;
//const float map_w = j * dilation_w + offset_w;
//const int cur_height = height - h_in;
//const int cur_width = width - w_in;
//val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
}
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
//data_col_ptr += height_col * width_col;
}
}
}
}
template
__global__ void modulated_deformable_col2im_gpu_kernel(const int n,
const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int deformable_group,
const int height_col, const int width_col,
scalar_t *grad_im)
{
CUDA_KERNEL_LOOP(index, n)
{
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
const scalar_t cur_top_grad = data_col[index] * mask;
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++)
{
for (int dx = -2; dx <= 2; dx++)
{
if (cur_h + dy >= 0 && cur_h + dy < height &&
cur_w + dx >= 0 && cur_w + dx < width &&
abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1)
{
int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template
__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
const scalar_t *data_col, const scalar_t *data_im,
const scalar_t *data_offset, const scalar_t *data_mask,
const int channels, const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col,
scalar_t *grad_offset, scalar_t *grad_mask)
{
CUDA_KERNEL_LOOP(index, n)
{
scalar_t val = 0, mval = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
{
const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
scalar_t inv_h = h_in + i * dilation_h + offset_h;
scalar_t inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
{
inv_h = inv_w = -2;
}
else
{
mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
}
const scalar_t weight = dmcn_get_coordinate_weight(
inv_h, inv_w,
height, width, data_im_ptr + cnt * height * width, width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
}
}
void modulated_deformable_im2col_cuda(
const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int width_im,
const int height_col, const int width_col, const int kernel_h, const int kenerl_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, at::Tensor data_col)
{
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr();
const scalar_t *data_offset_ = data_offset.data_ptr();
const scalar_t *data_mask_ = data_mask.data_ptr();
scalar_t *data_col_ = data_col.data_ptr();
modulated_deformable_im2col_gpu_kernel<<>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, channels, deformable_group, height_col, width_col, data_col_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
}
}
void modulated_deformable_col2im_cuda(
const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int width_im,
const int height_col, const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group, at::Tensor grad_im)
{
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr();
const scalar_t *data_offset_ = data_offset.data_ptr();
const scalar_t *data_mask_ = data_mask.data_ptr();
scalar_t *grad_im_ = grad_im.data_ptr();
modulated_deformable_col2im_gpu_kernel<<>>(
num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group,
batch_size, deformable_group, height_col, width_col, grad_im_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
}
}
void modulated_deformable_col2im_coord_cuda(
const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
const int batch_size, const int channels, const int height_im, const int width_im,
const int height_col, const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int deformable_group,
at::Tensor grad_offset, at::Tensor grad_mask)
{
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr();
const scalar_t *data_im_ = data_im.data_ptr();
const scalar_t *data_offset_ = data_offset.data_ptr();
const scalar_t *data_mask_ = data_mask.data_ptr();
scalar_t *grad_offset_ = grad_offset.data_ptr();
scalar_t *grad_mask_ = grad_mask.data_ptr();
modulated_deformable_col2im_coord_gpu_kernel<<>>(
num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,
kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, channel_per_deformable_group,
batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
grad_offset_, grad_mask_);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
}
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu
================================================
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c
// based on
// author: Charles Shang
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
#include
#include
#include
#include
#include
#include
#include
void DeformablePSROIPoolForward(
const at::Tensor data, const at::Tensor bbox, const at::Tensor trans,
at::Tensor out, at::Tensor top_count, const int batch, const int channels,
const int height, const int width, const int num_bbox,
const int channels_trans, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std);
void DeformablePSROIPoolBackwardAcc(
const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox,
const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad,
at::Tensor trans_grad, const int batch, const int channels,
const int height, const int width, const int num_bbox,
const int channels_trans, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std);
void deform_psroi_pooling_cuda_forward(
at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
at::Tensor top_count, const int no_trans, const float spatial_scale,
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std)
{
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_trans = no_trans ? 2 : trans.size(1);
const int num_bbox = bbox.size(0);
if (num_bbox != out.size(0))
AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
out.size(0), num_bbox);
DeformablePSROIPoolForward(
input, bbox, trans, out, top_count, batch, channels, height, width,
num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size,
pooled_size, part_size, sample_per_part, trans_std);
}
void deform_psroi_pooling_cuda_backward(
at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans,
at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad,
const int no_trans, const float spatial_scale, const int output_dim,
const int group_size, const int pooled_size, const int part_size,
const int sample_per_part, const float trans_std)
{
TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
const int height = input.size(2);
const int width = input.size(3);
const int channels_trans = no_trans ? 2 : trans.size(1);
const int num_bbox = bbox.size(0);
if (num_bbox != out_grad.size(0))
AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
out_grad.size(0), num_bbox);
DeformablePSROIPoolBackwardAcc(
out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch,
channels, height, width, num_bbox, channels_trans, no_trans,
spatial_scale, output_dim, group_size, pooled_size, part_size,
sample_per_part, trans_std);
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu
================================================
/*!
* Copyright (c) 2017 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file deformable_psroi_pooling.cu
* \brief
* \author Yi Li, Guodong Zhang, Jifeng Dai
*/
/***************** Adapted by Charles Shang *********************/
// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu
#include
#include
#include
#include
#include
using namespace at;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N)
{
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
template
__device__ scalar_t bilinear_interp(
const scalar_t *data,
const scalar_t x,
const scalar_t y,
const int width,
const int height)
{
int x1 = floor(x);
int x2 = ceil(x);
int y1 = floor(y);
int y2 = ceil(y);
scalar_t dist_x = (scalar_t)(x - x1);
scalar_t dist_y = (scalar_t)(y - y1);
scalar_t value11 = data[y1 * width + x1];
scalar_t value12 = data[y2 * width + x1];
scalar_t value21 = data[y1 * width + x2];
scalar_t value22 = data[y2 * width + x2];
scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22;
return value;
}
template
__global__ void DeformablePSROIPoolForwardKernel(
const int count,
const scalar_t *bottom_data,
const scalar_t spatial_scale,
const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const scalar_t *bottom_rois, const scalar_t *bottom_trans,
const int no_trans,
const scalar_t trans_std,
const int sample_per_part,
const int output_dim,
const int group_size,
const int part_size,
const int num_classes,
const int channels_each_class,
scalar_t *top_data,
scalar_t *top_count)
{
CUDA_KERNEL_LOOP(index, count)
{
// The output is in order (n, ctop, ph, pw)
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
// [start, end) interval for spatial sampling
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
// Force too small ROIs to be 1x1
scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
// Compute w and h at bottom
scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
int class_id = ctop / channels_each_class;
scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
wstart += trans_x * roi_width;
scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
hstart += trans_y * roi_height;
scalar_t sum = 0;
int count = 0;
int gw = floor((scalar_t)(pw)*group_size / pooled_width);
int gh = floor((scalar_t)(ph)*group_size / pooled_height);
gw = min(max(gw, 0), group_size - 1);
gh = min(max(gh, 0), group_size - 1);
const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
for (int ih = 0; ih < sample_per_part; ih++)
{
for (int iw = 0; iw < sample_per_part; iw++)
{
scalar_t w = wstart + iw * sub_bin_size_w;
scalar_t h = hstart + ih * sub_bin_size_h;
// bilinear interpolation
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
{
continue;
}
w = min(max(w, 0.), width - 1.);
h = min(max(h, 0.), height - 1.);
int c = (ctop * group_size + gh) * group_size + gw;
scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
sum += val;
count++;
}
}
top_data[index] = count == 0 ? (scalar_t)(0) : sum / count;
top_count[index] = count;
}
}
template
__global__ void DeformablePSROIPoolBackwardAccKernel(
const int count,
const scalar_t *top_diff,
const scalar_t *top_count,
const int num_rois,
const scalar_t spatial_scale,
const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int output_dim,
scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff,
const scalar_t *bottom_data,
const scalar_t *bottom_rois,
const scalar_t *bottom_trans,
const int no_trans,
const scalar_t trans_std,
const int sample_per_part,
const int group_size,
const int part_size,
const int num_classes,
const int channels_each_class)
{
CUDA_KERNEL_LOOP(index, count)
{
// The output is in order (n, ctop, ph, pw)
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
// [start, end) interval for spatial sampling
const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
// Force too small ROIs to be 1x1
scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
// Compute w and h at bottom
scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
int class_id = ctop / channels_each_class;
scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
wstart += trans_x * roi_width;
scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
hstart += trans_y * roi_height;
if (top_count[index] <= 0)
{
continue;
}
scalar_t diff_val = top_diff[index] / top_count[index];
const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
int gw = floor((scalar_t)(pw)*group_size / pooled_width);
int gh = floor((scalar_t)(ph)*group_size / pooled_height);
gw = min(max(gw, 0), group_size - 1);
gh = min(max(gh, 0), group_size - 1);
for (int ih = 0; ih < sample_per_part; ih++)
{
for (int iw = 0; iw < sample_per_part; iw++)
{
scalar_t w = wstart + iw * sub_bin_size_w;
scalar_t h = hstart + ih * sub_bin_size_h;
// bilinear interpolation
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
{
continue;
}
w = min(max(w, 0.), width - 1.);
h = min(max(h, 0.), height - 1.);
int c = (ctop * group_size + gh) * group_size + gw;
// backward on feature
int x0 = floor(w);
int x1 = ceil(w);
int y0 = floor(h);
int y1 = ceil(h);
scalar_t dist_x = w - x0, dist_y = h - y0;
scalar_t q00 = (1 - dist_x) * (1 - dist_y);
scalar_t q01 = (1 - dist_x) * dist_y;
scalar_t q10 = dist_x * (1 - dist_y);
scalar_t q11 = dist_x * dist_y;
int bottom_index_base = c * height * width;
atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
if (no_trans)
{
continue;
}
scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
diff_x *= roi_width;
scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
diff_y *= roi_height;
atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
}
}
}
}
void DeformablePSROIPoolForward(const at::Tensor data,
const at::Tensor bbox,
const at::Tensor trans,
at::Tensor out,
at::Tensor top_count,
const int batch,
const int channels,
const int height,
const int width,
const int num_bbox,
const int channels_trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
const int pooled_height = pooled_size;
const int pooled_width = pooled_size;
const int count = num_bbox * output_dim * pooled_height * pooled_width;
const int num_classes = no_trans ? 1 : channels_trans / 2;
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data.scalar_type(), "deformable_psroi_pool_forward", ([&] {
const scalar_t *bottom_data = data.data_ptr();
const scalar_t *bottom_rois = bbox.data_ptr();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr();
scalar_t *top_data = out.data_ptr();
scalar_t *top_count_data = top_count.data_ptr();
DeformablePSROIPoolForwardKernel<<>>(
count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim,
group_size, part_size, num_classes, channels_each_class, top_data, top_count_data);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
}
}
void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
const at::Tensor data,
const at::Tensor bbox,
const at::Tensor trans,
const at::Tensor top_count,
at::Tensor in_grad,
at::Tensor trans_grad,
const int batch,
const int channels,
const int height,
const int width,
const int num_bbox,
const int channels_trans,
const int no_trans,
const float spatial_scale,
const int output_dim,
const int group_size,
const int pooled_size,
const int part_size,
const int sample_per_part,
const float trans_std)
{
// LOG(INFO) << "DeformablePSROIPoolBackward";
const int num_rois = num_bbox;
const int pooled_height = pooled_size;
const int pooled_width = pooled_size;
const int count = num_bbox * output_dim * pooled_height * pooled_width;
const int num_classes = no_trans ? 1 : channels_trans / 2;
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
const scalar_t *top_diff = out_grad.data_ptr();
const scalar_t *bottom_data = data.data_ptr();
const scalar_t *bottom_rois = bbox.data_ptr();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr();
scalar_t *bottom_data_diff = in_grad.data_ptr();
scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr();
const scalar_t *top_count_data = top_count.data_ptr();
DeformablePSROIPoolBackwardAccKernel<<>>(
count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff,
bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part,
group_size, part_size, num_classes, channels_each_class);
}));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
}
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/ml_nms.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include
#include
#include
#include
#include
#include
int const threadsPerBlock = sizeof(unsigned long long) * 8;
__device__ inline float devIoU(float const * const a, float const * const b) {
if (a[5] != b[5]) {
return 0.0;
}
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__global__ void ml_nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float *dev_boxes, unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 6];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 6 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
block_boxes[threadIdx.x * 6 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
block_boxes[threadIdx.x * 6 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
block_boxes[threadIdx.x * 6 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
block_boxes[threadIdx.x * 6 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
block_boxes[threadIdx.x * 6 + 5] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float *cur_box = dev_boxes + cur_box_idx * 6;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 6) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// boxes is a N x 6 tensor
at::Tensor ml_nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
using scalar_t = float;
AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
auto scores = boxes.select(1, 4);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
int boxes_num = boxes.size(0);
const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
scalar_t* boxes_dev = boxes_sorted.data_ptr();
THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
unsigned long long* mask_dev = NULL;
//THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
// boxes_num * col_blocks * sizeof(unsigned long long)));
mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
THCCeilDiv(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
ml_nms_kernel<<>>(boxes_num,
nms_overlap_thresh,
boxes_dev,
mask_dev);
std::vector mask_host(boxes_num * col_blocks);
THCudaCheck(cudaMemcpy(&mask_host[0],
mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
std::vector remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data_ptr();
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
THCudaFree(state, mask_dev);
// TODO improve this part
return std::get<0>(order_t.index({
keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
order_t.device(), keep.scalar_type())
}).sort(0, false));
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/nms.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include
#include
#include
#include
#include
#include
int const threadsPerBlock = sizeof(unsigned long long) * 8;
__device__ inline float devIoU(float const * const a, float const * const b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float *dev_boxes, unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float *cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// boxes is a N x 5 tensor
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
using scalar_t = float;
AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
auto scores = boxes.select(1, 4);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
int boxes_num = boxes.size(0);
const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
scalar_t* boxes_dev = boxes_sorted.data_ptr();
THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
unsigned long long* mask_dev = NULL;
//THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
// boxes_num * col_blocks * sizeof(unsigned long long)));
mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
THCCeilDiv(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<>>(boxes_num,
nms_overlap_thresh,
boxes_dev,
mask_dev);
std::vector mask_host(boxes_num * col_blocks);
THCudaCheck(cudaMemcpy(&mask_host[0],
mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
std::vector remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data_ptr();
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
THCudaFree(state, mask_dev);
// TODO improve this part
return std::get<0>(order_t.index({
keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
order_t.device(), keep.scalar_type())
}).sort(0, false));
}
================================================
FILE: GLIP/maskrcnn_benchmark/csrc/cuda/vision.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include