Repository: Mikubill/sd-webui-controlnet Branch: main Commit: 56cec5b2958e Files: 932 Total size: 6.2 MB Directory structure: gitextract_wjd3xpkj/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ └── config.yml │ └── workflows/ │ ├── lint.yaml │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── annotator/ │ ├── __init__.py │ ├── anime_face_segment/ │ │ ├── LICENSE │ │ └── __init__.py │ ├── annotator_path.py │ ├── binary/ │ │ └── __init__.py │ ├── canny/ │ │ └── __init__.py │ ├── clipvision/ │ │ ├── __init__.py │ │ ├── clip_vision_h_uc.data │ │ └── clip_vision_vith_uc.data │ ├── color/ │ │ └── __init__.py │ ├── densepose/ │ │ ├── __init__.py │ │ └── densepose.py │ ├── depth_anything.py │ ├── depth_anything_v2.py │ ├── hed/ │ │ └── __init__.py │ ├── keypose/ │ │ ├── __init__.py │ │ ├── faster_rcnn_r50_fpn_coco.py │ │ └── hrnet_w48_coco_256x192.py │ ├── lama/ │ │ ├── __init__.py │ │ ├── config.yaml │ │ └── saicinpainting/ │ │ ├── __init__.py │ │ ├── training/ │ │ │ ├── __init__.py │ │ │ ├── data/ │ │ │ │ ├── __init__.py │ │ │ │ └── masks.py │ │ │ ├── losses/ │ │ │ │ ├── __init__.py │ │ │ │ ├── adversarial.py │ │ │ │ ├── constants.py │ │ │ │ ├── distance_weighting.py │ │ │ │ ├── feature_matching.py │ │ │ │ ├── perceptual.py │ │ │ │ ├── segmentation.py │ │ │ │ └── style_loss.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── depthwise_sep_conv.py │ │ │ │ ├── fake_fakes.py │ │ │ │ ├── ffc.py │ │ │ │ ├── multidilated_conv.py │ │ │ │ ├── multiscale.py │ │ │ │ ├── pix2pixhd.py │ │ │ │ ├── spatial_transform.py │ │ │ │ └── squeeze_excitation.py │ │ │ ├── trainers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── default.py │ │ │ └── visualizers/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── colors.py │ │ │ ├── directory.py │ │ │ └── noop.py │ │ └── utils.py │ ├── leres/ │ │ ├── __init__.py │ │ ├── leres/ │ │ │ ├── LICENSE │ │ │ ├── Resnet.py │ │ │ ├── Resnext_torch.py │ │ │ ├── depthmap.py │ │ │ ├── multi_depth_model_woauxi.py │ │ │ ├── net_tools.py │ │ │ └── network_auxi.py │ │ └── pix2pix/ │ │ ├── LICENSE │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── base_model.py │ │ │ ├── base_model_hg.py │ │ │ ├── networks.py │ │ │ └── pix2pix4depth_model.py │ │ ├── options/ │ │ │ ├── __init__.py │ │ │ ├── base_options.py │ │ │ └── test_options.py │ │ └── util/ │ │ ├── __init__.py │ │ ├── get_data.py │ │ ├── guidedfilter.py │ │ ├── html.py │ │ ├── image_pool.py │ │ ├── util.py │ │ └── visualizer.py │ ├── lineart/ │ │ ├── LICENSE │ │ └── __init__.py │ ├── lineart_anime/ │ │ ├── LICENSE │ │ └── __init__.py │ ├── manga_line/ │ │ ├── LICENSE │ │ └── __init__.py │ ├── mediapipe_face/ │ │ ├── __init__.py │ │ └── mediapipe_face_common.py │ ├── midas/ │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── api.py │ │ ├── midas/ │ │ │ ├── __init__.py │ │ │ ├── base_model.py │ │ │ ├── blocks.py │ │ │ ├── dpt_depth.py │ │ │ ├── midas_net.py │ │ │ ├── midas_net_custom.py │ │ │ ├── transforms.py │ │ │ └── vit.py │ │ └── utils.py │ ├── mlsd/ │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── models/ │ │ │ ├── mbv2_mlsd_large.py │ │ │ └── mbv2_mlsd_tiny.py │ │ └── utils.py │ ├── mmpkg/ │ │ ├── mmcv/ │ │ │ ├── __init__.py │ │ │ ├── arraymisc/ │ │ │ │ ├── __init__.py │ │ │ │ └── quantization.py │ │ │ ├── cnn/ │ │ │ │ ├── __init__.py │ │ │ │ ├── alexnet.py │ │ │ │ ├── bricks/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── activation.py │ │ │ │ │ ├── context_block.py │ │ │ │ │ ├── conv.py │ │ │ │ │ ├── conv2d_adaptive_padding.py │ │ │ │ │ ├── conv_module.py │ │ │ │ │ ├── conv_ws.py │ │ │ │ │ ├── depthwise_separable_conv_module.py │ │ │ │ │ ├── drop.py │ │ │ │ │ ├── generalized_attention.py │ │ │ │ │ ├── hsigmoid.py │ │ │ │ │ ├── hswish.py │ │ │ │ │ ├── non_local.py │ │ │ │ │ ├── norm.py │ │ │ │ │ ├── padding.py │ │ │ │ │ ├── plugin.py │ │ │ │ │ ├── registry.py │ │ │ │ │ ├── scale.py │ │ │ │ │ ├── swish.py │ │ │ │ │ ├── transformer.py │ │ │ │ │ ├── upsample.py │ │ │ │ │ └── wrappers.py │ │ │ │ ├── builder.py │ │ │ │ ├── resnet.py │ │ │ │ ├── utils/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── flops_counter.py │ │ │ │ │ ├── fuse_conv_bn.py │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ └── weight_init.py │ │ │ │ └── vgg.py │ │ │ ├── engine/ │ │ │ │ ├── __init__.py │ │ │ │ └── test.py │ │ │ ├── fileio/ │ │ │ │ ├── __init__.py │ │ │ │ ├── file_client.py │ │ │ │ ├── handlers/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── json_handler.py │ │ │ │ │ ├── pickle_handler.py │ │ │ │ │ └── yaml_handler.py │ │ │ │ ├── io.py │ │ │ │ └── parse.py │ │ │ ├── image/ │ │ │ │ ├── __init__.py │ │ │ │ ├── colorspace.py │ │ │ │ ├── geometric.py │ │ │ │ ├── io.py │ │ │ │ ├── misc.py │ │ │ │ └── photometric.py │ │ │ ├── model_zoo/ │ │ │ │ ├── deprecated.json │ │ │ │ ├── mmcls.json │ │ │ │ └── open_mmlab.json │ │ │ ├── ops/ │ │ │ │ ├── __init__.py │ │ │ │ ├── assign_score_withk.py │ │ │ │ ├── ball_query.py │ │ │ │ ├── bbox.py │ │ │ │ ├── border_align.py │ │ │ │ ├── box_iou_rotated.py │ │ │ │ ├── carafe.py │ │ │ │ ├── cc_attention.py │ │ │ │ ├── contour_expand.py │ │ │ │ ├── corner_pool.py │ │ │ │ ├── correlation.py │ │ │ │ ├── deform_conv.py │ │ │ │ ├── deform_roi_pool.py │ │ │ │ ├── deprecated_wrappers.py │ │ │ │ ├── focal_loss.py │ │ │ │ ├── furthest_point_sample.py │ │ │ │ ├── fused_bias_leakyrelu.py │ │ │ │ ├── gather_points.py │ │ │ │ ├── group_points.py │ │ │ │ ├── info.py │ │ │ │ ├── iou3d.py │ │ │ │ ├── knn.py │ │ │ │ ├── masked_conv.py │ │ │ │ ├── merge_cells.py │ │ │ │ ├── modulated_deform_conv.py │ │ │ │ ├── multi_scale_deform_attn.py │ │ │ │ ├── nms.py │ │ │ │ ├── pixel_group.py │ │ │ │ ├── point_sample.py │ │ │ │ ├── points_in_boxes.py │ │ │ │ ├── points_sampler.py │ │ │ │ ├── psa_mask.py │ │ │ │ ├── roi_align.py │ │ │ │ ├── roi_align_rotated.py │ │ │ │ ├── roi_pool.py │ │ │ │ ├── roiaware_pool3d.py │ │ │ │ ├── roipoint_pool3d.py │ │ │ │ ├── saconv.py │ │ │ │ ├── scatter_points.py │ │ │ │ ├── sync_bn.py │ │ │ │ ├── three_interpolate.py │ │ │ │ ├── three_nn.py │ │ │ │ ├── tin_shift.py │ │ │ │ ├── upfirdn2d.py │ │ │ │ └── voxelize.py │ │ │ ├── parallel/ │ │ │ │ ├── __init__.py │ │ │ │ ├── _functions.py │ │ │ │ ├── collate.py │ │ │ │ ├── data_container.py │ │ │ │ ├── data_parallel.py │ │ │ │ ├── distributed.py │ │ │ │ ├── distributed_deprecated.py │ │ │ │ ├── registry.py │ │ │ │ ├── scatter_gather.py │ │ │ │ └── utils.py │ │ │ ├── runner/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base_module.py │ │ │ │ ├── base_runner.py │ │ │ │ ├── builder.py │ │ │ │ ├── checkpoint.py │ │ │ │ ├── default_constructor.py │ │ │ │ ├── dist_utils.py │ │ │ │ ├── epoch_based_runner.py │ │ │ │ ├── fp16_utils.py │ │ │ │ ├── hooks/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ ├── closure.py │ │ │ │ │ ├── ema.py │ │ │ │ │ ├── evaluation.py │ │ │ │ │ ├── hook.py │ │ │ │ │ ├── iter_timer.py │ │ │ │ │ ├── logger/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ ├── dvclive.py │ │ │ │ │ │ ├── mlflow.py │ │ │ │ │ │ ├── neptune.py │ │ │ │ │ │ ├── pavi.py │ │ │ │ │ │ ├── tensorboard.py │ │ │ │ │ │ ├── text.py │ │ │ │ │ │ └── wandb.py │ │ │ │ │ ├── lr_updater.py │ │ │ │ │ ├── memory.py │ │ │ │ │ ├── momentum_updater.py │ │ │ │ │ ├── optimizer.py │ │ │ │ │ ├── profiler.py │ │ │ │ │ ├── sampler_seed.py │ │ │ │ │ └── sync_buffer.py │ │ │ │ ├── iter_based_runner.py │ │ │ │ ├── log_buffer.py │ │ │ │ ├── optimizer/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── builder.py │ │ │ │ │ └── default_constructor.py │ │ │ │ ├── priority.py │ │ │ │ └── utils.py │ │ │ ├── utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── config.py │ │ │ │ ├── env.py │ │ │ │ ├── ext_loader.py │ │ │ │ ├── logging.py │ │ │ │ ├── misc.py │ │ │ │ ├── parrots_jit.py │ │ │ │ ├── parrots_wrapper.py │ │ │ │ ├── path.py │ │ │ │ ├── progressbar.py │ │ │ │ ├── registry.py │ │ │ │ ├── testing.py │ │ │ │ ├── timer.py │ │ │ │ ├── trace.py │ │ │ │ └── version_utils.py │ │ │ ├── version.py │ │ │ ├── video/ │ │ │ │ ├── __init__.py │ │ │ │ ├── io.py │ │ │ │ ├── optflow.py │ │ │ │ └── processing.py │ │ │ └── visualization/ │ │ │ ├── __init__.py │ │ │ ├── color.py │ │ │ ├── image.py │ │ │ └── optflow.py │ │ └── mmseg/ │ │ ├── apis/ │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── test.py │ │ │ └── train.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── evaluation/ │ │ │ │ ├── __init__.py │ │ │ │ ├── class_names.py │ │ │ │ ├── eval_hooks.py │ │ │ │ └── metrics.py │ │ │ ├── seg/ │ │ │ │ ├── __init__.py │ │ │ │ ├── builder.py │ │ │ │ └── sampler/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base_pixel_sampler.py │ │ │ │ └── ohem_pixel_sampler.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ └── misc.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── ade.py │ │ │ ├── builder.py │ │ │ ├── chase_db1.py │ │ │ ├── cityscapes.py │ │ │ ├── custom.py │ │ │ ├── dataset_wrappers.py │ │ │ ├── drive.py │ │ │ ├── hrf.py │ │ │ ├── pascal_context.py │ │ │ ├── pipelines/ │ │ │ │ ├── __init__.py │ │ │ │ ├── compose.py │ │ │ │ ├── formating.py │ │ │ │ ├── loading.py │ │ │ │ ├── test_time_aug.py │ │ │ │ └── transforms.py │ │ │ ├── stare.py │ │ │ └── voc.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── backbones/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cgnet.py │ │ │ │ ├── fast_scnn.py │ │ │ │ ├── hrnet.py │ │ │ │ ├── mobilenet_v2.py │ │ │ │ ├── mobilenet_v3.py │ │ │ │ ├── resnest.py │ │ │ │ ├── resnet.py │ │ │ │ ├── resnext.py │ │ │ │ ├── unet.py │ │ │ │ └── vit.py │ │ │ ├── builder.py │ │ │ ├── decode_heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── ann_head.py │ │ │ │ ├── apc_head.py │ │ │ │ ├── aspp_head.py │ │ │ │ ├── cascade_decode_head.py │ │ │ │ ├── cc_head.py │ │ │ │ ├── da_head.py │ │ │ │ ├── decode_head.py │ │ │ │ ├── dm_head.py │ │ │ │ ├── dnl_head.py │ │ │ │ ├── ema_head.py │ │ │ │ ├── enc_head.py │ │ │ │ ├── fcn_head.py │ │ │ │ ├── fpn_head.py │ │ │ │ ├── gc_head.py │ │ │ │ ├── lraspp_head.py │ │ │ │ ├── nl_head.py │ │ │ │ ├── ocr_head.py │ │ │ │ ├── point_head.py │ │ │ │ ├── psa_head.py │ │ │ │ ├── psp_head.py │ │ │ │ ├── sep_aspp_head.py │ │ │ │ ├── sep_fcn_head.py │ │ │ │ └── uper_head.py │ │ │ ├── losses/ │ │ │ │ ├── __init__.py │ │ │ │ ├── accuracy.py │ │ │ │ ├── cross_entropy_loss.py │ │ │ │ ├── dice_loss.py │ │ │ │ ├── lovasz_loss.py │ │ │ │ └── utils.py │ │ │ ├── necks/ │ │ │ │ ├── __init__.py │ │ │ │ ├── fpn.py │ │ │ │ └── multilevel_neck.py │ │ │ ├── segmentors/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cascade_encoder_decoder.py │ │ │ │ └── encoder_decoder.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── drop.py │ │ │ ├── inverted_residual.py │ │ │ ├── make_divisible.py │ │ │ ├── res_layer.py │ │ │ ├── se_layer.py │ │ │ ├── self_attention_block.py │ │ │ ├── up_conv_block.py │ │ │ └── weight_init.py │ │ ├── ops/ │ │ │ ├── __init__.py │ │ │ ├── encoding.py │ │ │ └── wrappers.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── collect_env.py │ │ └── logger.py │ ├── mobile_sam/ │ │ └── __init__.py │ ├── normalbae/ │ │ ├── LICENSE │ │ ├── __init__.py │ │ └── models/ │ │ ├── NNET.py │ │ ├── baseline.py │ │ └── submodules/ │ │ ├── decoder.py │ │ ├── efficientnet_repo/ │ │ │ ├── .gitignore │ │ │ ├── BENCHMARK.md │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── caffe2_benchmark.py │ │ │ ├── caffe2_validate.py │ │ │ ├── geffnet/ │ │ │ │ ├── __init__.py │ │ │ │ ├── activations/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── activations.py │ │ │ │ │ ├── activations_jit.py │ │ │ │ │ └── activations_me.py │ │ │ │ ├── config.py │ │ │ │ ├── conv2d_layers.py │ │ │ │ ├── efficientnet_builder.py │ │ │ │ ├── gen_efficientnet.py │ │ │ │ ├── helpers.py │ │ │ │ ├── mobilenetv3.py │ │ │ │ ├── model_factory.py │ │ │ │ └── version.py │ │ │ ├── hubconf.py │ │ │ ├── onnx_export.py │ │ │ ├── onnx_optimize.py │ │ │ ├── onnx_to_caffe.py │ │ │ ├── onnx_validate.py │ │ │ ├── requirements.txt │ │ │ ├── setup.py │ │ │ ├── utils.py │ │ │ └── validate.py │ │ ├── encoder.py │ │ └── submodules.py │ ├── normaldsine/ │ │ ├── LICENSE │ │ └── __init__.py │ ├── oneformer/ │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── api.py │ │ ├── configs/ │ │ │ ├── ade20k/ │ │ │ │ ├── Base-ADE20K-UnifiedSegmentation.yaml │ │ │ │ ├── oneformer_R50_bs16_160k.yaml │ │ │ │ └── oneformer_swin_large_IN21k_384_bs16_160k.yaml │ │ │ └── coco/ │ │ │ ├── Base-COCO-UnifiedSegmentation.yaml │ │ │ ├── oneformer_R50_bs16_50ep.yaml │ │ │ └── oneformer_swin_large_IN21k_384_bs16_100ep.yaml │ │ ├── detectron2/ │ │ │ ├── __init__.py │ │ │ ├── checkpoint/ │ │ │ │ ├── __init__.py │ │ │ │ ├── c2_model_loading.py │ │ │ │ ├── catalog.py │ │ │ │ └── detection_checkpoint.py │ │ │ ├── config/ │ │ │ │ ├── __init__.py │ │ │ │ ├── compat.py │ │ │ │ ├── config.py │ │ │ │ ├── defaults.py │ │ │ │ ├── instantiate.py │ │ │ │ └── lazy.py │ │ │ ├── data/ │ │ │ │ ├── __init__.py │ │ │ │ ├── benchmark.py │ │ │ │ ├── build.py │ │ │ │ ├── catalog.py │ │ │ │ ├── common.py │ │ │ │ ├── dataset_mapper.py │ │ │ │ ├── datasets/ │ │ │ │ │ ├── README.md │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── builtin.py │ │ │ │ │ ├── builtin_meta.py │ │ │ │ │ ├── cityscapes.py │ │ │ │ │ ├── cityscapes_panoptic.py │ │ │ │ │ ├── coco.py │ │ │ │ │ ├── coco_panoptic.py │ │ │ │ │ ├── lvis.py │ │ │ │ │ ├── lvis_v0_5_categories.py │ │ │ │ │ ├── lvis_v1_categories.py │ │ │ │ │ ├── lvis_v1_category_image_count.py │ │ │ │ │ ├── pascal_voc.py │ │ │ │ │ └── register_coco.py │ │ │ │ ├── detection_utils.py │ │ │ │ ├── samplers/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── distributed_sampler.py │ │ │ │ │ └── grouped_batch_sampler.py │ │ │ │ └── transforms/ │ │ │ │ ├── __init__.py │ │ │ │ ├── augmentation.py │ │ │ │ ├── augmentation_impl.py │ │ │ │ └── transform.py │ │ │ ├── engine/ │ │ │ │ ├── __init__.py │ │ │ │ ├── defaults.py │ │ │ │ ├── hooks.py │ │ │ │ ├── launch.py │ │ │ │ └── train_loop.py │ │ │ ├── evaluation/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cityscapes_evaluation.py │ │ │ │ ├── coco_evaluation.py │ │ │ │ ├── evaluator.py │ │ │ │ ├── fast_eval_api.py │ │ │ │ ├── lvis_evaluation.py │ │ │ │ ├── panoptic_evaluation.py │ │ │ │ ├── pascal_voc_evaluation.py │ │ │ │ ├── rotated_coco_evaluation.py │ │ │ │ ├── sem_seg_evaluation.py │ │ │ │ └── testing.py │ │ │ ├── export/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── api.py │ │ │ │ ├── c10.py │ │ │ │ ├── caffe2_export.py │ │ │ │ ├── caffe2_inference.py │ │ │ │ ├── caffe2_modeling.py │ │ │ │ ├── caffe2_patch.py │ │ │ │ ├── flatten.py │ │ │ │ ├── shared.py │ │ │ │ ├── torchscript.py │ │ │ │ └── torchscript_patch.py │ │ │ ├── layers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── aspp.py │ │ │ │ ├── batch_norm.py │ │ │ │ ├── blocks.py │ │ │ │ ├── csrc/ │ │ │ │ │ ├── README.md │ │ │ │ │ ├── ROIAlignRotated/ │ │ │ │ │ │ ├── ROIAlignRotated.h │ │ │ │ │ │ ├── ROIAlignRotated_cpu.cpp │ │ │ │ │ │ └── ROIAlignRotated_cuda.cu │ │ │ │ │ ├── box_iou_rotated/ │ │ │ │ │ │ ├── box_iou_rotated.h │ │ │ │ │ │ ├── box_iou_rotated_cpu.cpp │ │ │ │ │ │ ├── box_iou_rotated_cuda.cu │ │ │ │ │ │ └── box_iou_rotated_utils.h │ │ │ │ │ ├── cocoeval/ │ │ │ │ │ │ ├── cocoeval.cpp │ │ │ │ │ │ └── cocoeval.h │ │ │ │ │ ├── cuda_version.cu │ │ │ │ │ ├── deformable/ │ │ │ │ │ │ ├── deform_conv.h │ │ │ │ │ │ ├── deform_conv_cuda.cu │ │ │ │ │ │ └── deform_conv_cuda_kernel.cu │ │ │ │ │ ├── nms_rotated/ │ │ │ │ │ │ ├── nms_rotated.h │ │ │ │ │ │ ├── nms_rotated_cpu.cpp │ │ │ │ │ │ └── nms_rotated_cuda.cu │ │ │ │ │ └── vision.cpp │ │ │ │ ├── deform_conv.py │ │ │ │ ├── losses.py │ │ │ │ ├── mask_ops.py │ │ │ │ ├── nms.py │ │ │ │ ├── roi_align.py │ │ │ │ ├── roi_align_rotated.py │ │ │ │ ├── rotated_boxes.py │ │ │ │ ├── shape_spec.py │ │ │ │ └── wrappers.py │ │ │ ├── model_zoo/ │ │ │ │ ├── __init__.py │ │ │ │ └── model_zoo.py │ │ │ ├── modeling/ │ │ │ │ ├── __init__.py │ │ │ │ ├── anchor_generator.py │ │ │ │ ├── backbone/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── backbone.py │ │ │ │ │ ├── build.py │ │ │ │ │ ├── fpn.py │ │ │ │ │ ├── mvit.py │ │ │ │ │ ├── regnet.py │ │ │ │ │ ├── resnet.py │ │ │ │ │ ├── swin.py │ │ │ │ │ ├── utils.py │ │ │ │ │ └── vit.py │ │ │ │ ├── box_regression.py │ │ │ │ ├── matcher.py │ │ │ │ ├── meta_arch/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── build.py │ │ │ │ │ ├── dense_detector.py │ │ │ │ │ ├── fcos.py │ │ │ │ │ ├── panoptic_fpn.py │ │ │ │ │ ├── rcnn.py │ │ │ │ │ ├── retinanet.py │ │ │ │ │ └── semantic_seg.py │ │ │ │ ├── mmdet_wrapper.py │ │ │ │ ├── poolers.py │ │ │ │ ├── postprocessing.py │ │ │ │ ├── proposal_generator/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── build.py │ │ │ │ │ ├── proposal_utils.py │ │ │ │ │ ├── rpn.py │ │ │ │ │ └── rrpn.py │ │ │ │ ├── roi_heads/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── box_head.py │ │ │ │ │ ├── cascade_rcnn.py │ │ │ │ │ ├── fast_rcnn.py │ │ │ │ │ ├── keypoint_head.py │ │ │ │ │ ├── mask_head.py │ │ │ │ │ ├── roi_heads.py │ │ │ │ │ └── rotated_fast_rcnn.py │ │ │ │ ├── sampling.py │ │ │ │ └── test_time_augmentation.py │ │ │ ├── projects/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ └── deeplab/ │ │ │ │ ├── __init__.py │ │ │ │ ├── build_solver.py │ │ │ │ ├── config.py │ │ │ │ ├── loss.py │ │ │ │ ├── lr_scheduler.py │ │ │ │ ├── resnet.py │ │ │ │ └── semantic_seg.py │ │ │ ├── solver/ │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ └── lr_scheduler.py │ │ │ ├── structures/ │ │ │ │ ├── __init__.py │ │ │ │ ├── boxes.py │ │ │ │ ├── image_list.py │ │ │ │ ├── instances.py │ │ │ │ ├── keypoints.py │ │ │ │ ├── masks.py │ │ │ │ └── rotated_boxes.py │ │ │ ├── tracking/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base_tracker.py │ │ │ │ ├── bbox_iou_tracker.py │ │ │ │ ├── hungarian_tracker.py │ │ │ │ ├── iou_weighted_hungarian_bbox_iou_tracker.py │ │ │ │ ├── utils.py │ │ │ │ └── vanilla_hungarian_bbox_iou_tracker.py │ │ │ └── utils/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── analysis.py │ │ │ ├── collect_env.py │ │ │ ├── colormap.py │ │ │ ├── comm.py │ │ │ ├── develop.py │ │ │ ├── env.py │ │ │ ├── events.py │ │ │ ├── file_io.py │ │ │ ├── logger.py │ │ │ ├── memory.py │ │ │ ├── registry.py │ │ │ ├── serialize.py │ │ │ ├── testing.py │ │ │ ├── tracing.py │ │ │ ├── video_visualizer.py │ │ │ └── visualizer.py │ │ ├── oneformer/ │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── data/ │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ ├── dataset_mappers/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── coco_unified_new_baseline_dataset_mapper.py │ │ │ │ │ ├── dataset_mapper.py │ │ │ │ │ └── oneformer_unified_dataset_mapper.py │ │ │ │ ├── datasets/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── register_ade20k_instance.py │ │ │ │ │ ├── register_ade20k_panoptic.py │ │ │ │ │ ├── register_cityscapes_panoptic.py │ │ │ │ │ ├── register_coco_panoptic2instance.py │ │ │ │ │ └── register_coco_panoptic_annos_semseg.py │ │ │ │ └── tokenizer.py │ │ │ ├── demo/ │ │ │ │ ├── colormap.py │ │ │ │ ├── defaults.py │ │ │ │ ├── predictor.py │ │ │ │ └── visualizer.py │ │ │ ├── evaluation/ │ │ │ │ ├── __init__.py │ │ │ │ ├── cityscapes_evaluation.py │ │ │ │ ├── coco_evaluator.py │ │ │ │ ├── detection_coco_evaluator.py │ │ │ │ ├── evaluator.py │ │ │ │ └── instance_evaluation.py │ │ │ ├── modeling/ │ │ │ │ ├── __init__.py │ │ │ │ ├── backbone/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── dinat.py │ │ │ │ │ └── swin.py │ │ │ │ ├── matcher.py │ │ │ │ ├── meta_arch/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── oneformer_head.py │ │ │ │ ├── pixel_decoder/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── fpn.py │ │ │ │ │ ├── msdeformattn.py │ │ │ │ │ └── ops/ │ │ │ │ │ ├── functions/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ │ ├── make.sh │ │ │ │ │ ├── modules/ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── ms_deform_attn.py │ │ │ │ │ ├── setup.py │ │ │ │ │ ├── src/ │ │ │ │ │ │ ├── cpu/ │ │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ │ │ ├── cuda/ │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ │ └── vision.cpp │ │ │ │ │ └── test.py │ │ │ │ └── transformer_decoder/ │ │ │ │ ├── __init__.py │ │ │ │ ├── oneformer_transformer_decoder.py │ │ │ │ ├── position_encoding.py │ │ │ │ ├── text_transformer.py │ │ │ │ └── transformer.py │ │ │ ├── oneformer_model.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── box_ops.py │ │ │ ├── events.py │ │ │ ├── misc.py │ │ │ └── pos_embed.py │ │ └── pycocotools/ │ │ ├── __init__.py │ │ ├── coco.py │ │ ├── cocoeval.py │ │ └── mask.py │ ├── openpose/ │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── animalpose.py │ │ ├── body.py │ │ ├── cv_ox_det.py │ │ ├── cv_ox_pose.py │ │ ├── face.py │ │ ├── hand.py │ │ ├── model.py │ │ ├── types.py │ │ ├── util.py │ │ └── wholebody.py │ ├── pidinet/ │ │ ├── LICENSE │ │ ├── __init__.py │ │ └── model.py │ ├── shuffle/ │ │ └── __init__.py │ ├── teed/ │ │ ├── Fmish.py │ │ ├── Fsmish.py │ │ ├── LICENSE.txt │ │ ├── Xmish.py │ │ ├── Xsmish.py │ │ ├── __init__.py │ │ └── ted.py │ ├── uniformer/ │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── configs/ │ │ │ └── _base_/ │ │ │ ├── datasets/ │ │ │ │ ├── ade20k.py │ │ │ │ ├── chase_db1.py │ │ │ │ ├── cityscapes.py │ │ │ │ ├── cityscapes_769x769.py │ │ │ │ ├── drive.py │ │ │ │ ├── hrf.py │ │ │ │ ├── pascal_context.py │ │ │ │ ├── pascal_context_59.py │ │ │ │ ├── pascal_voc12.py │ │ │ │ ├── pascal_voc12_aug.py │ │ │ │ └── stare.py │ │ │ ├── default_runtime.py │ │ │ ├── models/ │ │ │ │ ├── ann_r50-d8.py │ │ │ │ ├── apcnet_r50-d8.py │ │ │ │ ├── ccnet_r50-d8.py │ │ │ │ ├── cgnet.py │ │ │ │ ├── danet_r50-d8.py │ │ │ │ ├── deeplabv3_r50-d8.py │ │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ │ ├── deeplabv3plus_r50-d8.py │ │ │ │ ├── dmnet_r50-d8.py │ │ │ │ ├── dnl_r50-d8.py │ │ │ │ ├── emanet_r50-d8.py │ │ │ │ ├── encnet_r50-d8.py │ │ │ │ ├── fast_scnn.py │ │ │ │ ├── fcn_hr18.py │ │ │ │ ├── fcn_r50-d8.py │ │ │ │ ├── fcn_unet_s5-d16.py │ │ │ │ ├── fpn_r50.py │ │ │ │ ├── fpn_uniformer.py │ │ │ │ ├── gcnet_r50-d8.py │ │ │ │ ├── lraspp_m-v3-d8.py │ │ │ │ ├── nonlocal_r50-d8.py │ │ │ │ ├── ocrnet_hr18.py │ │ │ │ ├── ocrnet_r50-d8.py │ │ │ │ ├── pointrend_r50.py │ │ │ │ ├── psanet_r50-d8.py │ │ │ │ ├── pspnet_r50-d8.py │ │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ │ ├── upernet_r50.py │ │ │ │ └── upernet_uniformer.py │ │ │ └── schedules/ │ │ │ ├── schedule_160k.py │ │ │ ├── schedule_20k.py │ │ │ ├── schedule_40k.py │ │ │ └── schedule_80k.py │ │ ├── inference.py │ │ ├── mmcv_custom/ │ │ │ ├── __init__.py │ │ │ └── checkpoint.py │ │ ├── uniformer.py │ │ └── upernet_global_small.py │ ├── util.py │ └── zoe/ │ ├── LICENSE │ ├── __init__.py │ └── zoedepth/ │ ├── models/ │ │ ├── __init__.py │ │ ├── base_models/ │ │ │ ├── __init__.py │ │ │ ├── midas.py │ │ │ └── midas_repo/ │ │ │ ├── .gitignore │ │ │ ├── Dockerfile │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── environment.yaml │ │ │ ├── hubconf.py │ │ │ ├── input/ │ │ │ │ └── .placeholder │ │ │ ├── midas/ │ │ │ │ ├── backbones/ │ │ │ │ │ ├── beit.py │ │ │ │ │ ├── levit.py │ │ │ │ │ ├── next_vit.py │ │ │ │ │ ├── swin.py │ │ │ │ │ ├── swin2.py │ │ │ │ │ ├── swin_common.py │ │ │ │ │ ├── utils.py │ │ │ │ │ └── vit.py │ │ │ │ ├── base_model.py │ │ │ │ ├── blocks.py │ │ │ │ ├── dpt_depth.py │ │ │ │ ├── midas_net.py │ │ │ │ ├── midas_net_custom.py │ │ │ │ ├── model_loader.py │ │ │ │ └── transforms.py │ │ │ ├── output/ │ │ │ │ └── .placeholder │ │ │ ├── ros/ │ │ │ │ ├── LICENSE │ │ │ │ ├── README.md │ │ │ │ ├── additions/ │ │ │ │ │ ├── do_catkin_make.sh │ │ │ │ │ ├── downloads.sh │ │ │ │ │ ├── install_ros_melodic_ubuntu_17_18.sh │ │ │ │ │ ├── install_ros_noetic_ubuntu_20.sh │ │ │ │ │ └── make_package_cpp.sh │ │ │ │ ├── launch_midas_cpp.sh │ │ │ │ ├── midas_cpp/ │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ ├── launch/ │ │ │ │ │ │ ├── midas_cpp.launch │ │ │ │ │ │ └── midas_talker_listener.launch │ │ │ │ │ ├── package.xml │ │ │ │ │ ├── scripts/ │ │ │ │ │ │ ├── listener.py │ │ │ │ │ │ ├── listener_original.py │ │ │ │ │ │ └── talker.py │ │ │ │ │ └── src/ │ │ │ │ │ └── main.cpp │ │ │ │ └── run_talker_listener_test.sh │ │ │ ├── run.py │ │ │ ├── tf/ │ │ │ │ ├── README.md │ │ │ │ ├── input/ │ │ │ │ │ └── .placeholder │ │ │ │ ├── make_onnx_model.py │ │ │ │ ├── output/ │ │ │ │ │ └── .placeholder │ │ │ │ ├── run_onnx.py │ │ │ │ ├── run_pb.py │ │ │ │ ├── transforms.py │ │ │ │ └── utils.py │ │ │ ├── utils.py │ │ │ └── weights/ │ │ │ └── .placeholder │ │ ├── builder.py │ │ ├── depth_model.py │ │ ├── layers/ │ │ │ ├── attractor.py │ │ │ ├── dist_layers.py │ │ │ ├── localbins_layers.py │ │ │ └── patch_transformer.py │ │ ├── model_io.py │ │ ├── zoedepth/ │ │ │ ├── __init__.py │ │ │ ├── config_zoedepth.json │ │ │ ├── config_zoedepth_kitti.json │ │ │ └── zoedepth_v1.py │ │ └── zoedepth_nk/ │ │ ├── __init__.py │ │ ├── config_zoedepth_nk.json │ │ └── zoedepth_nk_v1.py │ └── utils/ │ ├── __init__.py │ ├── arg_utils.py │ ├── config.py │ ├── easydict/ │ │ └── __init__.py │ ├── geometry.py │ └── misc.py ├── example/ │ ├── advanced_weighting_example/ │ │ └── api_advanced_weighting.py │ ├── chatgpt.py │ ├── inpaint_example/ │ │ └── api_inpaint.py │ ├── txt2img_example/ │ │ └── api_txt2img.py │ └── visual_chatgpt.ipynb ├── extract_controlnet.py ├── extract_controlnet_diff.py ├── install.py ├── internal_controlnet/ │ ├── __init__.py │ ├── args.py │ └── external_code.py ├── javascript/ │ ├── canvas.js │ ├── controlnet_unit.mjs │ ├── index.mjs │ ├── modal.mjs │ ├── openpose_editor.mjs │ └── photopea.mjs ├── models/ │ └── put_controlnet_models_here.txt ├── patch_version.py ├── preload.py ├── pyproject.toml ├── requirements.txt ├── scripts/ │ ├── adapter.py │ ├── animate_diff/ │ │ └── batch.py │ ├── api.py │ ├── batch_hijack.py │ ├── cldm.py │ ├── controlnet.py │ ├── controlnet_core/ │ │ └── controlnet_union.py │ ├── controlnet_diffusers.py │ ├── controlnet_lllite.py │ ├── controlnet_lora.py │ ├── controlnet_model_guess.py │ ├── controlnet_sparsectrl.py │ ├── controlnet_ui/ │ │ ├── advanced_weight_control.py │ │ ├── controlnet_ui_group.py │ │ ├── modal.py │ │ ├── openpose_editor.py │ │ └── photopea.py │ ├── controlnet_version.py │ ├── enums.py │ ├── external_code.py │ ├── global_state.py │ ├── hook.py │ ├── infotext.py │ ├── ipadapter/ │ │ ├── __init__.py │ │ ├── image_proj_models.py │ │ ├── ipadapter_model.py │ │ ├── plugable_ipadapter.py │ │ ├── presets.py │ │ ├── pulid_attn.py │ │ └── weight.py │ ├── logging.py │ ├── lvminthin.py │ ├── movie2movie.py │ ├── preprocessor/ │ │ ├── __init__.py │ │ ├── inpaint.py │ │ ├── ip_adapter_auto.py │ │ ├── lama_inpaint.py │ │ ├── legacy/ │ │ │ ├── legacy_preprocessors.py │ │ │ ├── preprocessor_compiled.py │ │ │ └── processor.py │ │ ├── mobile_sam.py │ │ ├── model_free_preprocessors.py │ │ ├── normal_dsine.py │ │ ├── pulid.py │ │ └── teed.py │ ├── supported_preprocessor.py │ ├── utils.py │ └── xyz_grid_support.py ├── style.css ├── tests/ │ ├── README.md │ ├── annotator_tests/ │ │ └── openpose_tests/ │ │ ├── body_test.py │ │ ├── detection_test.py │ │ ├── json_encode_test.py │ │ └── openpose_e2e_test_disabled.py │ ├── cn_script/ │ │ ├── __init__.py │ │ ├── batch_hijack_test.py │ │ ├── cn_script_test.py │ │ └── utils_test.py │ ├── conftest.py │ ├── external_code_api/ │ │ ├── __init__.py │ │ └── external_code_test.py │ ├── utils.py │ └── web_api/ │ ├── __init__.py │ ├── animal_pose.json │ ├── clip_mask_test.py │ ├── detect_test.py │ ├── effective_region_test.py │ ├── full_coverage/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── depth_test.py │ │ ├── inpaint_test.py │ │ ├── ipadapter_test.py │ │ └── template.py │ ├── generation_test.py │ ├── ipadapter_advanced_weighting.py │ ├── ipadapter_clip_api.py │ ├── modules_test.py │ ├── pose.json │ ├── render_openpose_json.py │ └── template.py ├── unit_tests/ │ ├── __init__.py │ └── args_test.py └── web_tests/ ├── README.md └── main.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: Bug Report description: Create a report title: "[Bug]: " labels: ["bug-report"] body: - type: checkboxes attributes: label: Is there an existing issue for this? description: Please search to see if an issue already exists for the bug you encountered, and that it hasn't been fixed in a recent build/commit. options: - label: I have searched the existing issues and checked the recent builds/commits of both this extension and the webui required: true - type: markdown attributes: value: | *Please fill this form with as much information as possible, don't forget to fill "What OS..." and "What browsers" and *provide screenshots if possible** - type: textarea id: what-did attributes: label: What happened? description: Tell us what happened in a very clear and simple way validations: required: true - type: textarea id: steps attributes: label: Steps to reproduce the problem description: Please provide us with precise step by step information on how to reproduce the bug value: | 1. Go to .... 2. Press .... 3. ... validations: required: true - type: textarea id: what-should attributes: label: What should have happened? description: Tell what you think the normal behavior should be validations: required: true - type: textarea id: commits attributes: label: Commit where the problem happens description: Which commit of the extension are you running on? Please include the commit of both the extension and the webui (Do not write *Latest version/repo/commit*, as this means nothing and will have changed by the time we read your issue. Rather, copy the **Commit** link at the bottom of the UI, or from the cmd/terminal if you can't launch it.) value: | webui: controlnet: validations: required: true - type: dropdown id: browsers attributes: label: What browsers do you use to access the UI ? multiple: true options: - Mozilla Firefox - Google Chrome - Brave - Apple Safari - Microsoft Edge - type: textarea id: cmdargs attributes: label: Command Line Arguments description: Are you using any launching parameters/command line arguments (modified webui-user .bat/.sh) ? If yes, please write them below. Write "No" otherwise. render: Shell validations: required: true - type: textarea id: extensions attributes: label: List of enabled extensions description: Please provide a full list of enabled extensions or screenshots of your "Extensions" tab. validations: required: true - type: textarea id: logs attributes: label: Console logs description: Please provide full cmd/terminal logs from the moment you started UI to the end of it, after your bug happened. If it's very long, provide a link to pastebin or similar service. render: Shell validations: required: true - type: textarea id: misc attributes: label: Additional information description: Please provide us with any relevant additional info or context. ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: true ================================================ FILE: .github/workflows/lint.yaml ================================================ name: Linter on: - push - pull_request jobs: lint-python: name: ruff runs-on: ubuntu-latest steps: - name: Checkout Code uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: 3.11 # NB: there's no cache: pip here since we're not installing anything # from the requirements.txt file(s) in the repository; it's faster # not to have GHA download an (at the time of writing) 4 GB cache # of PyTorch and other dependencies. - name: Install Ruff run: pip install ruff==0.1.6 - name: Run Ruff run: ruff . ================================================ FILE: .github/workflows/tests.yml ================================================ name: Run basic features tests on CPU on: - push - pull_request env: FORGE_CQ_TEST: "True" jobs: build: runs-on: ubuntu-latest steps: - name: Checkout Code uses: actions/checkout@v3 with: repository: 'AUTOMATIC1111/stable-diffusion-webui' path: 'stable-diffusion-webui' ref: '1c0a0c4c26f78c32095ebc7f8af82f5c04fca8c0' - name: Checkout Code uses: actions/checkout@v3 with: repository: 'Mikubill/sd-webui-controlnet' path: 'stable-diffusion-webui/extensions/sd-webui-controlnet' - name: Set up Python 3.10 uses: actions/setup-python@v4 with: python-version: 3.10.6 cache: pip cache-dependency-path: | **/requirements*txt launch.py - name: Install test dependencies run: | pip install wait-for-it pip install -r requirements-test.txt working-directory: stable-diffusion-webui env: PIP_DISABLE_PIP_VERSION_CHECK: "1" PIP_PROGRESS_BAR: "off" - name: Setup environment run: python launch.py --skip-torch-cuda-test --exit working-directory: stable-diffusion-webui env: PIP_DISABLE_PIP_VERSION_CHECK: "1" PIP_PROGRESS_BAR: "off" TORCH_INDEX_URL: https://download.pytorch.org/whl/cpu WEBUI_LAUNCH_LIVE_OUTPUT: "1" PYTHONUNBUFFERED: "1" - name: Cache ControlNet models uses: actions/cache@v3 with: path: stable-diffusion-webui/extensions/sd-webui-controlnet/models/ key: controlnet-models-v3 - name: Cache Preprocessor models uses: actions/cache@v3 with: path: stable-diffusion-webui/extensions/sd-webui-controlnet/annotator/downloads/ key: preprocessor-models-v1 - name: Download controlnet model for testing run: | declare -a urls=( "https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_canny.pth" "https://huggingface.co/h94/IP-Adapter/resolve/main/models/ip-adapter-full-face_sd15.safetensors" "https://huggingface.co/h94/IP-Adapter/resolve/main/models/ip-adapter-plus-face_sd15.safetensors" "https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_canny_sd15v2.pth" "https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_lora_rank128_v11p_sd15_canny_fp16.safetensors" ) for url in "${urls[@]}"; do filename="extensions/sd-webui-controlnet/models/${url##*/}" # Extracts the last part of the URL if [ ! -f "$filename" ]; then curl -Lo "$filename" "$url" fi done working-directory: stable-diffusion-webui - name: Start test server run: > python -m coverage run --data-file=.coverage.server launch.py --skip-prepare-environment --skip-torch-cuda-test --test-server --do-not-download-clip --no-half --disable-opt-split-attention --use-cpu all --api-server-stop 2>&1 | tee output.txt & working-directory: stable-diffusion-webui - name: Run tests run: | wait-for-it --service 127.0.0.1:7860 -t 600 python -m pytest -v --junitxml=test/results.xml --cov ./extensions/sd-webui-controlnet --cov-report=xml --verify-base-url ./extensions/sd-webui-controlnet/tests working-directory: stable-diffusion-webui - name: Run unit tests run: | python -m pytest -v ./unit_tests/ working-directory: stable-diffusion-webui/extensions/sd-webui-controlnet/ - name: Kill test server if: always() run: curl -vv -XPOST http://127.0.0.1:7860/sdapi/v1/server-stop && sleep 10 - name: Show coverage run: | python -m coverage combine .coverage* python -m coverage report -i python -m coverage html -i working-directory: stable-diffusion-webui - name: Upload main app output uses: actions/upload-artifact@v3 if: always() with: name: output path: stable-diffusion-webui/output.txt - name: Upload coverage HTML uses: actions/upload-artifact@v3 if: always() with: name: htmlcov path: stable-diffusion-webui/htmlcov ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea *.pt *.pth *.ckpt *.bin *.safetensors # Editor setting metadata .idea/ .vscode/ detected_maps/ annotator/downloads/ # test results and expectations web_tests/results/ web_tests/expectations/ tests/web_api/full_coverage/results/ tests/web_api/full_coverage/expectations/ tests/web_api/results/ tests/web_api/expectations/ *_diff.png # Presets presets/ # Ignore existing dir of hand refiner if exists. annotator/hand_refiner_portable # Ruff linter cache dir .ruff_cache/ ================================================ FILE: LICENSE ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . ================================================ FILE: README.md ================================================ # ControlNet for Stable Diffusion WebUI The WebUI extension for ControlNet and other injection-based SD controls. ![image](https://github.com/Mikubill/sd-webui-controlnet/assets/20929282/261f9a50-ba9c-472f-b398-fced61929c4a) This extension is for AUTOMATIC1111's [Stable Diffusion web UI](https://github.com/AUTOMATIC1111/stable-diffusion-webui), allows the Web UI to add [ControlNet](https://github.com/lllyasviel/ControlNet) to the original Stable Diffusion model to generate images. The addition is on-the-fly, the merging is not required. # News - [2024-07-09] 🔥[v1.1.454] ControlNet union model support [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2989] - [2024-07-01] 🔥[v1.1.452] Depth Anything V2 - UDAV2 depth Preprocessor [Pull thread: https://github.com/Mikubill/sd-webui-controlnet/pull/2969] - [2024-05-19] 🔥[v1.1.449] Anyline Preprocessor & MistoLine SDXL model [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2907] - [2024-05-04] 🔥[v1.1.447] PuLID [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2841] - [2024-04-30] 🔥[v1.1.446] Effective region mask supported for ControlNet/IPAdapter [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2831] - [2024-04-27] 🔥ControlNet-lllite Normal Dsine released [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2813] - [2024-04-19] 🔥[v1.1.445] IPAdapter advanced weight [Instant Style] [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2770] - [2024-04-17] 🔥[v1.1.444] Marigold depth preprocessor [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2760] - [2024-04-15] 🔥ControlNet++ models released [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2778] - [2024-04-13] 🔥TTPLanet_SDXL_Controlnet_Tile_Realistic v2 released [[Civitai Page](https://civitai.com/models/330313/ttplanetsdxlcontrolnettilerealistic)] - [2024-03-31] 🔥[v1.1.443] IP-Adapter CLIP mask and ip-adapter-auto preprocessor [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2723] - [2024-03-20] 🔥IPAdapter Composition [Discussion thread: https://github.com/Mikubill/sd-webui-controlnet/discussions/2781] # Installation 1. Open "Extensions" tab. 2. Open "Install from URL" tab in the tab. 3. Enter `https://github.com/Mikubill/sd-webui-controlnet.git` to "URL for extension's git repository". 4. Press "Install" button. 5. Wait for 5 seconds, and you will see the message "Installed into stable-diffusion-webui\extensions\sd-webui-controlnet. Use Installed tab to restart". 6. Go to "Installed" tab, click "Check for updates", and then click "Apply and restart UI". (The next time you can also use these buttons to update ControlNet.) 7. Completely restart A1111 webui including your terminal. (If you do not know what is a "terminal", you can reboot your computer to achieve the same effect.) 8. Download models (see below). 9. After you put models in the correct folder, you may need to refresh to see the models. The refresh button is right to your "Model" dropdown. # Download Models You can find all download links here: https://github.com/Mikubill/sd-webui-controlnet/wiki/Model-download. # Features in ControlNet 1.1 ### Perfect Support for All ControlNet 1.0/1.1 and T2I Adapter Models. Now we have perfect support all available models and preprocessors, including perfect support for T2I style adapter and ControlNet 1.1 Shuffle. (Make sure that your YAML file names and model file names are same, see also YAML files in "stable-diffusion-webui\extensions\sd-webui-controlnet\models".) ### Perfect Support for A1111 High-Res. Fix Now if you turn on High-Res Fix in A1111, each controlnet will output two different control images: a small one and a large one. The small one is for your basic generating, and the big one is for your High-Res Fix generating. The two control images are computed by a smart algorithm called "super high-quality control image resampling". This is turned on by default, and you do not need to change any setting. ### Perfect Support for All A1111 Img2Img or Inpaint Settings and All Mask Types Now ControlNet is extensively tested with A1111's different types of masks, including "Inpaint masked"/"Inpaint not masked", and "Whole picture"/"Only masked", and "Only masked padding"&"Mask blur". The resizing perfectly matches A1111's "Just resize"/"Crop and resize"/"Resize and fill". This means you can use ControlNet in nearly everywhere in your A1111 UI without difficulty! ### The New "Pixel-Perfect" Mode Now if you turn on pixel-perfect mode, you do not need to set preprocessor (annotator) resolutions manually. The ControlNet will automatically compute the best annotator resolution for you so that each pixel perfectly matches Stable Diffusion. ### User-Friendly GUI and Preprocessor Preview We reorganized some previously confusing UI like "canvas width/height for new canvas" and it is in the 📝 button now. Now the preview GUI is controlled by the "allow preview" option and the trigger button 💥. The preview image size is better than before, and you do not need to scroll up and down - your a1111 GUI will not be messed up anymore! ### Support for Almost All Upscaling Scripts Now ControlNet 1.1 can support almost all Upscaling/Tile methods. ControlNet 1.1 support the script "Ultimate SD upscale" and almost all other tile-based extensions. Please do not confuse ["Ultimate SD upscale"](https://github.com/Coyote-A/ultimate-upscale-for-automatic1111) with "SD upscale" - they are different scripts. Note that the most recommended upscaling method is ["Tiled VAE/Diffusion"](https://github.com/pkuliyi2015/multidiffusion-upscaler-for-automatic1111) but we test as many methods/extensions as possible. Note that "SD upscale" is supported since 1.1.117, and if you use it, you need to leave all ControlNet images as blank (We do not recommend "SD upscale" since it is somewhat buggy and cannot be maintained - use the "Ultimate SD upscale" instead). ### More Control Modes (previously called Guess Mode) We have fixed many bugs in previous 1.0’s Guess Mode and now it is called Control Mode ![image](https://user-images.githubusercontent.com/19834515/236641759-6c44ddf6-c7ad-4bda-92be-e90a52911d75.png) Now you can control which aspect is more important (your prompt or your ControlNet): * "Balanced": ControlNet on both sides of CFG scale, same as turning off "Guess Mode" in ControlNet 1.0 * "My prompt is more important": ControlNet on both sides of CFG scale, with progressively reduced SD U-Net injections (layer_weight*=0.825**I, where 0<=I <13, and the 13 means ControlNet injected SD 13 times). In this way, you can make sure that your prompts are perfectly displayed in your generated images. * "ControlNet is more important": ControlNet only on the Conditional Side of CFG scale (the cond in A1111's batch-cond-uncond). This means the ControlNet will be X times stronger if your cfg-scale is X. For example, if your cfg-scale is 7, then ControlNet is 7 times stronger. Note that here the X times stronger is different from "Control Weights" since your weights are not modified. This "stronger" effect usually has less artifact and give ControlNet more room to guess what is missing from your prompts (and in the previous 1.0, it is called "Guess Mode").
Input (depth+canny+hed) "Balanced" "My prompt is more important" "ControlNet is more important"
### Reference-Only Control Now we have a `reference-only` preprocessor that does not require any control models. It can guide the diffusion directly using images as references. (Prompt "a dog running on grassland, best quality, ...") ![image](samples/ref.png) This method is similar to inpaint-based reference but it does not make your image disordered. Many professional A1111 users know a trick to diffuse image with references by inpaint. For example, if you have a 512x512 image of a dog, and want to generate another 512x512 image with the same dog, some users will connect the 512x512 dog image and a 512x512 blank image into a 1024x512 image, send to inpaint, and mask out the blank 512x512 part to diffuse a dog with similar appearance. However, that method is usually not very satisfying since images are connected and many distortions will appear. This `reference-only` ControlNet can directly link the attention layers of your SD to any independent images, so that your SD will read arbitrary images for reference. You need at least ControlNet 1.1.153 to use it. To use, just select `reference-only` as preprocessor and put an image. Your SD will just use the image as reference. *Note that this method is as "non-opinioned" as possible. It only contains very basic connection codes, without any personal preferences, to connect the attention layers with your reference images. However, even if we tried best to not include any opinioned codes, we still need to write some subjective implementations to deal with weighting, cfg-scale, etc - tech report is on the way.* More examples [here](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236). # Technical Documents See also the documents of ControlNet 1.1: https://github.com/lllyasviel/ControlNet-v1-1-nightly#model-specification # Default Setting This is my setting. If you run into any problem, you can use this setting as a sanity check ![image](https://user-images.githubusercontent.com/19834515/235620638-17937171-8ac1-45bc-a3cb-3aebf605b4ef.png) # Use Previous Models ### Use ControlNet 1.0 Models https://huggingface.co/lllyasviel/ControlNet/tree/main/models You can still use all previous models in the previous ControlNet 1.0. Now, the previous "depth" is now called "depth_midas", the previous "normal" is called "normal_midas", the previous "hed" is called "softedge_hed". And starting from 1.1, all line maps, edge maps, lineart maps, boundary maps will have black background and white lines. ### Use T2I-Adapter Models (From TencentARC/T2I-Adapter) To use T2I-Adapter models: 1. Download files from https://huggingface.co/TencentARC/T2I-Adapter/tree/main/models 2. Put them in "stable-diffusion-webui\extensions\sd-webui-controlnet\models". 3. Make sure that the file names of pth files and yaml files are consistent. *Note that "CoAdapter" is not implemented yet.* # Gallery The below results are from ControlNet 1.0. | Source | Input | Output | |:-------------------------:|:-------------------------:|:-------------------------:| | (no preprocessor) | | | | (no preprocessor) | | | | | | | | | | | | | | | | | | | The below examples are from T2I-Adapter. From `t2iadapter_color_sd14v1.pth` : | Source | Input | Output | |:-------------------------:|:-------------------------:|:-------------------------:| | | | | From `t2iadapter_style_sd14v1.pth` : | Source | Input | Output | |:-------------------------:|:-------------------------:|:-------------------------:| | | (clip, non-image) | | # Minimum Requirements * (Windows) (NVIDIA: Ampere) 4gb - with `--xformers` enabled, and `Low VRAM` mode ticked in the UI, goes up to 768x832 # Multi-ControlNet This option allows multiple ControlNet inputs for a single generation. To enable this option, change `Multi ControlNet: Max models amount (requires restart)` in the settings. Note that you will need to restart the WebUI for changes to take effect.
Source A Source B Output
# Control Weight/Start/End Weight is the weight of the controlnet "influence". It's analogous to prompt attention/emphasis. E.g. (myprompt: 1.2). Technically, it's the factor by which to multiply the ControlNet outputs before merging them with original SD Unet. Guidance Start/End is the percentage of total steps the controlnet applies (guidance strength = guidance end). It's analogous to prompt editing/shifting. E.g. \[myprompt::0.8\] (It applies from the beginning until 80% of total steps) # Batch Mode Put any unit into batch mode to activate batch mode for all units. Specify a batch directory for each unit, or use the new textbox in the img2img batch tab as a fallback. Although the textbox is located in the img2img batch tab, you can use it to generate images in the txt2img tab as well. Note that this feature is only available in the gradio user interface. Call the APIs as many times as you want for custom batch scheduling. # API and Script Access This extension can accept txt2img or img2img tasks via API or external extension call. Note that you may need to enable `Allow other scripts to control this extension` in settings for external calls. To use the API: start WebUI with argument `--api` and go to `http://webui-address/docs` for documents or checkout [examples](https://github.com/Mikubill/sd-webui-controlnet/blob/main/example/txt2img_example/api_txt2img.py). To use external call: Checkout [Wiki](https://github.com/Mikubill/sd-webui-controlnet/wiki/API) # Command Line Arguments This extension adds these command line arguments to the webui: ``` --controlnet-dir ADD a controlnet models directory --controlnet-annotator-models-path SET the directory for annotator models --no-half-controlnet load controlnet models in full precision --controlnet-preprocessor-cache-size Cache size for controlnet preprocessor results --controlnet-loglevel Log level for the controlnet extension --controlnet-tracemalloc Enable malloc memory tracing ``` # MacOS Support Tested with pytorch nightly: https://github.com/Mikubill/sd-webui-controlnet/pull/143#issuecomment-1435058285 To use this extension with mps and normal pytorch, currently you may need to start WebUI with `--no-half`. # Archive of Deprecated Versions The previous version (sd-webui-controlnet 1.0) is archived in https://github.com/lllyasviel/webui-controlnet-v1-archived Using this version is not a temporary stop of updates. You will stop all updates forever. Please consider this version if you work with professional studios that requires 100% reproducing of all previous results pixel by pixel. # Thanks This implementation is inspired by kohya-ss/sd-webui-additional-networks ================================================ FILE: annotator/__init__.py ================================================ ================================================ FILE: annotator/anime_face_segment/LICENSE ================================================ MIT License Copyright (c) 2021 Miaomiao Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/anime_face_segment/__init__.py ================================================ import os import torch import torch.nn as nn import torch.nn.functional as F from PIL import Image import fnmatch import cv2 import sys import numpy as np from modules import devices from einops import rearrange from annotator.annotator_path import models_path import torchvision from torchvision.models import MobileNet_V2_Weights from torchvision import transforms COLOR_BACKGROUND = (255,255,0) COLOR_HAIR = (0,0,255) COLOR_EYE = (255,0,0) COLOR_MOUTH = (255,255,255) COLOR_FACE = (0,255,0) COLOR_SKIN = (0,255,255) COLOR_CLOTHES = (255,0,255) PALETTE = [COLOR_BACKGROUND,COLOR_HAIR,COLOR_EYE,COLOR_MOUTH,COLOR_FACE,COLOR_SKIN,COLOR_CLOTHES] class UNet(nn.Module): def __init__(self): super(UNet, self).__init__() self.NUM_SEG_CLASSES = 7 # Background, hair, face, eye, mouth, skin, clothes mobilenet_v2 = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1) mob_blocks = mobilenet_v2.features # Encoder self.en_block0 = nn.Sequential( # in_ch=3 out_ch=16 mob_blocks[0], mob_blocks[1] ) self.en_block1 = nn.Sequential( # in_ch=16 out_ch=24 mob_blocks[2], mob_blocks[3], ) self.en_block2 = nn.Sequential( # in_ch=24 out_ch=32 mob_blocks[4], mob_blocks[5], mob_blocks[6], ) self.en_block3 = nn.Sequential( # in_ch=32 out_ch=96 mob_blocks[7], mob_blocks[8], mob_blocks[9], mob_blocks[10], mob_blocks[11], mob_blocks[12], mob_blocks[13], ) self.en_block4 = nn.Sequential( # in_ch=96 out_ch=160 mob_blocks[14], mob_blocks[15], mob_blocks[16], ) # Decoder self.de_block4 = nn.Sequential( # in_ch=160 out_ch=96 nn.UpsamplingNearest2d(scale_factor=2), nn.Conv2d(160, 96, kernel_size=3, padding=1), nn.InstanceNorm2d(96), nn.LeakyReLU(0.1), nn.Dropout(p=0.2) ) self.de_block3 = nn.Sequential( # in_ch=96x2 out_ch=32 nn.UpsamplingNearest2d(scale_factor=2), nn.Conv2d(96*2, 32, kernel_size=3, padding=1), nn.InstanceNorm2d(32), nn.LeakyReLU(0.1), nn.Dropout(p=0.2) ) self.de_block2 = nn.Sequential( # in_ch=32x2 out_ch=24 nn.UpsamplingNearest2d(scale_factor=2), nn.Conv2d(32*2, 24, kernel_size=3, padding=1), nn.InstanceNorm2d(24), nn.LeakyReLU(0.1), nn.Dropout(p=0.2) ) self.de_block1 = nn.Sequential( # in_ch=24x2 out_ch=16 nn.UpsamplingNearest2d(scale_factor=2), nn.Conv2d(24*2, 16, kernel_size=3, padding=1), nn.InstanceNorm2d(16), nn.LeakyReLU(0.1), nn.Dropout(p=0.2) ) self.de_block0 = nn.Sequential( # in_ch=16x2 out_ch=7 nn.UpsamplingNearest2d(scale_factor=2), nn.Conv2d(16*2, self.NUM_SEG_CLASSES, kernel_size=3, padding=1), nn.Softmax2d() ) def forward(self, x): e0 = self.en_block0(x) e1 = self.en_block1(e0) e2 = self.en_block2(e1) e3 = self.en_block3(e2) e4 = self.en_block4(e3) d4 = self.de_block4(e4) d4 = F.interpolate(d4, size=e3.size()[2:], mode='bilinear', align_corners=True) c4 = torch.cat((d4,e3),1) d3 = self.de_block3(c4) d3 = F.interpolate(d3, size=e2.size()[2:], mode='bilinear', align_corners=True) c3 = torch.cat((d3,e2),1) d2 = self.de_block2(c3) d2 = F.interpolate(d2, size=e1.size()[2:], mode='bilinear', align_corners=True) c2 =torch.cat((d2,e1),1) d1 = self.de_block1(c2) d1 = F.interpolate(d1, size=e0.size()[2:], mode='bilinear', align_corners=True) c1 = torch.cat((d1,e0),1) y = self.de_block0(c1) return y class AnimeFaceSegment: model_dir = os.path.join(models_path, "anime_face_segment") def __init__(self): self.model = None self.device = devices.get_device_for("controlnet") def load_model(self): remote_model_path = "https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/resolve/main/Annotators/UNet.pth" modelpath = os.path.join(self.model_dir, "UNet.pth") if not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) net = UNet() ckpt = torch.load(modelpath, map_location=self.device) for key in list(ckpt.keys()): if 'module.' in key: ckpt[key.replace('module.', '')] = ckpt[key] del ckpt[key] net.load_state_dict(ckpt) net.eval() self.model = net.to(self.device) def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, input_image): if self.model is None: self.load_model() self.model.to(self.device) transform = transforms.Compose([ transforms.Resize(512,interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(),]) img = Image.fromarray(input_image) with torch.no_grad(): img = transform(img).unsqueeze(dim=0).to(self.device) seg = self.model(img).squeeze(dim=0) seg = seg.cpu().detach().numpy() img = rearrange(seg,'h w c -> w c h') img = [[PALETTE[np.argmax(val)] for val in buf]for buf in img] return np.array(img).astype(np.uint8) ================================================ FILE: annotator/annotator_path.py ================================================ import os from modules import shared models_path = shared.opts.data.get('control_net_modules_path', None) if not models_path: models_path = getattr(shared.cmd_opts, 'controlnet_annotator_models_path', None) if not models_path: models_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'downloads') if not os.path.isabs(models_path): models_path = os.path.join(shared.data_path, models_path) clip_vision_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'clip_vision') # clip vision is always inside controlnet "extensions\sd-webui-controlnet" # and any problem can be solved by removing controlnet and reinstall models_path = os.path.realpath(models_path) os.makedirs(models_path, exist_ok=True) print(f'ControlNet preprocessor location: {models_path}') # Make sure that the default location is inside controlnet "extensions\sd-webui-controlnet" # so that any problem can be solved by removing controlnet and reinstall # if users do not change configs on their own (otherwise users will know what is wrong) ================================================ FILE: annotator/binary/__init__.py ================================================ import cv2 def apply_binary(img, bin_threshold): img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) if bin_threshold == 0 or bin_threshold == 255: # Otsu's threshold otsu_threshold, img_bin = cv2.threshold(img_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) print("Otsu threshold:", otsu_threshold) else: _, img_bin = cv2.threshold(img_gray, bin_threshold, 255, cv2.THRESH_BINARY_INV) return cv2.cvtColor(img_bin, cv2.COLOR_GRAY2RGB) ================================================ FILE: annotator/canny/__init__.py ================================================ import cv2 def apply_canny(img, low_threshold, high_threshold): return cv2.Canny(img, low_threshold, high_threshold) ================================================ FILE: annotator/clipvision/__init__.py ================================================ import os import cv2 import torch import numpy as np from einops import rearrange from modules import devices from annotator.annotator_path import models_path from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, CLIPImageProcessor try: from modules.modelloader import load_file_from_url except ImportError: # backward compability for webui < 1.5.0 from scripts.utils import load_file_from_url config_clip_g = { "attention_dropout": 0.0, "dropout": 0.0, "hidden_act": "gelu", "hidden_size": 1664, "image_size": 224, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 8192, "layer_norm_eps": 1e-05, "model_type": "clip_vision_model", "num_attention_heads": 16, "num_channels": 3, "num_hidden_layers": 48, "patch_size": 14, "projection_dim": 1280, "torch_dtype": "float32" } config_clip_h = { "attention_dropout": 0.0, "dropout": 0.0, "hidden_act": "gelu", "hidden_size": 1280, "image_size": 224, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 5120, "layer_norm_eps": 1e-05, "model_type": "clip_vision_model", "num_attention_heads": 16, "num_channels": 3, "num_hidden_layers": 32, "patch_size": 14, "projection_dim": 1024, "torch_dtype": "float32" } config_clip_vitl = { "attention_dropout": 0.0, "dropout": 0.0, "hidden_act": "quick_gelu", "hidden_size": 1024, "image_size": 224, "initializer_factor": 1.0, "initializer_range": 0.02, "intermediate_size": 4096, "layer_norm_eps": 1e-05, "model_type": "clip_vision_model", "num_attention_heads": 16, "num_channels": 3, "num_hidden_layers": 24, "patch_size": 14, "projection_dim": 768, "torch_dtype": "float32" } configs = { 'clip_g': config_clip_g, 'clip_h': config_clip_h, 'clip_vitl': config_clip_vitl, } downloads = { 'clip_vitl': 'https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin', 'clip_g': 'https://huggingface.co/lllyasviel/Annotators/resolve/main/clip_g.pth', 'clip_h': 'https://huggingface.co/h94/IP-Adapter/resolve/main/models/image_encoder/pytorch_model.bin' } clip_vision_h_uc = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'clip_vision_h_uc.data') clip_vision_h_uc = torch.load(clip_vision_h_uc, map_location=devices.get_device_for("controlnet") if torch.cuda.is_available() else torch.device('cpu'))['uc'] clip_vision_vith_uc = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'clip_vision_vith_uc.data') clip_vision_vith_uc = torch.load(clip_vision_vith_uc, map_location=devices.get_device_for("controlnet") if torch.cuda.is_available() else torch.device('cpu'))['uc'] class ClipVisionDetector: def __init__(self, config, low_vram: bool): assert config in downloads self.download_link = downloads[config] self.model_path = os.path.join(models_path, 'clip_vision') self.file_name = config + '.pth' self.config = configs[config] self.device = ( torch.device("cpu") if low_vram else devices.get_device_for("controlnet") ) os.makedirs(self.model_path, exist_ok=True) file_path = os.path.join(self.model_path, self.file_name) if not os.path.exists(file_path): load_file_from_url(url=self.download_link, model_dir=self.model_path, file_name=self.file_name) config = CLIPVisionConfig(**self.config) self.model = CLIPVisionModelWithProjection(config) self.processor = CLIPImageProcessor(crop_size=224, do_center_crop=True, do_convert_rgb=True, do_normalize=True, do_resize=True, image_mean=[0.48145466, 0.4578275, 0.40821073], image_std=[0.26862954, 0.26130258, 0.27577711], resample=3, size=224) sd = torch.load(file_path, map_location=self.device) self.model.load_state_dict(sd, strict=False) del sd self.model.to(self.device) self.model.eval() def unload_model(self): if self.model is not None: self.model.to('meta') def __call__(self, input_image: np.ndarray): assert isinstance(input_image, np.ndarray) with torch.no_grad(): mask = None input_image = cv2.resize(input_image, (224, 224), interpolation=cv2.INTER_AREA) if input_image.shape[2] == 4: # Has alpha channel. mask = 255 - input_image[:, :, 3:4] # Invert mask input_image = input_image[:, :, :3] feat = self.processor(images=input_image, return_tensors="pt") feat['pixel_values'] = feat['pixel_values'].to(self.device) # Apply CLIP mask. if mask is not None: mask_tensor = torch.from_numpy(mask).to(self.device).float() / 255.0 feat['pixel_values'] *= rearrange(mask_tensor, "h w c -> 1 c h w") result = self.model(**feat, output_hidden_states=True) result['hidden_states'] = [v.to(self.device) for v in result['hidden_states']] result = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in result.items()} return result ================================================ FILE: annotator/color/__init__.py ================================================ import cv2 def cv2_resize_shortest_edge(image, size): h, w = image.shape[:2] if h < w: new_h = size new_w = int(round(w / h * size)) else: new_w = size new_h = int(round(h / w * size)) resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA) return resized_image def apply_color(img, res=512): img = cv2_resize_shortest_edge(img, res) h, w = img.shape[:2] input_img_color = cv2.resize(img, (w//64, h//64), interpolation=cv2.INTER_CUBIC) input_img_color = cv2.resize(input_img_color, (w, h), interpolation=cv2.INTER_NEAREST) return input_img_color ================================================ FILE: annotator/densepose/__init__.py ================================================ import torchvision # Fix issue Unknown builtin op: torchvision::nms import cv2 import numpy as np import torch from einops import rearrange from .densepose import DensePoseMaskedColormapResultsVisualizer, _extract_i_from_iuvarr, densepose_chart_predictor_output_to_result_with_confidences from modules import devices from annotator.annotator_path import models_path import os N_PART_LABELS = 24 result_visualizer = DensePoseMaskedColormapResultsVisualizer( alpha=1, data_extractor=_extract_i_from_iuvarr, segm_extractor=_extract_i_from_iuvarr, val_scale = 255.0 / N_PART_LABELS ) remote_torchscript_path = "https://huggingface.co/LayerNorm/DensePose-TorchScript-with-hint-image/resolve/main/densepose_r50_fpn_dl.torchscript" torchscript_model = None model_dir = os.path.join(models_path, "densepose") def apply_densepose(input_image, cmap="viridis"): global torchscript_model if torchscript_model is None: model_path = os.path.join(model_dir, "densepose_r50_fpn_dl.torchscript") if not os.path.exists(model_path): from scripts.utils import load_file_from_url load_file_from_url(remote_torchscript_path, model_dir=model_dir) torchscript_model = torch.jit.load(model_path, map_location="cpu").to(devices.get_device_for("controlnet")).eval() H, W = input_image.shape[:2] hint_image_canvas = np.zeros([H, W], dtype=np.uint8) hint_image_canvas = np.tile(hint_image_canvas[:, :, np.newaxis], [1, 1, 3]) input_image = rearrange(torch.from_numpy(input_image).to(devices.get_device_for("controlnet")), 'h w c -> c h w') pred_boxes, corase_segm, fine_segm, u, v = torchscript_model(input_image) extractor = densepose_chart_predictor_output_to_result_with_confidences densepose_results = [extractor(pred_boxes[i:i+1], corase_segm[i:i+1], fine_segm[i:i+1], u[i:i+1], v[i:i+1]) for i in range(len(pred_boxes))] if cmap=="viridis": result_visualizer.mask_visualizer.cmap = cv2.COLORMAP_VIRIDIS hint_image = result_visualizer.visualize(hint_image_canvas, densepose_results) hint_image = cv2.cvtColor(hint_image, cv2.COLOR_BGR2RGB) hint_image[:, :, 0][hint_image[:, :, 0] == 0] = 68 hint_image[:, :, 1][hint_image[:, :, 1] == 0] = 1 hint_image[:, :, 2][hint_image[:, :, 2] == 0] = 84 else: result_visualizer.mask_visualizer.cmap = cv2.COLORMAP_PARULA hint_image = result_visualizer.visualize(hint_image_canvas, densepose_results) hint_image = cv2.cvtColor(hint_image, cv2.COLOR_BGR2RGB) return hint_image def unload_model(): global torchscript_model if torchscript_model is not None: torchscript_model.cpu() ================================================ FILE: annotator/densepose/densepose.py ================================================ from typing import Tuple import math import numpy as np from enum import IntEnum from typing import List, Tuple, Union import torch from torch.nn import functional as F import logging import cv2 Image = np.ndarray Boxes = torch.Tensor ImageSizeType = Tuple[int, int] _RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray] IntTupleBox = Tuple[int, int, int, int] class BoxMode(IntEnum): """ Enum of different ways to represent a box. """ XYXY_ABS = 0 """ (x0, y0, x1, y1) in absolute floating points coordinates. The coordinates in range [0, width or height]. """ XYWH_ABS = 1 """ (x0, y0, w, h) in absolute floating points coordinates. """ XYXY_REL = 2 """ Not yet supported! (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image. """ XYWH_REL = 3 """ Not yet supported! (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image. """ XYWHA_ABS = 4 """ (xc, yc, w, h, a) in absolute floating points coordinates. (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw. """ @staticmethod def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType: """ Args: box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5 from_mode, to_mode (BoxMode) Returns: The converted box of the same type. """ if from_mode == to_mode: return box original_type = type(box) is_numpy = isinstance(box, np.ndarray) single_box = isinstance(box, (list, tuple)) if single_box: assert len(box) == 4 or len(box) == 5, ( "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor," " where k == 4 or 5" ) arr = torch.tensor(box)[None, :] else: # avoid modifying the input box if is_numpy: arr = torch.from_numpy(np.asarray(box)).clone() else: arr = box.clone() assert to_mode not in [BoxMode.XYXY_REL, BoxMode.XYWH_REL] and from_mode not in [ BoxMode.XYXY_REL, BoxMode.XYWH_REL, ], "Relative mode not yet supported!" if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS: assert ( arr.shape[-1] == 5 ), "The last dimension of input shape must be 5 for XYWHA format" original_dtype = arr.dtype arr = arr.double() w = arr[:, 2] h = arr[:, 3] a = arr[:, 4] c = torch.abs(torch.cos(a * math.pi / 180.0)) s = torch.abs(torch.sin(a * math.pi / 180.0)) # This basically computes the horizontal bounding rectangle of the rotated box new_w = c * w + s * h new_h = c * h + s * w # convert center to top-left corner arr[:, 0] -= new_w / 2.0 arr[:, 1] -= new_h / 2.0 # bottom-right corner arr[:, 2] = arr[:, 0] + new_w arr[:, 3] = arr[:, 1] + new_h arr = arr[:, :4].to(dtype=original_dtype) elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS: original_dtype = arr.dtype arr = arr.double() arr[:, 0] += arr[:, 2] / 2.0 arr[:, 1] += arr[:, 3] / 2.0 angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype) arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype) else: if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS: arr[:, 2] += arr[:, 0] arr[:, 3] += arr[:, 1] elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS: arr[:, 2] -= arr[:, 0] arr[:, 3] -= arr[:, 1] else: raise NotImplementedError( "Conversion from BoxMode {} to {} is not supported yet".format( from_mode, to_mode ) ) if single_box: return original_type(arr.flatten().tolist()) if is_numpy: return arr.numpy() else: return arr class MatrixVisualizer: """ Base visualizer for matrix data """ def __init__( self, inplace=True, cmap=cv2.COLORMAP_PARULA, val_scale=1.0, alpha=0.7, interp_method_matrix=cv2.INTER_LINEAR, interp_method_mask=cv2.INTER_NEAREST, ): self.inplace = inplace self.cmap = cmap self.val_scale = val_scale self.alpha = alpha self.interp_method_matrix = interp_method_matrix self.interp_method_mask = interp_method_mask def visualize(self, image_bgr, mask, matrix, bbox_xywh): self._check_image(image_bgr) self._check_mask_matrix(mask, matrix) if self.inplace: image_target_bgr = image_bgr else: image_target_bgr = image_bgr * 0 x, y, w, h = [int(v) for v in bbox_xywh] if w <= 0 or h <= 0: return image_bgr mask, matrix = self._resize(mask, matrix, w, h) mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3]) matrix_scaled = matrix.astype(np.float32) * self.val_scale _EPSILON = 1e-6 if np.any(matrix_scaled > 255 + _EPSILON): logger = logging.getLogger(__name__) logger.warning( f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]" ) matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8) matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap) matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg] image_target_bgr[y : y + h, x : x + w, :] = ( image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha ) return image_target_bgr.astype(np.uint8) def _resize(self, mask, matrix, w, h): if (w != mask.shape[1]) or (h != mask.shape[0]): mask = cv2.resize(mask, (w, h), self.interp_method_mask) if (w != matrix.shape[1]) or (h != matrix.shape[0]): matrix = cv2.resize(matrix, (w, h), self.interp_method_matrix) return mask, matrix def _check_image(self, image_rgb): assert len(image_rgb.shape) == 3 assert image_rgb.shape[2] == 3 assert image_rgb.dtype == np.uint8 def _check_mask_matrix(self, mask, matrix): assert len(matrix.shape) == 2 assert len(mask.shape) == 2 assert mask.dtype == np.uint8 class DensePoseResultsVisualizer: def visualize( self, image_bgr: Image, results, ) -> Image: context = self.create_visualization_context(image_bgr) for i, result in enumerate(results): boxes_xywh, labels, uv = result iuv_array = torch.cat( (labels[None].type(torch.float32), uv * 255.0) ).type(torch.uint8) self.visualize_iuv_arr(context, iuv_array.cpu().numpy(), boxes_xywh) image_bgr = self.context_to_image_bgr(context) return image_bgr def create_visualization_context(self, image_bgr: Image): return image_bgr def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None: pass def context_to_image_bgr(self, context): return context def get_image_bgr_from_context(self, context): return context class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer): def __init__( self, data_extractor, segm_extractor, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, val_scale=1.0, **kwargs, ): self.mask_visualizer = MatrixVisualizer( inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha ) self.data_extractor = data_extractor self.segm_extractor = segm_extractor def context_to_image_bgr(self, context): return context def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None: image_bgr = self.get_image_bgr_from_context(context) matrix = self.data_extractor(iuv_arr) segm = self.segm_extractor(iuv_arr) mask = np.zeros(matrix.shape, dtype=np.uint8) mask[segm > 0] = 1 image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh) def _extract_i_from_iuvarr(iuv_arr): return iuv_arr[0, :, :] def _extract_u_from_iuvarr(iuv_arr): return iuv_arr[1, :, :] def _extract_v_from_iuvarr(iuv_arr): return iuv_arr[2, :, :] def make_int_box(box: torch.Tensor) -> IntTupleBox: int_box = [0, 0, 0, 0] int_box[0], int_box[1], int_box[2], int_box[3] = tuple(box.long().tolist()) return int_box[0], int_box[1], int_box[2], int_box[3] def densepose_chart_predictor_output_to_result_with_confidences( boxes: Boxes, coarse_segm, fine_segm, u, v ): boxes_xyxy_abs = boxes.clone() boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) box_xywh = make_int_box(boxes_xywh_abs[0]) labels = resample_fine_and_coarse_segm_tensors_to_bbox(fine_segm, coarse_segm, box_xywh).squeeze(0) uv = resample_uv_tensors_to_bbox(u, v, labels, box_xywh) confidences = [] return box_xywh, labels, uv def resample_fine_and_coarse_segm_tensors_to_bbox( fine_segm: torch.Tensor, coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox ): """ Resample fine and coarse segmentation tensors to the given bounding box and derive labels for each pixel of the bounding box Args: fine_segm: float tensor of shape [1, C, Hout, Wout] coarse_segm: float tensor of shape [1, K, Hout, Wout] box_xywh_abs (tuple of 4 int): bounding box given by its upper-left corner coordinates, width (W) and height (H) Return: Labels for each pixel of the bounding box, a long tensor of size [1, H, W] """ x, y, w, h = box_xywh_abs w = max(int(w), 1) h = max(int(h), 1) # coarse segmentation coarse_segm_bbox = F.interpolate( coarse_segm, (h, w), mode="bilinear", align_corners=False, ).argmax(dim=1) # combined coarse and fine segmentation labels = ( F.interpolate(fine_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1) * (coarse_segm_bbox > 0).long() ) return labels def resample_uv_tensors_to_bbox( u: torch.Tensor, v: torch.Tensor, labels: torch.Tensor, box_xywh_abs: IntTupleBox, ) -> torch.Tensor: """ Resamples U and V coordinate estimates for the given bounding box Args: u (tensor [1, C, H, W] of float): U coordinates v (tensor [1, C, H, W] of float): V coordinates labels (tensor [H, W] of long): labels obtained by resampling segmentation outputs for the given bounding box box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs Return: Resampled U and V coordinates - a tensor [2, H, W] of float """ x, y, w, h = box_xywh_abs w = max(int(w), 1) h = max(int(h), 1) u_bbox = F.interpolate(u, (h, w), mode="bilinear", align_corners=False) v_bbox = F.interpolate(v, (h, w), mode="bilinear", align_corners=False) uv = torch.zeros([2, h, w], dtype=torch.float32, device=u.device) for part_id in range(1, u_bbox.size(1)): uv[0][labels == part_id] = u_bbox[0, part_id][labels == part_id] uv[1][labels == part_id] = v_bbox[0, part_id][labels == part_id] return uv ================================================ FILE: annotator/depth_anything.py ================================================ import os import torch import cv2 import numpy as np import torch.nn.functional as F from torchvision.transforms import Compose from depth_anything.dpt import DPT_DINOv2 from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet from .util import load_model from .annotator_path import models_path transform = Compose( [ Resize( width=518, height=518, resize_target=False, keep_aspect_ratio=True, ensure_multiple_of=14, resize_method="lower_bound", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), PrepareForNet(), ] ) class DepthAnythingDetector: """https://github.com/LiheYoung/Depth-Anything""" model_dir = os.path.join(models_path, "depth_anything") def __init__(self, device: torch.device): self.device = device self.model = ( DPT_DINOv2( encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024], localhub=False, ) .to(device) .eval() ) remote_url = os.environ.get( "CONTROLNET_DEPTH_ANYTHING_MODEL_URL", "https://huggingface.co/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vitl14.pth", ) model_path = load_model( "depth_anything_vitl14.pth", remote_url=remote_url, model_dir=self.model_dir ) self.model.load_state_dict(torch.load(model_path)) def __call__(self, image: np.ndarray, colored: bool = True) -> np.ndarray: self.model.to(self.device) h, w = image.shape[:2] image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 image = transform({"image": image})["image"] image = torch.from_numpy(image).unsqueeze(0).to(self.device) @torch.no_grad() def predict_depth(model, image): return model(image) depth = predict_depth(self.model, image) depth = F.interpolate( depth[None], (h, w), mode="bilinear", align_corners=False )[0, 0] depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 depth = depth.cpu().numpy().astype(np.uint8) if colored: return cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1] else: return depth def unload_model(self): self.model.to("cpu") ================================================ FILE: annotator/depth_anything_v2.py ================================================ import os import torch import cv2 import numpy as np import torch.nn.functional as F from torchvision.transforms import Compose from safetensors.torch import load_file from depth_anything_v2.dpt import DepthAnythingV2 from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet from .util import load_model from .annotator_path import models_path transform = Compose( [ Resize( width=518, height=518, resize_target=False, keep_aspect_ratio=True, ensure_multiple_of=14, resize_method="lower_bound", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), PrepareForNet(), ] ) class DepthAnythingV2Detector: """https://github.com/MackinationsAi/Upgraded-Depth-Anything-V2""" model_dir = os.path.join(models_path, "depth_anything_v2") def __init__(self, device: torch.device): self.device = device self.model = ( DepthAnythingV2( encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024], ) .to(device) .eval() ) remote_url = os.environ.get( "CONTROLNET_DEPTH_ANYTHING_V2_MODEL_URL", "https://huggingface.co/MackinationsAi/Depth-Anything-V2_Safetensors/resolve/main/depth_anything_v2_vitl.safetensors", ) model_path = load_model( "depth_anything_v2_vitl.safetensors", remote_url=remote_url, model_dir=self.model_dir ) self.model.load_state_dict(load_file(model_path)) def __call__(self, image: np.ndarray, colored: bool = True) -> np.ndarray: self.model.to(self.device) h, w = image.shape[:2] image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0 image = transform({"image": image})["image"] image = torch.from_numpy(image).unsqueeze(0).to(self.device) @torch.no_grad() def predict_depth(model, image): return model(image) depth = predict_depth(self.model, image) depth = F.interpolate( depth[None], (h, w), mode="bilinear", align_corners=False )[0, 0] depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0 depth = depth.cpu().numpy().astype(np.uint8) if colored: depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)[:, :, ::-1] return depth_color else: return depth def unload_model(self): self.model.to("cpu") ================================================ FILE: annotator/hed/__init__.py ================================================ # This is an improved version and model of HED edge detection with Apache License, Version 2.0. # Please use this implementation in your products # This implementation may produce slightly different results from Saining Xie's official implementations, # but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations. # Different from official models and other implementations, this is an RGB-input model (rather than BGR) # and in this way it works better for gradio's RGB protocol import os import cv2 import torch import numpy as np from einops import rearrange import os from modules import devices from annotator.annotator_path import models_path from annotator.util import safe_step, nms class DoubleConvBlock(torch.nn.Module): def __init__(self, input_channel, output_channel, layer_number): super().__init__() self.convs = torch.nn.Sequential() self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1)) for i in range(1, layer_number): self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1)) self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0) def __call__(self, x, down_sampling=False): h = x if down_sampling: h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2)) for conv in self.convs: h = conv(h) h = torch.nn.functional.relu(h) return h, self.projection(h) class ControlNetHED_Apache2(torch.nn.Module): def __init__(self): super().__init__() self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1))) self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2) self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2) self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3) self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3) self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3) def __call__(self, x): h = x - self.norm h, projection1 = self.block1(h) h, projection2 = self.block2(h, down_sampling=True) h, projection3 = self.block3(h, down_sampling=True) h, projection4 = self.block4(h, down_sampling=True) h, projection5 = self.block5(h, down_sampling=True) return projection1, projection2, projection3, projection4, projection5 netNetwork = None remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth" modeldir = os.path.join(models_path, "hed") old_modeldir = os.path.dirname(os.path.realpath(__file__)) def apply_hed(input_image, is_safe=False): global netNetwork if netNetwork is None: modelpath = os.path.join(modeldir, "ControlNetHED.pth") old_modelpath = os.path.join(old_modeldir, "ControlNetHED.pth") if os.path.exists(old_modelpath): modelpath = old_modelpath elif not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=modeldir) netNetwork = ControlNetHED_Apache2().to(devices.get_device_for("controlnet")) netNetwork.load_state_dict(torch.load(modelpath, map_location='cpu')) netNetwork.to(devices.get_device_for("controlnet")).float().eval() assert input_image.ndim == 3 H, W, C = input_image.shape with torch.no_grad(): image_hed = torch.from_numpy(input_image.copy()).float().to(devices.get_device_for("controlnet")) image_hed = rearrange(image_hed, 'h w c -> 1 c h w') edges = netNetwork(image_hed) edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges] edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges] edges = np.stack(edges, axis=2) edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64))) if is_safe: edge = safe_step(edge) edge = (edge * 255.0).clip(0, 255).astype(np.uint8) return edge def unload_hed_model(): global netNetwork if netNetwork is not None: netNetwork.cpu() ================================================ FILE: annotator/keypose/__init__.py ================================================ import numpy as np import cv2 import torch import os from modules import devices from annotator.annotator_path import models_path import mmcv from mmdet.apis import inference_detector, init_detector from mmpose.apis import inference_top_down_pose_model from mmpose.apis import init_pose_model, process_mmdet_results, vis_pose_result def preprocessing(image, device): # Resize scale = 640 / max(image.shape[:2]) image = cv2.resize(image, dsize=None, fx=scale, fy=scale) raw_image = image.astype(np.uint8) # Subtract mean values image = image.astype(np.float32) image -= np.array( [ float(104.008), float(116.669), float(122.675), ] ) # Convert to torch.Tensor and add "batch" axis image = torch.from_numpy(image.transpose(2, 0, 1)).float().unsqueeze(0) image = image.to(device) return image, raw_image def imshow_keypoints(img, pose_result, skeleton=None, kpt_score_thr=0.1, pose_kpt_color=None, pose_link_color=None, radius=4, thickness=1): """Draw keypoints and links on an image. Args: img (ndarry): The image to draw poses on. pose_result (list[kpts]): The poses to draw. Each element kpts is a set of K keypoints as an Kx3 numpy.ndarray, where each keypoint is represented as x, y, score. kpt_score_thr (float, optional): Minimum score of keypoints to be shown. Default: 0.3. pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None, the keypoint will not be drawn. pose_link_color (np.array[Mx3]): Color of M links. If None, the links will not be drawn. thickness (int): Thickness of lines. """ img_h, img_w, _ = img.shape img = np.zeros(img.shape) for idx, kpts in enumerate(pose_result): if idx > 1: continue kpts = kpts['keypoints'] # print(kpts) kpts = np.array(kpts, copy=False) # draw each point on image if pose_kpt_color is not None: assert len(pose_kpt_color) == len(kpts) for kid, kpt in enumerate(kpts): x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] if kpt_score < kpt_score_thr or pose_kpt_color[kid] is None: # skip the point that should not be drawn continue color = tuple(int(c) for c in pose_kpt_color[kid]) cv2.circle(img, (int(x_coord), int(y_coord)), radius, color, -1) # draw links if skeleton is not None and pose_link_color is not None: assert len(pose_link_color) == len(skeleton) for sk_id, sk in enumerate(skeleton): pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) if (pos1[0] <= 0 or pos1[0] >= img_w or pos1[1] <= 0 or pos1[1] >= img_h or pos2[0] <= 0 or pos2[0] >= img_w or pos2[1] <= 0 or pos2[1] >= img_h or kpts[sk[0], 2] < kpt_score_thr or kpts[sk[1], 2] < kpt_score_thr or pose_link_color[sk_id] is None): # skip the link that should not be drawn continue color = tuple(int(c) for c in pose_link_color[sk_id]) cv2.line(img, pos1, pos2, color, thickness=thickness) return img human_det, pose_model = None, None det_model_path = "https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth" pose_model_path = "https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth" modeldir = os.path.join(models_path, "keypose") old_modeldir = os.path.dirname(os.path.realpath(__file__)) det_config = 'faster_rcnn_r50_fpn_coco.py' pose_config = 'hrnet_w48_coco_256x192.py' det_checkpoint = 'faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth' pose_checkpoint = 'hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth' det_cat_id = 1 bbox_thr = 0.2 skeleton = [ [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6] ] pose_kpt_color = [ [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0] ] pose_link_color = [ [0, 255, 0], [0, 255, 0], [255, 128, 0], [255, 128, 0], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255] ] def find_download_model(checkpoint, remote_path): modelpath = os.path.join(modeldir, checkpoint) old_modelpath = os.path.join(old_modeldir, checkpoint) if os.path.exists(old_modelpath): modelpath = old_modelpath elif not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_path, model_dir=modeldir) return modelpath def apply_keypose(input_image): global human_det, pose_model if netNetwork is None: det_model_local = find_download_model(det_checkpoint, det_model_path) hrnet_model_local = find_download_model(pose_checkpoint, pose_model_path) det_config_mmcv = mmcv.Config.fromfile(det_config) pose_config_mmcv = mmcv.Config.fromfile(pose_config) human_det = init_detector(det_config_mmcv, det_model_local, device=devices.get_device_for("controlnet")) pose_model = init_pose_model(pose_config_mmcv, hrnet_model_local, device=devices.get_device_for("controlnet")) assert input_image.ndim == 3 input_image = input_image.copy() with torch.no_grad(): image = torch.from_numpy(input_image).float().to(devices.get_device_for("controlnet")) image = image / 255.0 mmdet_results = inference_detector(human_det, image) # keep the person class bounding boxes. person_results = process_mmdet_results(mmdet_results, det_cat_id) return_heatmap = False dataset = pose_model.cfg.data['test']['type'] # e.g. use ('backbone', ) to return backbone feature output_layer_names = None pose_results, _ = inference_top_down_pose_model( pose_model, image, person_results, bbox_thr=bbox_thr, format='xyxy', dataset=dataset, dataset_info=None, return_heatmap=return_heatmap, outputs=output_layer_names ) im_keypose_out = imshow_keypoints( image, pose_results, skeleton=skeleton, pose_kpt_color=pose_kpt_color, pose_link_color=pose_link_color, radius=2, thickness=2 ) im_keypose_out = im_keypose_out.astype(np.uint8) # image_hed = rearrange(image_hed, 'h w c -> 1 c h w') # edge = netNetwork(image_hed)[0] # edge = (edge.cpu().numpy() * 255.0).clip(0, 255).astype(np.uint8) return im_keypose_out def unload_hed_model(): global netNetwork if netNetwork is not None: netNetwork.cpu() ================================================ FILE: annotator/keypose/faster_rcnn_r50_fpn_coco.py ================================================ checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) # yapf:enable dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, step=[8, 11]) total_epochs = 12 model = dict( type='FasterRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=2000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)), test_cfg=dict( rpn=dict( nms_pre=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100) # soft-nms is also supported for rcnn testing # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) )) dataset_type = 'CocoDataset' data_root = 'data/coco' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=f'{data_root}/annotations/instances_train2017.json', img_prefix=f'{data_root}/train2017/', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=f'{data_root}/annotations/instances_val2017.json', img_prefix=f'{data_root}/val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=f'{data_root}/annotations/instances_val2017.json', img_prefix=f'{data_root}/val2017/', pipeline=test_pipeline)) evaluation = dict(interval=1, metric='bbox') ================================================ FILE: annotator/keypose/hrnet_w48_coco_256x192.py ================================================ # _base_ = [ # '../../../../_base_/default_runtime.py', # '../../../../_base_/datasets/coco.py' # ] evaluation = dict(interval=10, metric='mAP', save_best='AP') optimizer = dict( type='Adam', lr=5e-4, ) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, step=[170, 200]) total_epochs = 210 channel_cfg = dict( num_output_channels=17, dataset_joints=17, dataset_channel=[ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], ], inference_channel=[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]) # model settings model = dict( type='TopDown', pretrained='https://download.openmmlab.com/mmpose/' 'pretrain_models/hrnet_w48-8ef0771d.pth', backbone=dict( type='HRNet', in_channels=3, extra=dict( stage1=dict( num_modules=1, num_branches=1, block='BOTTLENECK', num_blocks=(4, ), num_channels=(64, )), stage2=dict( num_modules=1, num_branches=2, block='BASIC', num_blocks=(4, 4), num_channels=(48, 96)), stage3=dict( num_modules=4, num_branches=3, block='BASIC', num_blocks=(4, 4, 4), num_channels=(48, 96, 192)), stage4=dict( num_modules=3, num_branches=4, block='BASIC', num_blocks=(4, 4, 4, 4), num_channels=(48, 96, 192, 384))), ), keypoint_head=dict( type='TopdownHeatmapSimpleHead', in_channels=48, out_channels=channel_cfg['num_output_channels'], num_deconv_layers=0, extra=dict(final_conv_kernel=1, ), loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), train_cfg=dict(), test_cfg=dict( flip_test=True, post_process='default', shift_heatmap=True, modulate_kernel=11)) data_cfg = dict( image_size=[192, 256], heatmap_size=[48, 64], num_output_channels=channel_cfg['num_output_channels'], num_joints=channel_cfg['dataset_joints'], dataset_channel=channel_cfg['dataset_channel'], inference_channel=channel_cfg['inference_channel'], soft_nms=False, nms_thr=1.0, oks_thr=0.9, vis_thr=0.2, use_gt_bbox=False, det_bbox_thr=0.0, bbox_file='data/coco/person_detection_results/' 'COCO_val2017_detections_AP_H_56_person.json', ) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='TopDownGetBboxCenterScale', padding=1.25), dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3), dict(type='TopDownRandomFlip', flip_prob=0.5), dict( type='TopDownHalfBodyTransform', num_joints_half_body=8, prob_half_body=0.3), dict( type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), dict(type='TopDownAffine'), dict(type='ToTensor'), dict( type='NormalizeTensor', mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), dict(type='TopDownGenerateTarget', sigma=2), dict( type='Collect', keys=['img', 'target', 'target_weight'], meta_keys=[ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', 'flip_pairs' ]), ] val_pipeline = [ dict(type='LoadImageFromFile'), dict(type='TopDownGetBboxCenterScale', padding=1.25), dict(type='TopDownAffine'), dict(type='ToTensor'), dict( type='NormalizeTensor', mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), dict( type='Collect', keys=['img'], meta_keys=[ 'image_file', 'center', 'scale', 'rotation', 'bbox_score', 'flip_pairs' ]), ] test_pipeline = val_pipeline data_root = 'data/coco' data = dict( samples_per_gpu=32, workers_per_gpu=2, val_dataloader=dict(samples_per_gpu=32), test_dataloader=dict(samples_per_gpu=32), train=dict( type='TopDownCocoDataset', ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', img_prefix=f'{data_root}/train2017/', data_cfg=data_cfg, pipeline=train_pipeline, dataset_info={{_base_.dataset_info}}), val=dict( type='TopDownCocoDataset', ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', img_prefix=f'{data_root}/val2017/', data_cfg=data_cfg, pipeline=val_pipeline, dataset_info={{_base_.dataset_info}}), test=dict( type='TopDownCocoDataset', ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', img_prefix=f'{data_root}/val2017/', data_cfg=data_cfg, pipeline=test_pipeline, dataset_info={{_base_.dataset_info}}), ) ================================================ FILE: annotator/lama/__init__.py ================================================ # https://github.com/advimman/lama import yaml import torch from omegaconf import OmegaConf import numpy as np from einops import rearrange import os from modules import devices from annotator.annotator_path import models_path from annotator.lama.saicinpainting.training.trainers import load_checkpoint class LamaInpainting: model_dir = os.path.join(models_path, "lama") def __init__(self): self.model = None self.device = devices.get_device_for("controlnet") def load_model(self): remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetLama.pth" modelpath = os.path.join(self.model_dir, "ControlNetLama.pth") if not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.yaml') cfg = yaml.safe_load(open(config_path, 'rt')) cfg = OmegaConf.create(cfg) cfg.training_model.predict_only = True cfg.visualizer.kind = 'noop' self.model = load_checkpoint(cfg, os.path.abspath(modelpath), strict=False, map_location='cpu') self.model = self.model.to(self.device) self.model.eval() def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, input_image): if self.model is None: self.load_model() self.model.to(self.device) color = np.ascontiguousarray(input_image[:, :, 0:3]).astype(np.float32) / 255.0 mask = np.ascontiguousarray(input_image[:, :, 3:4]).astype(np.float32) / 255.0 with torch.no_grad(): color = torch.from_numpy(color).float().to(self.device) mask = torch.from_numpy(mask).float().to(self.device) mask = (mask > 0.5).float() color = color * (1 - mask) image_feed = torch.cat([color, mask], dim=2) image_feed = rearrange(image_feed, 'h w c -> 1 c h w') result = self.model(image_feed)[0] result = rearrange(result, 'c h w -> h w c') result = result * mask + color * (1 - mask) result *= 255.0 return result.detach().cpu().numpy().clip(0, 255).astype(np.uint8) ================================================ FILE: annotator/lama/config.yaml ================================================ run_title: b18_ffc075_batch8x15 training_model: kind: default visualize_each_iters: 1000 concat_mask: true store_discr_outputs_for_vis: true losses: l1: weight_missing: 0 weight_known: 10 perceptual: weight: 0 adversarial: kind: r1 weight: 10 gp_coef: 0.001 mask_as_fake_target: true allow_scale_mask: true feature_matching: weight: 100 resnet_pl: weight: 30 weights_path: ${env:TORCH_HOME} optimizers: generator: kind: adam lr: 0.001 discriminator: kind: adam lr: 0.0001 visualizer: key_order: - image - predicted_image - discr_output_fake - discr_output_real - inpainted rescale_keys: - discr_output_fake - discr_output_real kind: directory outdir: /group-volume/User-Driven-Content-Generation/r.suvorov/inpainting/experiments/r.suvorov_2021-04-30_14-41-12_train_simple_pix2pix2_gap_sdpl_novgg_large_b18_ffc075_batch8x15/samples location: data_root_dir: /group-volume/User-Driven-Content-Generation/datasets/inpainting_data_root_large out_root_dir: /group-volume/User-Driven-Content-Generation/${env:USER}/inpainting/experiments tb_dir: /group-volume/User-Driven-Content-Generation/${env:USER}/inpainting/tb_logs data: batch_size: 15 val_batch_size: 2 num_workers: 3 train: indir: ${location.data_root_dir}/train out_size: 256 mask_gen_kwargs: irregular_proba: 1 irregular_kwargs: max_angle: 4 max_len: 200 max_width: 100 max_times: 5 min_times: 1 box_proba: 1 box_kwargs: margin: 10 bbox_min_size: 30 bbox_max_size: 150 max_times: 3 min_times: 1 segm_proba: 0 segm_kwargs: confidence_threshold: 0.5 max_object_area: 0.5 min_mask_area: 0.07 downsample_levels: 6 num_variants_per_mask: 1 rigidness_mode: 1 max_foreground_coverage: 0.3 max_foreground_intersection: 0.7 max_mask_intersection: 0.1 max_hidden_area: 0.1 max_scale_change: 0.25 horizontal_flip: true max_vertical_shift: 0.2 position_shuffle: true transform_variant: distortions dataloader_kwargs: batch_size: ${data.batch_size} shuffle: true num_workers: ${data.num_workers} val: indir: ${location.data_root_dir}/val img_suffix: .png dataloader_kwargs: batch_size: ${data.val_batch_size} shuffle: false num_workers: ${data.num_workers} visual_test: indir: ${location.data_root_dir}/korean_test img_suffix: _input.png pad_out_to_modulo: 32 dataloader_kwargs: batch_size: 1 shuffle: false num_workers: ${data.num_workers} generator: kind: ffc_resnet input_nc: 4 output_nc: 3 ngf: 64 n_downsampling: 3 n_blocks: 18 add_out_act: sigmoid init_conv_kwargs: ratio_gin: 0 ratio_gout: 0 enable_lfu: false downsample_conv_kwargs: ratio_gin: ${generator.init_conv_kwargs.ratio_gout} ratio_gout: ${generator.downsample_conv_kwargs.ratio_gin} enable_lfu: false resnet_conv_kwargs: ratio_gin: 0.75 ratio_gout: ${generator.resnet_conv_kwargs.ratio_gin} enable_lfu: false discriminator: kind: pix2pixhd_nlayer input_nc: 3 ndf: 64 n_layers: 4 evaluator: kind: default inpainted_key: inpainted integral_kind: ssim_fid100_f1 trainer: kwargs: gpus: -1 accelerator: ddp max_epochs: 200 gradient_clip_val: 1 log_gpu_memory: None limit_train_batches: 25000 val_check_interval: ${trainer.kwargs.limit_train_batches} log_every_n_steps: 1000 precision: 32 terminate_on_nan: false check_val_every_n_epoch: 1 num_sanity_val_steps: 8 limit_val_batches: 1000 replace_sampler_ddp: false checkpoint_kwargs: verbose: true save_top_k: 5 save_last: true period: 1 monitor: val_ssim_fid100_f1_total_mean mode: max ================================================ FILE: annotator/lama/saicinpainting/__init__.py ================================================ ================================================ FILE: annotator/lama/saicinpainting/training/__init__.py ================================================ ================================================ FILE: annotator/lama/saicinpainting/training/data/__init__.py ================================================ ================================================ FILE: annotator/lama/saicinpainting/training/data/masks.py ================================================ import math import random import hashlib import logging from enum import Enum import cv2 import numpy as np # from annotator.lama.saicinpainting.evaluation.masks.mask import SegmentationMask from annotator.lama.saicinpainting.utils import LinearRamp LOGGER = logging.getLogger(__name__) class DrawMethod(Enum): LINE = 'line' CIRCLE = 'circle' SQUARE = 'square' def make_random_irregular_mask(shape, max_angle=4, max_len=60, max_width=20, min_times=0, max_times=10, draw_method=DrawMethod.LINE): draw_method = DrawMethod(draw_method) height, width = shape mask = np.zeros((height, width), np.float32) times = np.random.randint(min_times, max_times + 1) for i in range(times): start_x = np.random.randint(width) start_y = np.random.randint(height) for j in range(1 + np.random.randint(5)): angle = 0.01 + np.random.randint(max_angle) if i % 2 == 0: angle = 2 * 3.1415926 - angle length = 10 + np.random.randint(max_len) brush_w = 5 + np.random.randint(max_width) end_x = np.clip((start_x + length * np.sin(angle)).astype(np.int32), 0, width) end_y = np.clip((start_y + length * np.cos(angle)).astype(np.int32), 0, height) if draw_method == DrawMethod.LINE: cv2.line(mask, (start_x, start_y), (end_x, end_y), 1.0, brush_w) elif draw_method == DrawMethod.CIRCLE: cv2.circle(mask, (start_x, start_y), radius=brush_w, color=1., thickness=-1) elif draw_method == DrawMethod.SQUARE: radius = brush_w // 2 mask[start_y - radius:start_y + radius, start_x - radius:start_x + radius] = 1 start_x, start_y = end_x, end_y return mask[None, ...] class RandomIrregularMaskGenerator: def __init__(self, max_angle=4, max_len=60, max_width=20, min_times=0, max_times=10, ramp_kwargs=None, draw_method=DrawMethod.LINE): self.max_angle = max_angle self.max_len = max_len self.max_width = max_width self.min_times = min_times self.max_times = max_times self.draw_method = draw_method self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None def __call__(self, img, iter_i=None, raw_image=None): coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1 cur_max_len = int(max(1, self.max_len * coef)) cur_max_width = int(max(1, self.max_width * coef)) cur_max_times = int(self.min_times + 1 + (self.max_times - self.min_times) * coef) return make_random_irregular_mask(img.shape[1:], max_angle=self.max_angle, max_len=cur_max_len, max_width=cur_max_width, min_times=self.min_times, max_times=cur_max_times, draw_method=self.draw_method) def make_random_rectangle_mask(shape, margin=10, bbox_min_size=30, bbox_max_size=100, min_times=0, max_times=3): height, width = shape mask = np.zeros((height, width), np.float32) bbox_max_size = min(bbox_max_size, height - margin * 2, width - margin * 2) times = np.random.randint(min_times, max_times + 1) for i in range(times): box_width = np.random.randint(bbox_min_size, bbox_max_size) box_height = np.random.randint(bbox_min_size, bbox_max_size) start_x = np.random.randint(margin, width - margin - box_width + 1) start_y = np.random.randint(margin, height - margin - box_height + 1) mask[start_y:start_y + box_height, start_x:start_x + box_width] = 1 return mask[None, ...] class RandomRectangleMaskGenerator: def __init__(self, margin=10, bbox_min_size=30, bbox_max_size=100, min_times=0, max_times=3, ramp_kwargs=None): self.margin = margin self.bbox_min_size = bbox_min_size self.bbox_max_size = bbox_max_size self.min_times = min_times self.max_times = max_times self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None def __call__(self, img, iter_i=None, raw_image=None): coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1 cur_bbox_max_size = int(self.bbox_min_size + 1 + (self.bbox_max_size - self.bbox_min_size) * coef) cur_max_times = int(self.min_times + (self.max_times - self.min_times) * coef) return make_random_rectangle_mask(img.shape[1:], margin=self.margin, bbox_min_size=self.bbox_min_size, bbox_max_size=cur_bbox_max_size, min_times=self.min_times, max_times=cur_max_times) class RandomSegmentationMaskGenerator: def __init__(self, **kwargs): self.impl = None # will be instantiated in first call (effectively in subprocess) self.kwargs = kwargs def __call__(self, img, iter_i=None, raw_image=None): if self.impl is None: self.impl = SegmentationMask(**self.kwargs) masks = self.impl.get_masks(np.transpose(img, (1, 2, 0))) masks = [m for m in masks if len(np.unique(m)) > 1] return np.random.choice(masks) def make_random_superres_mask(shape, min_step=2, max_step=4, min_width=1, max_width=3): height, width = shape mask = np.zeros((height, width), np.float32) step_x = np.random.randint(min_step, max_step + 1) width_x = np.random.randint(min_width, min(step_x, max_width + 1)) offset_x = np.random.randint(0, step_x) step_y = np.random.randint(min_step, max_step + 1) width_y = np.random.randint(min_width, min(step_y, max_width + 1)) offset_y = np.random.randint(0, step_y) for dy in range(width_y): mask[offset_y + dy::step_y] = 1 for dx in range(width_x): mask[:, offset_x + dx::step_x] = 1 return mask[None, ...] class RandomSuperresMaskGenerator: def __init__(self, **kwargs): self.kwargs = kwargs def __call__(self, img, iter_i=None): return make_random_superres_mask(img.shape[1:], **self.kwargs) class DumbAreaMaskGenerator: min_ratio = 0.1 max_ratio = 0.35 default_ratio = 0.225 def __init__(self, is_training): #Parameters: # is_training(bool): If true - random rectangular mask, if false - central square mask self.is_training = is_training def _random_vector(self, dimension): if self.is_training: lower_limit = math.sqrt(self.min_ratio) upper_limit = math.sqrt(self.max_ratio) mask_side = round((random.random() * (upper_limit - lower_limit) + lower_limit) * dimension) u = random.randint(0, dimension-mask_side-1) v = u+mask_side else: margin = (math.sqrt(self.default_ratio) / 2) * dimension u = round(dimension/2 - margin) v = round(dimension/2 + margin) return u, v def __call__(self, img, iter_i=None, raw_image=None): c, height, width = img.shape mask = np.zeros((height, width), np.float32) x1, x2 = self._random_vector(width) y1, y2 = self._random_vector(height) mask[x1:x2, y1:y2] = 1 return mask[None, ...] class OutpaintingMaskGenerator: def __init__(self, min_padding_percent:float=0.04, max_padding_percent:int=0.25, left_padding_prob:float=0.5, top_padding_prob:float=0.5, right_padding_prob:float=0.5, bottom_padding_prob:float=0.5, is_fixed_randomness:bool=False): """ is_fixed_randomness - get identical paddings for the same image if args are the same """ self.min_padding_percent = min_padding_percent self.max_padding_percent = max_padding_percent self.probs = [left_padding_prob, top_padding_prob, right_padding_prob, bottom_padding_prob] self.is_fixed_randomness = is_fixed_randomness assert self.min_padding_percent <= self.max_padding_percent assert self.max_padding_percent > 0 assert len([x for x in [self.min_padding_percent, self.max_padding_percent] if (x>=0 and x<=1)]) == 2, f"Padding percentage should be in [0,1]" assert sum(self.probs) > 0, f"At least one of the padding probs should be greater than 0 - {self.probs}" assert len([x for x in self.probs if (x >= 0) and (x <= 1)]) == 4, f"At least one of padding probs is not in [0,1] - {self.probs}" if len([x for x in self.probs if x > 0]) == 1: LOGGER.warning(f"Only one padding prob is greater than zero - {self.probs}. That means that the outpainting masks will be always on the same side") def apply_padding(self, mask, coord): mask[int(coord[0][0]*self.img_h):int(coord[1][0]*self.img_h), int(coord[0][1]*self.img_w):int(coord[1][1]*self.img_w)] = 1 return mask def get_padding(self, size): n1 = int(self.min_padding_percent*size) n2 = int(self.max_padding_percent*size) return self.rnd.randint(n1, n2) / size @staticmethod def _img2rs(img): arr = np.ascontiguousarray(img.astype(np.uint8)) str_hash = hashlib.sha1(arr).hexdigest() res = hash(str_hash)%(2**32) return res def __call__(self, img, iter_i=None, raw_image=None): c, self.img_h, self.img_w = img.shape mask = np.zeros((self.img_h, self.img_w), np.float32) at_least_one_mask_applied = False if self.is_fixed_randomness: assert raw_image is not None, f"Cant calculate hash on raw_image=None" rs = self._img2rs(raw_image) self.rnd = np.random.RandomState(rs) else: self.rnd = np.random coords = [[ (0,0), (1,self.get_padding(size=self.img_h)) ], [ (0,0), (self.get_padding(size=self.img_w),1) ], [ (0,1-self.get_padding(size=self.img_h)), (1,1) ], [ (1-self.get_padding(size=self.img_w),0), (1,1) ]] for pp, coord in zip(self.probs, coords): if self.rnd.random() < pp: at_least_one_mask_applied = True mask = self.apply_padding(mask=mask, coord=coord) if not at_least_one_mask_applied: idx = self.rnd.choice(range(len(coords)), p=np.array(self.probs)/sum(self.probs)) mask = self.apply_padding(mask=mask, coord=coords[idx]) return mask[None, ...] class MixedMaskGenerator: def __init__(self, irregular_proba=1/3, irregular_kwargs=None, box_proba=1/3, box_kwargs=None, segm_proba=1/3, segm_kwargs=None, squares_proba=0, squares_kwargs=None, superres_proba=0, superres_kwargs=None, outpainting_proba=0, outpainting_kwargs=None, invert_proba=0): self.probas = [] self.gens = [] if irregular_proba > 0: self.probas.append(irregular_proba) if irregular_kwargs is None: irregular_kwargs = {} else: irregular_kwargs = dict(irregular_kwargs) irregular_kwargs['draw_method'] = DrawMethod.LINE self.gens.append(RandomIrregularMaskGenerator(**irregular_kwargs)) if box_proba > 0: self.probas.append(box_proba) if box_kwargs is None: box_kwargs = {} self.gens.append(RandomRectangleMaskGenerator(**box_kwargs)) if segm_proba > 0: self.probas.append(segm_proba) if segm_kwargs is None: segm_kwargs = {} self.gens.append(RandomSegmentationMaskGenerator(**segm_kwargs)) if squares_proba > 0: self.probas.append(squares_proba) if squares_kwargs is None: squares_kwargs = {} else: squares_kwargs = dict(squares_kwargs) squares_kwargs['draw_method'] = DrawMethod.SQUARE self.gens.append(RandomIrregularMaskGenerator(**squares_kwargs)) if superres_proba > 0: self.probas.append(superres_proba) if superres_kwargs is None: superres_kwargs = {} self.gens.append(RandomSuperresMaskGenerator(**superres_kwargs)) if outpainting_proba > 0: self.probas.append(outpainting_proba) if outpainting_kwargs is None: outpainting_kwargs = {} self.gens.append(OutpaintingMaskGenerator(**outpainting_kwargs)) self.probas = np.array(self.probas, dtype='float32') self.probas /= self.probas.sum() self.invert_proba = invert_proba def __call__(self, img, iter_i=None, raw_image=None): kind = np.random.choice(len(self.probas), p=self.probas) gen = self.gens[kind] result = gen(img, iter_i=iter_i, raw_image=raw_image) if self.invert_proba > 0 and random.random() < self.invert_proba: result = 1 - result return result def get_mask_generator(kind, kwargs): if kind is None: kind = "mixed" if kwargs is None: kwargs = {} if kind == "mixed": cl = MixedMaskGenerator elif kind == "outpainting": cl = OutpaintingMaskGenerator elif kind == "dumb": cl = DumbAreaMaskGenerator else: raise NotImplementedError(f"No such generator kind = {kind}") return cl(**kwargs) ================================================ FILE: annotator/lama/saicinpainting/training/losses/__init__.py ================================================ ================================================ FILE: annotator/lama/saicinpainting/training/losses/adversarial.py ================================================ from typing import Tuple, Dict, Optional import torch import torch.nn as nn import torch.nn.functional as F class BaseAdversarialLoss: def pre_generator_step(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, generator: nn.Module, discriminator: nn.Module): """ Prepare for generator step :param real_batch: Tensor, a batch of real samples :param fake_batch: Tensor, a batch of samples produced by generator :param generator: :param discriminator: :return: None """ def pre_discriminator_step(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, generator: nn.Module, discriminator: nn.Module): """ Prepare for discriminator step :param real_batch: Tensor, a batch of real samples :param fake_batch: Tensor, a batch of samples produced by generator :param generator: :param discriminator: :return: None """ def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, mask: Optional[torch.Tensor] = None) \ -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: """ Calculate generator loss :param real_batch: Tensor, a batch of real samples :param fake_batch: Tensor, a batch of samples produced by generator :param discr_real_pred: Tensor, discriminator output for real_batch :param discr_fake_pred: Tensor, discriminator output for fake_batch :param mask: Tensor, actual mask, which was at input of generator when making fake_batch :return: total generator loss along with some values that might be interesting to log """ raise NotImplemented() def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, mask: Optional[torch.Tensor] = None) \ -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: """ Calculate discriminator loss and call .backward() on it :param real_batch: Tensor, a batch of real samples :param fake_batch: Tensor, a batch of samples produced by generator :param discr_real_pred: Tensor, discriminator output for real_batch :param discr_fake_pred: Tensor, discriminator output for fake_batch :param mask: Tensor, actual mask, which was at input of generator when making fake_batch :return: total discriminator loss along with some values that might be interesting to log """ raise NotImplemented() def interpolate_mask(self, mask, shape): assert mask is not None assert self.allow_scale_mask or shape == mask.shape[-2:] if shape != mask.shape[-2:] and self.allow_scale_mask: if self.mask_scale_mode == 'maxpool': mask = F.adaptive_max_pool2d(mask, shape) else: mask = F.interpolate(mask, size=shape, mode=self.mask_scale_mode) return mask def make_r1_gp(discr_real_pred, real_batch): if torch.is_grad_enabled(): grad_real = torch.autograd.grad(outputs=discr_real_pred.sum(), inputs=real_batch, create_graph=True)[0] grad_penalty = (grad_real.view(grad_real.shape[0], -1).norm(2, dim=1) ** 2).mean() else: grad_penalty = 0 real_batch.requires_grad = False return grad_penalty class NonSaturatingWithR1(BaseAdversarialLoss): def __init__(self, gp_coef=5, weight=1, mask_as_fake_target=False, allow_scale_mask=False, mask_scale_mode='nearest', extra_mask_weight_for_gen=0, use_unmasked_for_gen=True, use_unmasked_for_discr=True): self.gp_coef = gp_coef self.weight = weight # use for discr => use for gen; # otherwise we teach only the discr to pay attention to very small difference assert use_unmasked_for_gen or (not use_unmasked_for_discr) # mask as target => use unmasked for discr: # if we don't care about unmasked regions at all # then it doesn't matter if the value of mask_as_fake_target is true or false assert use_unmasked_for_discr or (not mask_as_fake_target) self.use_unmasked_for_gen = use_unmasked_for_gen self.use_unmasked_for_discr = use_unmasked_for_discr self.mask_as_fake_target = mask_as_fake_target self.allow_scale_mask = allow_scale_mask self.mask_scale_mode = mask_scale_mode self.extra_mask_weight_for_gen = extra_mask_weight_for_gen def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, mask=None) \ -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: fake_loss = F.softplus(-discr_fake_pred) if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \ not self.use_unmasked_for_gen: # == if masked region should be treated differently mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) if not self.use_unmasked_for_gen: fake_loss = fake_loss * mask else: pixel_weights = 1 + mask * self.extra_mask_weight_for_gen fake_loss = fake_loss * pixel_weights return fake_loss.mean() * self.weight, dict() def pre_discriminator_step(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, generator: nn.Module, discriminator: nn.Module): real_batch.requires_grad = True def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, mask=None) \ -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: real_loss = F.softplus(-discr_real_pred) grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef fake_loss = F.softplus(discr_fake_pred) if not self.use_unmasked_for_discr or self.mask_as_fake_target: # == if masked region should be treated differently mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) # use_unmasked_for_discr=False only makes sense for fakes; # for reals there is no difference beetween two regions fake_loss = fake_loss * mask if self.mask_as_fake_target: fake_loss = fake_loss + (1 - mask) * F.softplus(-discr_fake_pred) sum_discr_loss = real_loss + grad_penalty + fake_loss metrics = dict(discr_real_out=discr_real_pred.mean(), discr_fake_out=discr_fake_pred.mean(), discr_real_gp=grad_penalty) return sum_discr_loss.mean(), metrics class BCELoss(BaseAdversarialLoss): def __init__(self, weight): self.weight = weight self.bce_loss = nn.BCEWithLogitsLoss() def generator_loss(self, discr_fake_pred: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: real_mask_gt = torch.zeros(discr_fake_pred.shape).to(discr_fake_pred.device) fake_loss = self.bce_loss(discr_fake_pred, real_mask_gt) * self.weight return fake_loss, dict() def pre_discriminator_step(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, generator: nn.Module, discriminator: nn.Module): real_batch.requires_grad = True def discriminator_loss(self, mask: torch.Tensor, discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: real_mask_gt = torch.zeros(discr_real_pred.shape).to(discr_real_pred.device) sum_discr_loss = (self.bce_loss(discr_real_pred, real_mask_gt) + self.bce_loss(discr_fake_pred, mask)) / 2 metrics = dict(discr_real_out=discr_real_pred.mean(), discr_fake_out=discr_fake_pred.mean(), discr_real_gp=0) return sum_discr_loss, metrics def make_discrim_loss(kind, **kwargs): if kind == 'r1': return NonSaturatingWithR1(**kwargs) elif kind == 'bce': return BCELoss(**kwargs) raise ValueError(f'Unknown adversarial loss kind {kind}') ================================================ FILE: annotator/lama/saicinpainting/training/losses/constants.py ================================================ weights = {"ade20k": [6.34517766497462, 9.328358208955224, 11.389521640091116, 16.10305958132045, 20.833333333333332, 22.22222222222222, 25.125628140703515, 43.29004329004329, 50.5050505050505, 54.6448087431694, 55.24861878453038, 60.24096385542168, 62.5, 66.2251655629139, 84.74576271186442, 90.90909090909092, 91.74311926605505, 96.15384615384616, 96.15384615384616, 97.08737864077669, 102.04081632653062, 135.13513513513513, 149.2537313432836, 153.84615384615384, 163.93442622950818, 166.66666666666666, 188.67924528301887, 192.30769230769232, 217.3913043478261, 227.27272727272725, 227.27272727272725, 227.27272727272725, 303.03030303030306, 322.5806451612903, 333.3333333333333, 370.3703703703703, 384.61538461538464, 416.6666666666667, 416.6666666666667, 434.7826086956522, 434.7826086956522, 454.5454545454545, 454.5454545454545, 500.0, 526.3157894736842, 526.3157894736842, 555.5555555555555, 555.5555555555555, 555.5555555555555, 555.5555555555555, 555.5555555555555, 555.5555555555555, 555.5555555555555, 588.2352941176471, 588.2352941176471, 588.2352941176471, 588.2352941176471, 588.2352941176471, 666.6666666666666, 666.6666666666666, 666.6666666666666, 666.6666666666666, 714.2857142857143, 714.2857142857143, 714.2857142857143, 714.2857142857143, 714.2857142857143, 769.2307692307693, 769.2307692307693, 769.2307692307693, 833.3333333333334, 833.3333333333334, 833.3333333333334, 833.3333333333334, 909.090909090909, 1000.0, 1111.111111111111, 1111.111111111111, 1111.111111111111, 1111.111111111111, 1111.111111111111, 1250.0, 1250.0, 1250.0, 1250.0, 1250.0, 1428.5714285714287, 1428.5714285714287, 1428.5714285714287, 1428.5714285714287, 1428.5714285714287, 1428.5714285714287, 1428.5714285714287, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 1666.6666666666667, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2000.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 2500.0, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 3333.3333333333335, 5000.0, 5000.0, 5000.0] } ================================================ FILE: annotator/lama/saicinpainting/training/losses/distance_weighting.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import torchvision from annotator.lama.saicinpainting.training.losses.perceptual import IMAGENET_STD, IMAGENET_MEAN def dummy_distance_weighter(real_img, pred_img, mask): return mask def get_gauss_kernel(kernel_size, width_factor=1): coords = torch.stack(torch.meshgrid(torch.arange(kernel_size), torch.arange(kernel_size)), dim=0).float() diff = torch.exp(-((coords - kernel_size // 2) ** 2).sum(0) / kernel_size / width_factor) diff /= diff.sum() return diff class BlurMask(nn.Module): def __init__(self, kernel_size=5, width_factor=1): super().__init__() self.filter = nn.Conv2d(1, 1, kernel_size, padding=kernel_size // 2, padding_mode='replicate', bias=False) self.filter.weight.data.copy_(get_gauss_kernel(kernel_size, width_factor=width_factor)) def forward(self, real_img, pred_img, mask): with torch.no_grad(): result = self.filter(mask) * mask return result class EmulatedEDTMask(nn.Module): def __init__(self, dilate_kernel_size=5, blur_kernel_size=5, width_factor=1): super().__init__() self.dilate_filter = nn.Conv2d(1, 1, dilate_kernel_size, padding=dilate_kernel_size// 2, padding_mode='replicate', bias=False) self.dilate_filter.weight.data.copy_(torch.ones(1, 1, dilate_kernel_size, dilate_kernel_size, dtype=torch.float)) self.blur_filter = nn.Conv2d(1, 1, blur_kernel_size, padding=blur_kernel_size // 2, padding_mode='replicate', bias=False) self.blur_filter.weight.data.copy_(get_gauss_kernel(blur_kernel_size, width_factor=width_factor)) def forward(self, real_img, pred_img, mask): with torch.no_grad(): known_mask = 1 - mask dilated_known_mask = (self.dilate_filter(known_mask) > 1).float() result = self.blur_filter(1 - dilated_known_mask) * mask return result class PropagatePerceptualSim(nn.Module): def __init__(self, level=2, max_iters=10, temperature=500, erode_mask_size=3): super().__init__() vgg = torchvision.models.vgg19(pretrained=True).features vgg_avg_pooling = [] for weights in vgg.parameters(): weights.requires_grad = False cur_level_i = 0 for module in vgg.modules(): if module.__class__.__name__ == 'Sequential': continue elif module.__class__.__name__ == 'MaxPool2d': vgg_avg_pooling.append(nn.AvgPool2d(kernel_size=2, stride=2, padding=0)) else: vgg_avg_pooling.append(module) if module.__class__.__name__ == 'ReLU': cur_level_i += 1 if cur_level_i == level: break self.features = nn.Sequential(*vgg_avg_pooling) self.max_iters = max_iters self.temperature = temperature self.do_erode = erode_mask_size > 0 if self.do_erode: self.erode_mask = nn.Conv2d(1, 1, erode_mask_size, padding=erode_mask_size // 2, bias=False) self.erode_mask.weight.data.fill_(1) def forward(self, real_img, pred_img, mask): with torch.no_grad(): real_img = (real_img - IMAGENET_MEAN.to(real_img)) / IMAGENET_STD.to(real_img) real_feats = self.features(real_img) vertical_sim = torch.exp(-(real_feats[:, :, 1:] - real_feats[:, :, :-1]).pow(2).sum(1, keepdim=True) / self.temperature) horizontal_sim = torch.exp(-(real_feats[:, :, :, 1:] - real_feats[:, :, :, :-1]).pow(2).sum(1, keepdim=True) / self.temperature) mask_scaled = F.interpolate(mask, size=real_feats.shape[-2:], mode='bilinear', align_corners=False) if self.do_erode: mask_scaled = (self.erode_mask(mask_scaled) > 1).float() cur_knowness = 1 - mask_scaled for iter_i in range(self.max_iters): new_top_knowness = F.pad(cur_knowness[:, :, :-1] * vertical_sim, (0, 0, 1, 0), mode='replicate') new_bottom_knowness = F.pad(cur_knowness[:, :, 1:] * vertical_sim, (0, 0, 0, 1), mode='replicate') new_left_knowness = F.pad(cur_knowness[:, :, :, :-1] * horizontal_sim, (1, 0, 0, 0), mode='replicate') new_right_knowness = F.pad(cur_knowness[:, :, :, 1:] * horizontal_sim, (0, 1, 0, 0), mode='replicate') new_knowness = torch.stack([new_top_knowness, new_bottom_knowness, new_left_knowness, new_right_knowness], dim=0).max(0).values cur_knowness = torch.max(cur_knowness, new_knowness) cur_knowness = F.interpolate(cur_knowness, size=mask.shape[-2:], mode='bilinear') result = torch.min(mask, 1 - cur_knowness) return result def make_mask_distance_weighter(kind='none', **kwargs): if kind == 'none': return dummy_distance_weighter if kind == 'blur': return BlurMask(**kwargs) if kind == 'edt': return EmulatedEDTMask(**kwargs) if kind == 'pps': return PropagatePerceptualSim(**kwargs) raise ValueError(f'Unknown mask distance weighter kind {kind}') ================================================ FILE: annotator/lama/saicinpainting/training/losses/feature_matching.py ================================================ from typing import List import torch import torch.nn.functional as F def masked_l2_loss(pred, target, mask, weight_known, weight_missing): per_pixel_l2 = F.mse_loss(pred, target, reduction='none') pixel_weights = mask * weight_missing + (1 - mask) * weight_known return (pixel_weights * per_pixel_l2).mean() def masked_l1_loss(pred, target, mask, weight_known, weight_missing): per_pixel_l1 = F.l1_loss(pred, target, reduction='none') pixel_weights = mask * weight_missing + (1 - mask) * weight_known return (pixel_weights * per_pixel_l1).mean() def feature_matching_loss(fake_features: List[torch.Tensor], target_features: List[torch.Tensor], mask=None): if mask is None: res = torch.stack([F.mse_loss(fake_feat, target_feat) for fake_feat, target_feat in zip(fake_features, target_features)]).mean() else: res = 0 norm = 0 for fake_feat, target_feat in zip(fake_features, target_features): cur_mask = F.interpolate(mask, size=fake_feat.shape[-2:], mode='bilinear', align_corners=False) error_weights = 1 - cur_mask cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean() res = res + cur_val norm += 1 res = res / norm return res ================================================ FILE: annotator/lama/saicinpainting/training/losses/perceptual.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import torchvision # from models.ade20k import ModelBuilder from annotator.lama.saicinpainting.utils import check_and_warn_input_range IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None] IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None] class PerceptualLoss(nn.Module): def __init__(self, normalize_inputs=True): super(PerceptualLoss, self).__init__() self.normalize_inputs = normalize_inputs self.mean_ = IMAGENET_MEAN self.std_ = IMAGENET_STD vgg = torchvision.models.vgg19(pretrained=True).features vgg_avg_pooling = [] for weights in vgg.parameters(): weights.requires_grad = False for module in vgg.modules(): if module.__class__.__name__ == 'Sequential': continue elif module.__class__.__name__ == 'MaxPool2d': vgg_avg_pooling.append(nn.AvgPool2d(kernel_size=2, stride=2, padding=0)) else: vgg_avg_pooling.append(module) self.vgg = nn.Sequential(*vgg_avg_pooling) def do_normalize_inputs(self, x): return (x - self.mean_.to(x.device)) / self.std_.to(x.device) def partial_losses(self, input, target, mask=None): check_and_warn_input_range(target, 0, 1, 'PerceptualLoss target in partial_losses') # we expect input and target to be in [0, 1] range losses = [] if self.normalize_inputs: features_input = self.do_normalize_inputs(input) features_target = self.do_normalize_inputs(target) else: features_input = input features_target = target for layer in self.vgg[:30]: features_input = layer(features_input) features_target = layer(features_target) if layer.__class__.__name__ == 'ReLU': loss = F.mse_loss(features_input, features_target, reduction='none') if mask is not None: cur_mask = F.interpolate(mask, size=features_input.shape[-2:], mode='bilinear', align_corners=False) loss = loss * (1 - cur_mask) loss = loss.mean(dim=tuple(range(1, len(loss.shape)))) losses.append(loss) return losses def forward(self, input, target, mask=None): losses = self.partial_losses(input, target, mask=mask) return torch.stack(losses).sum(dim=0) def get_global_features(self, input): check_and_warn_input_range(input, 0, 1, 'PerceptualLoss input in get_global_features') if self.normalize_inputs: features_input = self.do_normalize_inputs(input) else: features_input = input features_input = self.vgg(features_input) return features_input class ResNetPL(nn.Module): def __init__(self, weight=1, weights_path=None, arch_encoder='resnet50dilated', segmentation=True): super().__init__() self.impl = ModelBuilder.get_encoder(weights_path=weights_path, arch_encoder=arch_encoder, arch_decoder='ppm_deepsup', fc_dim=2048, segmentation=segmentation) self.impl.eval() for w in self.impl.parameters(): w.requires_grad_(False) self.weight = weight def forward(self, pred, target): pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred) target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target) pred_feats = self.impl(pred, return_feature_maps=True) target_feats = self.impl(target, return_feature_maps=True) result = torch.stack([F.mse_loss(cur_pred, cur_target) for cur_pred, cur_target in zip(pred_feats, target_feats)]).sum() * self.weight return result ================================================ FILE: annotator/lama/saicinpainting/training/losses/segmentation.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from .constants import weights as constant_weights from modules import devices class CrossEntropy2d(nn.Module): def __init__(self, reduction="mean", ignore_label=255, weights=None, *args, **kwargs): """ weight (Tensor, optional): a manual rescaling weight given to each class. If given, has to be a Tensor of size "nclasses" """ super(CrossEntropy2d, self).__init__() self.reduction = reduction self.ignore_label = ignore_label self.weights = weights if self.weights is not None: device = devices.get_device_for("controlnet") if torch.cuda.is_available() else torch.device('cpu') self.weights = torch.FloatTensor(constant_weights[weights]).to(device) def forward(self, predict, target): """ Args: predict:(n, c, h, w) target:(n, 1, h, w) """ target = target.long() assert not target.requires_grad assert predict.dim() == 4, "{0}".format(predict.size()) assert target.dim() == 4, "{0}".format(target.size()) assert predict.size(0) == target.size(0), "{0} vs {1} ".format(predict.size(0), target.size(0)) assert target.size(1) == 1, "{0}".format(target.size(1)) assert predict.size(2) == target.size(2), "{0} vs {1} ".format(predict.size(2), target.size(2)) assert predict.size(3) == target.size(3), "{0} vs {1} ".format(predict.size(3), target.size(3)) target = target.squeeze(1) n, c, h, w = predict.size() target_mask = (target >= 0) * (target != self.ignore_label) target = target[target_mask] predict = predict.transpose(1, 2).transpose(2, 3).contiguous() predict = predict[target_mask.view(n, h, w, 1).repeat(1, 1, 1, c)].view(-1, c) loss = F.cross_entropy(predict, target, weight=self.weights, reduction=self.reduction) return loss ================================================ FILE: annotator/lama/saicinpainting/training/losses/style_loss.py ================================================ import torch import torch.nn as nn import torchvision.models as models class PerceptualLoss(nn.Module): r""" Perceptual loss, VGG-based https://arxiv.org/abs/1603.08155 https://github.com/dxyang/StyleTransfer/blob/master/utils.py """ def __init__(self, weights=[1.0, 1.0, 1.0, 1.0, 1.0]): super(PerceptualLoss, self).__init__() self.add_module('vgg', VGG19()) self.criterion = torch.nn.L1Loss() self.weights = weights def __call__(self, x, y): # Compute features x_vgg, y_vgg = self.vgg(x), self.vgg(y) content_loss = 0.0 content_loss += self.weights[0] * self.criterion(x_vgg['relu1_1'], y_vgg['relu1_1']) content_loss += self.weights[1] * self.criterion(x_vgg['relu2_1'], y_vgg['relu2_1']) content_loss += self.weights[2] * self.criterion(x_vgg['relu3_1'], y_vgg['relu3_1']) content_loss += self.weights[3] * self.criterion(x_vgg['relu4_1'], y_vgg['relu4_1']) content_loss += self.weights[4] * self.criterion(x_vgg['relu5_1'], y_vgg['relu5_1']) return content_loss class VGG19(torch.nn.Module): def __init__(self): super(VGG19, self).__init__() features = models.vgg19(pretrained=True).features self.relu1_1 = torch.nn.Sequential() self.relu1_2 = torch.nn.Sequential() self.relu2_1 = torch.nn.Sequential() self.relu2_2 = torch.nn.Sequential() self.relu3_1 = torch.nn.Sequential() self.relu3_2 = torch.nn.Sequential() self.relu3_3 = torch.nn.Sequential() self.relu3_4 = torch.nn.Sequential() self.relu4_1 = torch.nn.Sequential() self.relu4_2 = torch.nn.Sequential() self.relu4_3 = torch.nn.Sequential() self.relu4_4 = torch.nn.Sequential() self.relu5_1 = torch.nn.Sequential() self.relu5_2 = torch.nn.Sequential() self.relu5_3 = torch.nn.Sequential() self.relu5_4 = torch.nn.Sequential() for x in range(2): self.relu1_1.add_module(str(x), features[x]) for x in range(2, 4): self.relu1_2.add_module(str(x), features[x]) for x in range(4, 7): self.relu2_1.add_module(str(x), features[x]) for x in range(7, 9): self.relu2_2.add_module(str(x), features[x]) for x in range(9, 12): self.relu3_1.add_module(str(x), features[x]) for x in range(12, 14): self.relu3_2.add_module(str(x), features[x]) for x in range(14, 16): self.relu3_2.add_module(str(x), features[x]) for x in range(16, 18): self.relu3_4.add_module(str(x), features[x]) for x in range(18, 21): self.relu4_1.add_module(str(x), features[x]) for x in range(21, 23): self.relu4_2.add_module(str(x), features[x]) for x in range(23, 25): self.relu4_3.add_module(str(x), features[x]) for x in range(25, 27): self.relu4_4.add_module(str(x), features[x]) for x in range(27, 30): self.relu5_1.add_module(str(x), features[x]) for x in range(30, 32): self.relu5_2.add_module(str(x), features[x]) for x in range(32, 34): self.relu5_3.add_module(str(x), features[x]) for x in range(34, 36): self.relu5_4.add_module(str(x), features[x]) # don't need the gradients, just want the features for param in self.parameters(): param.requires_grad = False def forward(self, x): relu1_1 = self.relu1_1(x) relu1_2 = self.relu1_2(relu1_1) relu2_1 = self.relu2_1(relu1_2) relu2_2 = self.relu2_2(relu2_1) relu3_1 = self.relu3_1(relu2_2) relu3_2 = self.relu3_2(relu3_1) relu3_3 = self.relu3_3(relu3_2) relu3_4 = self.relu3_4(relu3_3) relu4_1 = self.relu4_1(relu3_4) relu4_2 = self.relu4_2(relu4_1) relu4_3 = self.relu4_3(relu4_2) relu4_4 = self.relu4_4(relu4_3) relu5_1 = self.relu5_1(relu4_4) relu5_2 = self.relu5_2(relu5_1) relu5_3 = self.relu5_3(relu5_2) relu5_4 = self.relu5_4(relu5_3) out = { 'relu1_1': relu1_1, 'relu1_2': relu1_2, 'relu2_1': relu2_1, 'relu2_2': relu2_2, 'relu3_1': relu3_1, 'relu3_2': relu3_2, 'relu3_3': relu3_3, 'relu3_4': relu3_4, 'relu4_1': relu4_1, 'relu4_2': relu4_2, 'relu4_3': relu4_3, 'relu4_4': relu4_4, 'relu5_1': relu5_1, 'relu5_2': relu5_2, 'relu5_3': relu5_3, 'relu5_4': relu5_4, } return out ================================================ FILE: annotator/lama/saicinpainting/training/modules/__init__.py ================================================ import logging from annotator.lama.saicinpainting.training.modules.ffc import FFCResNetGenerator from annotator.lama.saicinpainting.training.modules.pix2pixhd import GlobalGenerator, MultiDilatedGlobalGenerator, \ NLayerDiscriminator, MultidilatedNLayerDiscriminator def make_generator(config, kind, **kwargs): logging.info(f'Make generator {kind}') if kind == 'pix2pixhd_multidilated': return MultiDilatedGlobalGenerator(**kwargs) if kind == 'pix2pixhd_global': return GlobalGenerator(**kwargs) if kind == 'ffc_resnet': return FFCResNetGenerator(**kwargs) raise ValueError(f'Unknown generator kind {kind}') def make_discriminator(kind, **kwargs): logging.info(f'Make discriminator {kind}') if kind == 'pix2pixhd_nlayer_multidilated': return MultidilatedNLayerDiscriminator(**kwargs) if kind == 'pix2pixhd_nlayer': return NLayerDiscriminator(**kwargs) raise ValueError(f'Unknown discriminator kind {kind}') ================================================ FILE: annotator/lama/saicinpainting/training/modules/base.py ================================================ import abc from typing import Tuple, List import torch import torch.nn as nn from annotator.lama.saicinpainting.training.modules.depthwise_sep_conv import DepthWiseSeperableConv from annotator.lama.saicinpainting.training.modules.multidilated_conv import MultidilatedConv class BaseDiscriminator(nn.Module): @abc.abstractmethod def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]: """ Predict scores and get intermediate activations. Useful for feature matching loss :return tuple (scores, list of intermediate activations) """ raise NotImplemented() def get_conv_block_ctor(kind='default'): if not isinstance(kind, str): return kind if kind == 'default': return nn.Conv2d if kind == 'depthwise': return DepthWiseSeperableConv if kind == 'multidilated': return MultidilatedConv raise ValueError(f'Unknown convolutional block kind {kind}') def get_norm_layer(kind='bn'): if not isinstance(kind, str): return kind if kind == 'bn': return nn.BatchNorm2d if kind == 'in': return nn.InstanceNorm2d raise ValueError(f'Unknown norm block kind {kind}') def get_activation(kind='tanh'): if kind == 'tanh': return nn.Tanh() if kind == 'sigmoid': return nn.Sigmoid() if kind is False: return nn.Identity() raise ValueError(f'Unknown activation kind {kind}') class SimpleMultiStepGenerator(nn.Module): def __init__(self, steps: List[nn.Module]): super().__init__() self.steps = nn.ModuleList(steps) def forward(self, x): cur_in = x outs = [] for step in self.steps: cur_out = step(cur_in) outs.append(cur_out) cur_in = torch.cat((cur_in, cur_out), dim=1) return torch.cat(outs[::-1], dim=1) def deconv_factory(kind, ngf, mult, norm_layer, activation, max_features): if kind == 'convtranspose': return [nn.ConvTranspose2d(min(max_features, ngf * mult), min(max_features, int(ngf * mult / 2)), kernel_size=3, stride=2, padding=1, output_padding=1), norm_layer(min(max_features, int(ngf * mult / 2))), activation] elif kind == 'bilinear': return [nn.Upsample(scale_factor=2, mode='bilinear'), DepthWiseSeperableConv(min(max_features, ngf * mult), min(max_features, int(ngf * mult / 2)), kernel_size=3, stride=1, padding=1), norm_layer(min(max_features, int(ngf * mult / 2))), activation] else: raise Exception(f"Invalid deconv kind: {kind}") ================================================ FILE: annotator/lama/saicinpainting/training/modules/depthwise_sep_conv.py ================================================ import torch import torch.nn as nn class DepthWiseSeperableConv(nn.Module): def __init__(self, in_dim, out_dim, *args, **kwargs): super().__init__() if 'groups' in kwargs: # ignoring groups for Depthwise Sep Conv del kwargs['groups'] self.depthwise = nn.Conv2d(in_dim, in_dim, *args, groups=in_dim, **kwargs) self.pointwise = nn.Conv2d(in_dim, out_dim, kernel_size=1) def forward(self, x): out = self.depthwise(x) out = self.pointwise(out) return out ================================================ FILE: annotator/lama/saicinpainting/training/modules/fake_fakes.py ================================================ import torch from kornia import SamplePadding from kornia.augmentation import RandomAffine, CenterCrop class FakeFakesGenerator: def __init__(self, aug_proba=0.5, img_aug_degree=30, img_aug_translate=0.2): self.grad_aug = RandomAffine(degrees=360, translate=0.2, padding_mode=SamplePadding.REFLECTION, keepdim=False, p=1) self.img_aug = RandomAffine(degrees=img_aug_degree, translate=img_aug_translate, padding_mode=SamplePadding.REFLECTION, keepdim=True, p=1) self.aug_proba = aug_proba def __call__(self, input_images, masks): blend_masks = self._fill_masks_with_gradient(masks) blend_target = self._make_blend_target(input_images) result = input_images * (1 - blend_masks) + blend_target * blend_masks return result, blend_masks def _make_blend_target(self, input_images): batch_size = input_images.shape[0] permuted = input_images[torch.randperm(batch_size)] augmented = self.img_aug(input_images) is_aug = (torch.rand(batch_size, device=input_images.device)[:, None, None, None] < self.aug_proba).float() result = augmented * is_aug + permuted * (1 - is_aug) return result def _fill_masks_with_gradient(self, masks): batch_size, _, height, width = masks.shape grad = torch.linspace(0, 1, steps=width * 2, device=masks.device, dtype=masks.dtype) \ .view(1, 1, 1, -1).expand(batch_size, 1, height * 2, width * 2) grad = self.grad_aug(grad) grad = CenterCrop((height, width))(grad) grad *= masks grad_for_min = grad + (1 - masks) * 10 grad -= grad_for_min.view(batch_size, -1).min(-1).values[:, None, None, None] grad /= grad.view(batch_size, -1).max(-1).values[:, None, None, None] + 1e-6 grad.clamp_(min=0, max=1) return grad ================================================ FILE: annotator/lama/saicinpainting/training/modules/ffc.py ================================================ # Fast Fourier Convolution NeurIPS 2020 # original implementation https://github.com/pkumivision/FFC/blob/main/model_zoo/ffc.py # paper https://proceedings.neurips.cc/paper/2020/file/2fd5d41ec6cfab47e32164d5624269b1-Paper.pdf import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from annotator.lama.saicinpainting.training.modules.base import get_activation, BaseDiscriminator from annotator.lama.saicinpainting.training.modules.spatial_transform import LearnableSpatialTransformWrapper from annotator.lama.saicinpainting.training.modules.squeeze_excitation import SELayer from annotator.lama.saicinpainting.utils import get_shape class FFCSE_block(nn.Module): def __init__(self, channels, ratio_g): super(FFCSE_block, self).__init__() in_cg = int(channels * ratio_g) in_cl = channels - in_cg r = 16 self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.conv1 = nn.Conv2d(channels, channels // r, kernel_size=1, bias=True) self.relu1 = nn.ReLU(inplace=True) self.conv_a2l = None if in_cl == 0 else nn.Conv2d( channels // r, in_cl, kernel_size=1, bias=True) self.conv_a2g = None if in_cg == 0 else nn.Conv2d( channels // r, in_cg, kernel_size=1, bias=True) self.sigmoid = nn.Sigmoid() def forward(self, x): x = x if type(x) is tuple else (x, 0) id_l, id_g = x x = id_l if type(id_g) is int else torch.cat([id_l, id_g], dim=1) x = self.avgpool(x) x = self.relu1(self.conv1(x)) x_l = 0 if self.conv_a2l is None else id_l * \ self.sigmoid(self.conv_a2l(x)) x_g = 0 if self.conv_a2g is None else id_g * \ self.sigmoid(self.conv_a2g(x)) return x_l, x_g class FourierUnit(nn.Module): def __init__(self, in_channels, out_channels, groups=1, spatial_scale_factor=None, spatial_scale_mode='bilinear', spectral_pos_encoding=False, use_se=False, se_kwargs=None, ffc3d=False, fft_norm='ortho'): # bn_layer not used super(FourierUnit, self).__init__() self.groups = groups self.conv_layer = torch.nn.Conv2d(in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0), out_channels=out_channels * 2, kernel_size=1, stride=1, padding=0, groups=self.groups, bias=False) self.bn = torch.nn.BatchNorm2d(out_channels * 2) self.relu = torch.nn.ReLU(inplace=True) # squeeze and excitation block self.use_se = use_se if use_se: if se_kwargs is None: se_kwargs = {} self.se = SELayer(self.conv_layer.in_channels, **se_kwargs) self.spatial_scale_factor = spatial_scale_factor self.spatial_scale_mode = spatial_scale_mode self.spectral_pos_encoding = spectral_pos_encoding self.ffc3d = ffc3d self.fft_norm = fft_norm def forward(self, x): batch = x.shape[0] if self.spatial_scale_factor is not None: orig_size = x.shape[-2:] x = F.interpolate(x, scale_factor=self.spatial_scale_factor, mode=self.spatial_scale_mode, align_corners=False) r_size = x.size() # (batch, c, h, w/2+1, 2) fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1) ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm) ffted = torch.stack((ffted.real, ffted.imag), dim=-1) ffted = ffted.permute(0, 1, 4, 2, 3).contiguous() # (batch, c, 2, h, w/2+1) ffted = ffted.view((batch, -1,) + ffted.size()[3:]) if self.spectral_pos_encoding: height, width = ffted.shape[-2:] coords_vert = torch.linspace(0, 1, height)[None, None, :, None].expand(batch, 1, height, width).to(ffted) coords_hor = torch.linspace(0, 1, width)[None, None, None, :].expand(batch, 1, height, width).to(ffted) ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1) if self.use_se: ffted = self.se(ffted) ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1) ffted = self.relu(self.bn(ffted)) ffted = ffted.view((batch, -1, 2,) + ffted.size()[2:]).permute( 0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2) ffted = torch.complex(ffted[..., 0], ffted[..., 1]) ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:] output = torch.fft.irfftn(ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm) if self.spatial_scale_factor is not None: output = F.interpolate(output, size=orig_size, mode=self.spatial_scale_mode, align_corners=False) return output class SeparableFourierUnit(nn.Module): def __init__(self, in_channels, out_channels, groups=1, kernel_size=3): # bn_layer not used super(SeparableFourierUnit, self).__init__() self.groups = groups row_out_channels = out_channels // 2 col_out_channels = out_channels - row_out_channels self.row_conv = torch.nn.Conv2d(in_channels=in_channels * 2, out_channels=row_out_channels * 2, kernel_size=(kernel_size, 1), # kernel size is always like this, but the data will be transposed stride=1, padding=(kernel_size // 2, 0), padding_mode='reflect', groups=self.groups, bias=False) self.col_conv = torch.nn.Conv2d(in_channels=in_channels * 2, out_channels=col_out_channels * 2, kernel_size=(kernel_size, 1), # kernel size is always like this, but the data will be transposed stride=1, padding=(kernel_size // 2, 0), padding_mode='reflect', groups=self.groups, bias=False) self.row_bn = torch.nn.BatchNorm2d(row_out_channels * 2) self.col_bn = torch.nn.BatchNorm2d(col_out_channels * 2) self.relu = torch.nn.ReLU(inplace=True) def process_branch(self, x, conv, bn): batch = x.shape[0] r_size = x.size() # (batch, c, h, w/2+1, 2) ffted = torch.fft.rfft(x, norm="ortho") ffted = torch.stack((ffted.real, ffted.imag), dim=-1) ffted = ffted.permute(0, 1, 4, 2, 3).contiguous() # (batch, c, 2, h, w/2+1) ffted = ffted.view((batch, -1,) + ffted.size()[3:]) ffted = self.relu(bn(conv(ffted))) ffted = ffted.view((batch, -1, 2,) + ffted.size()[2:]).permute( 0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2) ffted = torch.complex(ffted[..., 0], ffted[..., 1]) output = torch.fft.irfft(ffted, s=x.shape[-1:], norm="ortho") return output def forward(self, x): rowwise = self.process_branch(x, self.row_conv, self.row_bn) colwise = self.process_branch(x.permute(0, 1, 3, 2), self.col_conv, self.col_bn).permute(0, 1, 3, 2) out = torch.cat((rowwise, colwise), dim=1) return out class SpectralTransform(nn.Module): def __init__(self, in_channels, out_channels, stride=1, groups=1, enable_lfu=True, separable_fu=False, **fu_kwargs): # bn_layer not used super(SpectralTransform, self).__init__() self.enable_lfu = enable_lfu if stride == 2: self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2) else: self.downsample = nn.Identity() self.stride = stride self.conv1 = nn.Sequential( nn.Conv2d(in_channels, out_channels // 2, kernel_size=1, groups=groups, bias=False), nn.BatchNorm2d(out_channels // 2), nn.ReLU(inplace=True) ) fu_class = SeparableFourierUnit if separable_fu else FourierUnit self.fu = fu_class( out_channels // 2, out_channels // 2, groups, **fu_kwargs) if self.enable_lfu: self.lfu = fu_class( out_channels // 2, out_channels // 2, groups) self.conv2 = torch.nn.Conv2d( out_channels // 2, out_channels, kernel_size=1, groups=groups, bias=False) def forward(self, x): x = self.downsample(x) x = self.conv1(x) output = self.fu(x) if self.enable_lfu: n, c, h, w = x.shape split_no = 2 split_s = h // split_no xs = torch.cat(torch.split( x[:, :c // 4], split_s, dim=-2), dim=1).contiguous() xs = torch.cat(torch.split(xs, split_s, dim=-1), dim=1).contiguous() xs = self.lfu(xs) xs = xs.repeat(1, 1, split_no, split_no).contiguous() else: xs = 0 output = self.conv2(x + output + xs) return output class FFC(nn.Module): def __init__(self, in_channels, out_channels, kernel_size, ratio_gin, ratio_gout, stride=1, padding=0, dilation=1, groups=1, bias=False, enable_lfu=True, padding_type='reflect', gated=False, **spectral_kwargs): super(FFC, self).__init__() assert stride == 1 or stride == 2, "Stride should be 1 or 2." self.stride = stride in_cg = int(in_channels * ratio_gin) in_cl = in_channels - in_cg out_cg = int(out_channels * ratio_gout) out_cl = out_channels - out_cg #groups_g = 1 if groups == 1 else int(groups * ratio_gout) #groups_l = 1 if groups == 1 else groups - groups_g self.ratio_gin = ratio_gin self.ratio_gout = ratio_gout self.global_in_num = in_cg module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d self.convl2l = module(in_cl, out_cl, kernel_size, stride, padding, dilation, groups, bias, padding_mode=padding_type) module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d self.convl2g = module(in_cl, out_cg, kernel_size, stride, padding, dilation, groups, bias, padding_mode=padding_type) module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d self.convg2l = module(in_cg, out_cl, kernel_size, stride, padding, dilation, groups, bias, padding_mode=padding_type) module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform self.convg2g = module( in_cg, out_cg, stride, 1 if groups == 1 else groups // 2, enable_lfu, **spectral_kwargs) self.gated = gated module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d self.gate = module(in_channels, 2, 1) def forward(self, x): x_l, x_g = x if type(x) is tuple else (x, 0) out_xl, out_xg = 0, 0 if self.gated: total_input_parts = [x_l] if torch.is_tensor(x_g): total_input_parts.append(x_g) total_input = torch.cat(total_input_parts, dim=1) gates = torch.sigmoid(self.gate(total_input)) g2l_gate, l2g_gate = gates.chunk(2, dim=1) else: g2l_gate, l2g_gate = 1, 1 if self.ratio_gout != 1: out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate if self.ratio_gout != 0: out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g) return out_xl, out_xg class FFC_BN_ACT(nn.Module): def __init__(self, in_channels, out_channels, kernel_size, ratio_gin, ratio_gout, stride=1, padding=0, dilation=1, groups=1, bias=False, norm_layer=nn.BatchNorm2d, activation_layer=nn.Identity, padding_type='reflect', enable_lfu=True, **kwargs): super(FFC_BN_ACT, self).__init__() self.ffc = FFC(in_channels, out_channels, kernel_size, ratio_gin, ratio_gout, stride, padding, dilation, groups, bias, enable_lfu, padding_type=padding_type, **kwargs) lnorm = nn.Identity if ratio_gout == 1 else norm_layer gnorm = nn.Identity if ratio_gout == 0 else norm_layer global_channels = int(out_channels * ratio_gout) self.bn_l = lnorm(out_channels - global_channels) self.bn_g = gnorm(global_channels) lact = nn.Identity if ratio_gout == 1 else activation_layer gact = nn.Identity if ratio_gout == 0 else activation_layer self.act_l = lact(inplace=True) self.act_g = gact(inplace=True) def forward(self, x): x_l, x_g = self.ffc(x) x_l = self.act_l(self.bn_l(x_l)) x_g = self.act_g(self.bn_g(x_g)) return x_l, x_g class FFCResnetBlock(nn.Module): def __init__(self, dim, padding_type, norm_layer, activation_layer=nn.ReLU, dilation=1, spatial_transform_kwargs=None, inline=False, **conv_kwargs): super().__init__() self.conv1 = FFC_BN_ACT(dim, dim, kernel_size=3, padding=dilation, dilation=dilation, norm_layer=norm_layer, activation_layer=activation_layer, padding_type=padding_type, **conv_kwargs) self.conv2 = FFC_BN_ACT(dim, dim, kernel_size=3, padding=dilation, dilation=dilation, norm_layer=norm_layer, activation_layer=activation_layer, padding_type=padding_type, **conv_kwargs) if spatial_transform_kwargs is not None: self.conv1 = LearnableSpatialTransformWrapper(self.conv1, **spatial_transform_kwargs) self.conv2 = LearnableSpatialTransformWrapper(self.conv2, **spatial_transform_kwargs) self.inline = inline def forward(self, x): if self.inline: x_l, x_g = x[:, :-self.conv1.ffc.global_in_num], x[:, -self.conv1.ffc.global_in_num:] else: x_l, x_g = x if type(x) is tuple else (x, 0) id_l, id_g = x_l, x_g x_l, x_g = self.conv1((x_l, x_g)) x_l, x_g = self.conv2((x_l, x_g)) x_l, x_g = id_l + x_l, id_g + x_g out = x_l, x_g if self.inline: out = torch.cat(out, dim=1) return out class ConcatTupleLayer(nn.Module): def forward(self, x): assert isinstance(x, tuple) x_l, x_g = x assert torch.is_tensor(x_l) or torch.is_tensor(x_g) if not torch.is_tensor(x_g): return x_l return torch.cat(x, dim=1) class FFCResNetGenerator(nn.Module): def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d, padding_type='reflect', activation_layer=nn.ReLU, up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True), init_conv_kwargs={}, downsample_conv_kwargs={}, resnet_conv_kwargs={}, spatial_transform_layers=None, spatial_transform_kwargs={}, add_out_act=True, max_features=1024, out_ffc=False, out_ffc_kwargs={}): assert (n_blocks >= 0) super().__init__() model = [nn.ReflectionPad2d(3), FFC_BN_ACT(input_nc, ngf, kernel_size=7, padding=0, norm_layer=norm_layer, activation_layer=activation_layer, **init_conv_kwargs)] ### downsample for i in range(n_downsampling): mult = 2 ** i if i == n_downsampling - 1: cur_conv_kwargs = dict(downsample_conv_kwargs) cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get('ratio_gin', 0) else: cur_conv_kwargs = downsample_conv_kwargs model += [FFC_BN_ACT(min(max_features, ngf * mult), min(max_features, ngf * mult * 2), kernel_size=3, stride=2, padding=1, norm_layer=norm_layer, activation_layer=activation_layer, **cur_conv_kwargs)] mult = 2 ** n_downsampling feats_num_bottleneck = min(max_features, ngf * mult) ### resnet blocks for i in range(n_blocks): cur_resblock = FFCResnetBlock(feats_num_bottleneck, padding_type=padding_type, activation_layer=activation_layer, norm_layer=norm_layer, **resnet_conv_kwargs) if spatial_transform_layers is not None and i in spatial_transform_layers: cur_resblock = LearnableSpatialTransformWrapper(cur_resblock, **spatial_transform_kwargs) model += [cur_resblock] model += [ConcatTupleLayer()] ### upsample for i in range(n_downsampling): mult = 2 ** (n_downsampling - i) model += [nn.ConvTranspose2d(min(max_features, ngf * mult), min(max_features, int(ngf * mult / 2)), kernel_size=3, stride=2, padding=1, output_padding=1), up_norm_layer(min(max_features, int(ngf * mult / 2))), up_activation] if out_ffc: model += [FFCResnetBlock(ngf, padding_type=padding_type, activation_layer=activation_layer, norm_layer=norm_layer, inline=True, **out_ffc_kwargs)] model += [nn.ReflectionPad2d(3), nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)] if add_out_act: model.append(get_activation('tanh' if add_out_act is True else add_out_act)) self.model = nn.Sequential(*model) def forward(self, input): return self.model(input) class FFCNLayerDiscriminator(BaseDiscriminator): def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d, max_features=512, init_conv_kwargs={}, conv_kwargs={}): super().__init__() self.n_layers = n_layers def _act_ctor(inplace=True): return nn.LeakyReLU(negative_slope=0.2, inplace=inplace) kw = 3 padw = int(np.ceil((kw-1.0)/2)) sequence = [[FFC_BN_ACT(input_nc, ndf, kernel_size=kw, padding=padw, norm_layer=norm_layer, activation_layer=_act_ctor, **init_conv_kwargs)]] nf = ndf for n in range(1, n_layers): nf_prev = nf nf = min(nf * 2, max_features) cur_model = [ FFC_BN_ACT(nf_prev, nf, kernel_size=kw, stride=2, padding=padw, norm_layer=norm_layer, activation_layer=_act_ctor, **conv_kwargs) ] sequence.append(cur_model) nf_prev = nf nf = min(nf * 2, 512) cur_model = [ FFC_BN_ACT(nf_prev, nf, kernel_size=kw, stride=1, padding=padw, norm_layer=norm_layer, activation_layer=lambda *args, **kwargs: nn.LeakyReLU(*args, negative_slope=0.2, **kwargs), **conv_kwargs), ConcatTupleLayer() ] sequence.append(cur_model) sequence += [[nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)]] for n in range(len(sequence)): setattr(self, 'model'+str(n), nn.Sequential(*sequence[n])) def get_all_activations(self, x): res = [x] for n in range(self.n_layers + 2): model = getattr(self, 'model' + str(n)) res.append(model(res[-1])) return res[1:] def forward(self, x): act = self.get_all_activations(x) feats = [] for out in act[:-1]: if isinstance(out, tuple): if torch.is_tensor(out[1]): out = torch.cat(out, dim=1) else: out = out[0] feats.append(out) return act[-1], feats ================================================ FILE: annotator/lama/saicinpainting/training/modules/multidilated_conv.py ================================================ import torch import torch.nn as nn import random from annotator.lama.saicinpainting.training.modules.depthwise_sep_conv import DepthWiseSeperableConv class MultidilatedConv(nn.Module): def __init__(self, in_dim, out_dim, kernel_size, dilation_num=3, comb_mode='sum', equal_dim=True, shared_weights=False, padding=1, min_dilation=1, shuffle_in_channels=False, use_depthwise=False, **kwargs): super().__init__() convs = [] self.equal_dim = equal_dim assert comb_mode in ('cat_out', 'sum', 'cat_in', 'cat_both'), comb_mode if comb_mode in ('cat_out', 'cat_both'): self.cat_out = True if equal_dim: assert out_dim % dilation_num == 0 out_dims = [out_dim // dilation_num] * dilation_num self.index = sum([[i + j * (out_dims[0]) for j in range(dilation_num)] for i in range(out_dims[0])], []) else: out_dims = [out_dim // 2 ** (i + 1) for i in range(dilation_num - 1)] out_dims.append(out_dim - sum(out_dims)) index = [] starts = [0] + out_dims[:-1] lengths = [out_dims[i] // out_dims[-1] for i in range(dilation_num)] for i in range(out_dims[-1]): for j in range(dilation_num): index += list(range(starts[j], starts[j] + lengths[j])) starts[j] += lengths[j] self.index = index assert(len(index) == out_dim) self.out_dims = out_dims else: self.cat_out = False self.out_dims = [out_dim] * dilation_num if comb_mode in ('cat_in', 'cat_both'): if equal_dim: assert in_dim % dilation_num == 0 in_dims = [in_dim // dilation_num] * dilation_num else: in_dims = [in_dim // 2 ** (i + 1) for i in range(dilation_num - 1)] in_dims.append(in_dim - sum(in_dims)) self.in_dims = in_dims self.cat_in = True else: self.cat_in = False self.in_dims = [in_dim] * dilation_num conv_type = DepthWiseSeperableConv if use_depthwise else nn.Conv2d dilation = min_dilation for i in range(dilation_num): if isinstance(padding, int): cur_padding = padding * dilation else: cur_padding = padding[i] convs.append(conv_type( self.in_dims[i], self.out_dims[i], kernel_size, padding=cur_padding, dilation=dilation, **kwargs )) if i > 0 and shared_weights: convs[-1].weight = convs[0].weight convs[-1].bias = convs[0].bias dilation *= 2 self.convs = nn.ModuleList(convs) self.shuffle_in_channels = shuffle_in_channels if self.shuffle_in_channels: # shuffle list as shuffling of tensors is nondeterministic in_channels_permute = list(range(in_dim)) random.shuffle(in_channels_permute) # save as buffer so it is saved and loaded with checkpoint self.register_buffer('in_channels_permute', torch.tensor(in_channels_permute)) def forward(self, x): if self.shuffle_in_channels: x = x[:, self.in_channels_permute] outs = [] if self.cat_in: if self.equal_dim: x = x.chunk(len(self.convs), dim=1) else: new_x = [] start = 0 for dim in self.in_dims: new_x.append(x[:, start:start+dim]) start += dim x = new_x for i, conv in enumerate(self.convs): if self.cat_in: input = x[i] else: input = x outs.append(conv(input)) if self.cat_out: out = torch.cat(outs, dim=1)[:, self.index] else: out = sum(outs) return out ================================================ FILE: annotator/lama/saicinpainting/training/modules/multiscale.py ================================================ from typing import List, Tuple, Union, Optional import torch import torch.nn as nn import torch.nn.functional as F from annotator.lama.saicinpainting.training.modules.base import get_conv_block_ctor, get_activation from annotator.lama.saicinpainting.training.modules.pix2pixhd import ResnetBlock class ResNetHead(nn.Module): def __init__(self, input_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d, padding_type='reflect', conv_kind='default', activation=nn.ReLU(True)): assert (n_blocks >= 0) super(ResNetHead, self).__init__() conv_layer = get_conv_block_ctor(conv_kind) model = [nn.ReflectionPad2d(3), conv_layer(input_nc, ngf, kernel_size=7, padding=0), norm_layer(ngf), activation] ### downsample for i in range(n_downsampling): mult = 2 ** i model += [conv_layer(ngf * mult, ngf * mult * 2, kernel_size=3, stride=2, padding=1), norm_layer(ngf * mult * 2), activation] mult = 2 ** n_downsampling ### resnet blocks for i in range(n_blocks): model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer, conv_kind=conv_kind)] self.model = nn.Sequential(*model) def forward(self, input): return self.model(input) class ResNetTail(nn.Module): def __init__(self, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d, padding_type='reflect', conv_kind='default', activation=nn.ReLU(True), up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True), add_out_act=False, out_extra_layers_n=0, add_in_proj=None): assert (n_blocks >= 0) super(ResNetTail, self).__init__() mult = 2 ** n_downsampling model = [] if add_in_proj is not None: model.append(nn.Conv2d(add_in_proj, ngf * mult, kernel_size=1)) ### resnet blocks for i in range(n_blocks): model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer, conv_kind=conv_kind)] ### upsample for i in range(n_downsampling): mult = 2 ** (n_downsampling - i) model += [nn.ConvTranspose2d(ngf * mult, int(ngf * mult / 2), kernel_size=3, stride=2, padding=1, output_padding=1), up_norm_layer(int(ngf * mult / 2)), up_activation] self.model = nn.Sequential(*model) out_layers = [] for _ in range(out_extra_layers_n): out_layers += [nn.Conv2d(ngf, ngf, kernel_size=1, padding=0), up_norm_layer(ngf), up_activation] out_layers += [nn.ReflectionPad2d(3), nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)] if add_out_act: out_layers.append(get_activation('tanh' if add_out_act is True else add_out_act)) self.out_proj = nn.Sequential(*out_layers) def forward(self, input, return_last_act=False): features = self.model(input) out = self.out_proj(features) if return_last_act: return out, features else: return out class MultiscaleResNet(nn.Module): def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=2, n_blocks_head=2, n_blocks_tail=6, n_scales=3, norm_layer=nn.BatchNorm2d, padding_type='reflect', conv_kind='default', activation=nn.ReLU(True), up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True), add_out_act=False, out_extra_layers_n=0, out_cumulative=False, return_only_hr=False): super().__init__() self.heads = nn.ModuleList([ResNetHead(input_nc, ngf=ngf, n_downsampling=n_downsampling, n_blocks=n_blocks_head, norm_layer=norm_layer, padding_type=padding_type, conv_kind=conv_kind, activation=activation) for i in range(n_scales)]) tail_in_feats = ngf * (2 ** n_downsampling) + ngf self.tails = nn.ModuleList([ResNetTail(output_nc, ngf=ngf, n_downsampling=n_downsampling, n_blocks=n_blocks_tail, norm_layer=norm_layer, padding_type=padding_type, conv_kind=conv_kind, activation=activation, up_norm_layer=up_norm_layer, up_activation=up_activation, add_out_act=add_out_act, out_extra_layers_n=out_extra_layers_n, add_in_proj=None if (i == n_scales - 1) else tail_in_feats) for i in range(n_scales)]) self.out_cumulative = out_cumulative self.return_only_hr = return_only_hr @property def num_scales(self): return len(self.heads) def forward(self, ms_inputs: List[torch.Tensor], smallest_scales_num: Optional[int] = None) \ -> Union[torch.Tensor, List[torch.Tensor]]: """ :param ms_inputs: List of inputs of different resolutions from HR to LR :param smallest_scales_num: int or None, number of smallest scales to take at input :return: Depending on return_only_hr: True: Only the most HR output False: List of outputs of different resolutions from HR to LR """ if smallest_scales_num is None: assert len(self.heads) == len(ms_inputs), (len(self.heads), len(ms_inputs), smallest_scales_num) smallest_scales_num = len(self.heads) else: assert smallest_scales_num == len(ms_inputs) <= len(self.heads), (len(self.heads), len(ms_inputs), smallest_scales_num) cur_heads = self.heads[-smallest_scales_num:] ms_features = [cur_head(cur_inp) for cur_head, cur_inp in zip(cur_heads, ms_inputs)] all_outputs = [] prev_tail_features = None for i in range(len(ms_features)): scale_i = -i - 1 cur_tail_input = ms_features[-i - 1] if prev_tail_features is not None: if prev_tail_features.shape != cur_tail_input.shape: prev_tail_features = F.interpolate(prev_tail_features, size=cur_tail_input.shape[2:], mode='bilinear', align_corners=False) cur_tail_input = torch.cat((cur_tail_input, prev_tail_features), dim=1) cur_out, cur_tail_feats = self.tails[scale_i](cur_tail_input, return_last_act=True) prev_tail_features = cur_tail_feats all_outputs.append(cur_out) if self.out_cumulative: all_outputs_cum = [all_outputs[0]] for i in range(1, len(ms_features)): cur_out = all_outputs[i] cur_out_cum = cur_out + F.interpolate(all_outputs_cum[-1], size=cur_out.shape[2:], mode='bilinear', align_corners=False) all_outputs_cum.append(cur_out_cum) all_outputs = all_outputs_cum if self.return_only_hr: return all_outputs[-1] else: return all_outputs[::-1] class MultiscaleDiscriminatorSimple(nn.Module): def __init__(self, ms_impl): super().__init__() self.ms_impl = nn.ModuleList(ms_impl) @property def num_scales(self): return len(self.ms_impl) def forward(self, ms_inputs: List[torch.Tensor], smallest_scales_num: Optional[int] = None) \ -> List[Tuple[torch.Tensor, List[torch.Tensor]]]: """ :param ms_inputs: List of inputs of different resolutions from HR to LR :param smallest_scales_num: int or None, number of smallest scales to take at input :return: List of pairs (prediction, features) for different resolutions from HR to LR """ if smallest_scales_num is None: assert len(self.ms_impl) == len(ms_inputs), (len(self.ms_impl), len(ms_inputs), smallest_scales_num) smallest_scales_num = len(self.heads) else: assert smallest_scales_num == len(ms_inputs) <= len(self.ms_impl), \ (len(self.ms_impl), len(ms_inputs), smallest_scales_num) return [cur_discr(cur_input) for cur_discr, cur_input in zip(self.ms_impl[-smallest_scales_num:], ms_inputs)] class SingleToMultiScaleInputMixin: def forward(self, x: torch.Tensor) -> List: orig_height, orig_width = x.shape[2:] factors = [2 ** i for i in range(self.num_scales)] ms_inputs = [F.interpolate(x, size=(orig_height // f, orig_width // f), mode='bilinear', align_corners=False) for f in factors] return super().forward(ms_inputs) class GeneratorMultiToSingleOutputMixin: def forward(self, x): return super().forward(x)[0] class DiscriminatorMultiToSingleOutputMixin: def forward(self, x): out_feat_tuples = super().forward(x) return out_feat_tuples[0][0], [f for _, flist in out_feat_tuples for f in flist] class DiscriminatorMultiToSingleOutputStackedMixin: def __init__(self, *args, return_feats_only_levels=None, **kwargs): super().__init__(*args, **kwargs) self.return_feats_only_levels = return_feats_only_levels def forward(self, x): out_feat_tuples = super().forward(x) outs = [out for out, _ in out_feat_tuples] scaled_outs = [outs[0]] + [F.interpolate(cur_out, size=outs[0].shape[-2:], mode='bilinear', align_corners=False) for cur_out in outs[1:]] out = torch.cat(scaled_outs, dim=1) if self.return_feats_only_levels is not None: feat_lists = [out_feat_tuples[i][1] for i in self.return_feats_only_levels] else: feat_lists = [flist for _, flist in out_feat_tuples] feats = [f for flist in feat_lists for f in flist] return out, feats class MultiscaleDiscrSingleInput(SingleToMultiScaleInputMixin, DiscriminatorMultiToSingleOutputStackedMixin, MultiscaleDiscriminatorSimple): pass class MultiscaleResNetSingle(GeneratorMultiToSingleOutputMixin, SingleToMultiScaleInputMixin, MultiscaleResNet): pass ================================================ FILE: annotator/lama/saicinpainting/training/modules/pix2pixhd.py ================================================ # original: https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py import collections from functools import partial import functools import logging from collections import defaultdict import numpy as np import torch.nn as nn from annotator.lama.saicinpainting.training.modules.base import BaseDiscriminator, deconv_factory, get_conv_block_ctor, get_norm_layer, get_activation from annotator.lama.saicinpainting.training.modules.ffc import FFCResnetBlock from annotator.lama.saicinpainting.training.modules.multidilated_conv import MultidilatedConv class DotDict(defaultdict): # https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary """dot.notation access to dictionary attributes""" __getattr__ = defaultdict.get __setattr__ = defaultdict.__setitem__ __delattr__ = defaultdict.__delitem__ class Identity(nn.Module): def __init__(self): super().__init__() def forward(self, x): return x class ResnetBlock(nn.Module): def __init__(self, dim, padding_type, norm_layer, activation=nn.ReLU(True), use_dropout=False, conv_kind='default', dilation=1, in_dim=None, groups=1, second_dilation=None): super(ResnetBlock, self).__init__() self.in_dim = in_dim self.dim = dim if second_dilation is None: second_dilation = dilation self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, activation, use_dropout, conv_kind=conv_kind, dilation=dilation, in_dim=in_dim, groups=groups, second_dilation=second_dilation) if self.in_dim is not None: self.input_conv = nn.Conv2d(in_dim, dim, 1) self.out_channnels = dim def build_conv_block(self, dim, padding_type, norm_layer, activation, use_dropout, conv_kind='default', dilation=1, in_dim=None, groups=1, second_dilation=1): conv_layer = get_conv_block_ctor(conv_kind) conv_block = [] p = 0 if padding_type == 'reflect': conv_block += [nn.ReflectionPad2d(dilation)] elif padding_type == 'replicate': conv_block += [nn.ReplicationPad2d(dilation)] elif padding_type == 'zero': p = dilation else: raise NotImplementedError('padding [%s] is not implemented' % padding_type) if in_dim is None: in_dim = dim conv_block += [conv_layer(in_dim, dim, kernel_size=3, padding=p, dilation=dilation), norm_layer(dim), activation] if use_dropout: conv_block += [nn.Dropout(0.5)] p = 0 if padding_type == 'reflect': conv_block += [nn.ReflectionPad2d(second_dilation)] elif padding_type == 'replicate': conv_block += [nn.ReplicationPad2d(second_dilation)] elif padding_type == 'zero': p = second_dilation else: raise NotImplementedError('padding [%s] is not implemented' % padding_type) conv_block += [conv_layer(dim, dim, kernel_size=3, padding=p, dilation=second_dilation, groups=groups), norm_layer(dim)] return nn.Sequential(*conv_block) def forward(self, x): x_before = x if self.in_dim is not None: x = self.input_conv(x) out = x + self.conv_block(x_before) return out class ResnetBlock5x5(nn.Module): def __init__(self, dim, padding_type, norm_layer, activation=nn.ReLU(True), use_dropout=False, conv_kind='default', dilation=1, in_dim=None, groups=1, second_dilation=None): super(ResnetBlock5x5, self).__init__() self.in_dim = in_dim self.dim = dim if second_dilation is None: second_dilation = dilation self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, activation, use_dropout, conv_kind=conv_kind, dilation=dilation, in_dim=in_dim, groups=groups, second_dilation=second_dilation) if self.in_dim is not None: self.input_conv = nn.Conv2d(in_dim, dim, 1) self.out_channnels = dim def build_conv_block(self, dim, padding_type, norm_layer, activation, use_dropout, conv_kind='default', dilation=1, in_dim=None, groups=1, second_dilation=1): conv_layer = get_conv_block_ctor(conv_kind) conv_block = [] p = 0 if padding_type == 'reflect': conv_block += [nn.ReflectionPad2d(dilation * 2)] elif padding_type == 'replicate': conv_block += [nn.ReplicationPad2d(dilation * 2)] elif padding_type == 'zero': p = dilation * 2 else: raise NotImplementedError('padding [%s] is not implemented' % padding_type) if in_dim is None: in_dim = dim conv_block += [conv_layer(in_dim, dim, kernel_size=5, padding=p, dilation=dilation), norm_layer(dim), activation] if use_dropout: conv_block += [nn.Dropout(0.5)] p = 0 if padding_type == 'reflect': conv_block += [nn.ReflectionPad2d(second_dilation * 2)] elif padding_type == 'replicate': conv_block += [nn.ReplicationPad2d(second_dilation * 2)] elif padding_type == 'zero': p = second_dilation * 2 else: raise NotImplementedError('padding [%s] is not implemented' % padding_type) conv_block += [conv_layer(dim, dim, kernel_size=5, padding=p, dilation=second_dilation, groups=groups), norm_layer(dim)] return nn.Sequential(*conv_block) def forward(self, x): x_before = x if self.in_dim is not None: x = self.input_conv(x) out = x + self.conv_block(x_before) return out class MultidilatedResnetBlock(nn.Module): def __init__(self, dim, padding_type, conv_layer, norm_layer, activation=nn.ReLU(True), use_dropout=False): super().__init__() self.conv_block = self.build_conv_block(dim, padding_type, conv_layer, norm_layer, activation, use_dropout) def build_conv_block(self, dim, padding_type, conv_layer, norm_layer, activation, use_dropout, dilation=1): conv_block = [] conv_block += [conv_layer(dim, dim, kernel_size=3, padding_mode=padding_type), norm_layer(dim), activation] if use_dropout: conv_block += [nn.Dropout(0.5)] conv_block += [conv_layer(dim, dim, kernel_size=3, padding_mode=padding_type), norm_layer(dim)] return nn.Sequential(*conv_block) def forward(self, x): out = x + self.conv_block(x) return out class MultiDilatedGlobalGenerator(nn.Module): def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=3, norm_layer=nn.BatchNorm2d, padding_type='reflect', conv_kind='default', deconv_kind='convtranspose', activation=nn.ReLU(True), up_norm_layer=nn.BatchNorm2d, affine=None, up_activation=nn.ReLU(True), add_out_act=True, max_features=1024, multidilation_kwargs={}, ffc_positions=None, ffc_kwargs={}): assert (n_blocks >= 0) super().__init__() conv_layer = get_conv_block_ctor(conv_kind) resnet_conv_layer = functools.partial(get_conv_block_ctor('multidilated'), **multidilation_kwargs) norm_layer = get_norm_layer(norm_layer) if affine is not None: norm_layer = partial(norm_layer, affine=affine) up_norm_layer = get_norm_layer(up_norm_layer) if affine is not None: up_norm_layer = partial(up_norm_layer, affine=affine) model = [nn.ReflectionPad2d(3), conv_layer(input_nc, ngf, kernel_size=7, padding=0), norm_layer(ngf), activation] identity = Identity() ### downsample for i in range(n_downsampling): mult = 2 ** i model += [conv_layer(min(max_features, ngf * mult), min(max_features, ngf * mult * 2), kernel_size=3, stride=2, padding=1), norm_layer(min(max_features, ngf * mult * 2)), activation] mult = 2 ** n_downsampling feats_num_bottleneck = min(max_features, ngf * mult) ### resnet blocks for i in range(n_blocks): if ffc_positions is not None and i in ffc_positions: model += [FFCResnetBlock(feats_num_bottleneck, padding_type, norm_layer, activation_layer=nn.ReLU, inline=True, **ffc_kwargs)] model += [MultidilatedResnetBlock(feats_num_bottleneck, padding_type=padding_type, conv_layer=resnet_conv_layer, activation=activation, norm_layer=norm_layer)] ### upsample for i in range(n_downsampling): mult = 2 ** (n_downsampling - i) model += deconv_factory(deconv_kind, ngf, mult, up_norm_layer, up_activation, max_features) model += [nn.ReflectionPad2d(3), nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)] if add_out_act: model.append(get_activation('tanh' if add_out_act is True else add_out_act)) self.model = nn.Sequential(*model) def forward(self, input): return self.model(input) class ConfigGlobalGenerator(nn.Module): def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=3, norm_layer=nn.BatchNorm2d, padding_type='reflect', conv_kind='default', deconv_kind='convtranspose', activation=nn.ReLU(True), up_norm_layer=nn.BatchNorm2d, affine=None, up_activation=nn.ReLU(True), add_out_act=True, max_features=1024, manual_block_spec=[], resnet_block_kind='multidilatedresnetblock', resnet_conv_kind='multidilated', resnet_dilation=1, multidilation_kwargs={}): assert (n_blocks >= 0) super().__init__() conv_layer = get_conv_block_ctor(conv_kind) resnet_conv_layer = functools.partial(get_conv_block_ctor(resnet_conv_kind), **multidilation_kwargs) norm_layer = get_norm_layer(norm_layer) if affine is not None: norm_layer = partial(norm_layer, affine=affine) up_norm_layer = get_norm_layer(up_norm_layer) if affine is not None: up_norm_layer = partial(up_norm_layer, affine=affine) model = [nn.ReflectionPad2d(3), conv_layer(input_nc, ngf, kernel_size=7, padding=0), norm_layer(ngf), activation] identity = Identity() ### downsample for i in range(n_downsampling): mult = 2 ** i model += [conv_layer(min(max_features, ngf * mult), min(max_features, ngf * mult * 2), kernel_size=3, stride=2, padding=1), norm_layer(min(max_features, ngf * mult * 2)), activation] mult = 2 ** n_downsampling feats_num_bottleneck = min(max_features, ngf * mult) if len(manual_block_spec) == 0: manual_block_spec = [ DotDict(lambda : None, { 'n_blocks': n_blocks, 'use_default': True}) ] ### resnet blocks for block_spec in manual_block_spec: def make_and_add_blocks(model, block_spec): block_spec = DotDict(lambda : None, block_spec) if not block_spec.use_default: resnet_conv_layer = functools.partial(get_conv_block_ctor(block_spec.resnet_conv_kind), **block_spec.multidilation_kwargs) resnet_conv_kind = block_spec.resnet_conv_kind resnet_block_kind = block_spec.resnet_block_kind if block_spec.resnet_dilation is not None: resnet_dilation = block_spec.resnet_dilation for i in range(block_spec.n_blocks): if resnet_block_kind == "multidilatedresnetblock": model += [MultidilatedResnetBlock(feats_num_bottleneck, padding_type=padding_type, conv_layer=resnet_conv_layer, activation=activation, norm_layer=norm_layer)] if resnet_block_kind == "resnetblock": model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer, conv_kind=resnet_conv_kind)] if resnet_block_kind == "resnetblock5x5": model += [ResnetBlock5x5(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer, conv_kind=resnet_conv_kind)] if resnet_block_kind == "resnetblockdwdil": model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer, conv_kind=resnet_conv_kind, dilation=resnet_dilation, second_dilation=resnet_dilation)] make_and_add_blocks(model, block_spec) ### upsample for i in range(n_downsampling): mult = 2 ** (n_downsampling - i) model += deconv_factory(deconv_kind, ngf, mult, up_norm_layer, up_activation, max_features) model += [nn.ReflectionPad2d(3), nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)] if add_out_act: model.append(get_activation('tanh' if add_out_act is True else add_out_act)) self.model = nn.Sequential(*model) def forward(self, input): return self.model(input) def make_dil_blocks(dilated_blocks_n, dilation_block_kind, dilated_block_kwargs): blocks = [] for i in range(dilated_blocks_n): if dilation_block_kind == 'simple': blocks.append(ResnetBlock(**dilated_block_kwargs, dilation=2 ** (i + 1))) elif dilation_block_kind == 'multi': blocks.append(MultidilatedResnetBlock(**dilated_block_kwargs)) else: raise ValueError(f'dilation_block_kind could not be "{dilation_block_kind}"') return blocks class GlobalGenerator(nn.Module): def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d, padding_type='reflect', conv_kind='default', activation=nn.ReLU(True), up_norm_layer=nn.BatchNorm2d, affine=None, up_activation=nn.ReLU(True), dilated_blocks_n=0, dilated_blocks_n_start=0, dilated_blocks_n_middle=0, add_out_act=True, max_features=1024, is_resblock_depthwise=False, ffc_positions=None, ffc_kwargs={}, dilation=1, second_dilation=None, dilation_block_kind='simple', multidilation_kwargs={}): assert (n_blocks >= 0) super().__init__() conv_layer = get_conv_block_ctor(conv_kind) norm_layer = get_norm_layer(norm_layer) if affine is not None: norm_layer = partial(norm_layer, affine=affine) up_norm_layer = get_norm_layer(up_norm_layer) if affine is not None: up_norm_layer = partial(up_norm_layer, affine=affine) if ffc_positions is not None: ffc_positions = collections.Counter(ffc_positions) model = [nn.ReflectionPad2d(3), conv_layer(input_nc, ngf, kernel_size=7, padding=0), norm_layer(ngf), activation] identity = Identity() ### downsample for i in range(n_downsampling): mult = 2 ** i model += [conv_layer(min(max_features, ngf * mult), min(max_features, ngf * mult * 2), kernel_size=3, stride=2, padding=1), norm_layer(min(max_features, ngf * mult * 2)), activation] mult = 2 ** n_downsampling feats_num_bottleneck = min(max_features, ngf * mult) dilated_block_kwargs = dict(dim=feats_num_bottleneck, padding_type=padding_type, activation=activation, norm_layer=norm_layer) if dilation_block_kind == 'simple': dilated_block_kwargs['conv_kind'] = conv_kind elif dilation_block_kind == 'multi': dilated_block_kwargs['conv_layer'] = functools.partial( get_conv_block_ctor('multidilated'), **multidilation_kwargs) # dilated blocks at the start of the bottleneck sausage if dilated_blocks_n_start is not None and dilated_blocks_n_start > 0: model += make_dil_blocks(dilated_blocks_n_start, dilation_block_kind, dilated_block_kwargs) # resnet blocks for i in range(n_blocks): # dilated blocks at the middle of the bottleneck sausage if i == n_blocks // 2 and dilated_blocks_n_middle is not None and dilated_blocks_n_middle > 0: model += make_dil_blocks(dilated_blocks_n_middle, dilation_block_kind, dilated_block_kwargs) if ffc_positions is not None and i in ffc_positions: for _ in range(ffc_positions[i]): # same position can occur more than once model += [FFCResnetBlock(feats_num_bottleneck, padding_type, norm_layer, activation_layer=nn.ReLU, inline=True, **ffc_kwargs)] if is_resblock_depthwise: resblock_groups = feats_num_bottleneck else: resblock_groups = 1 model += [ResnetBlock(feats_num_bottleneck, padding_type=padding_type, activation=activation, norm_layer=norm_layer, conv_kind=conv_kind, groups=resblock_groups, dilation=dilation, second_dilation=second_dilation)] # dilated blocks at the end of the bottleneck sausage if dilated_blocks_n is not None and dilated_blocks_n > 0: model += make_dil_blocks(dilated_blocks_n, dilation_block_kind, dilated_block_kwargs) # upsample for i in range(n_downsampling): mult = 2 ** (n_downsampling - i) model += [nn.ConvTranspose2d(min(max_features, ngf * mult), min(max_features, int(ngf * mult / 2)), kernel_size=3, stride=2, padding=1, output_padding=1), up_norm_layer(min(max_features, int(ngf * mult / 2))), up_activation] model += [nn.ReflectionPad2d(3), nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)] if add_out_act: model.append(get_activation('tanh' if add_out_act is True else add_out_act)) self.model = nn.Sequential(*model) def forward(self, input): return self.model(input) class GlobalGeneratorGated(GlobalGenerator): def __init__(self, *args, **kwargs): real_kwargs=dict( conv_kind='gated_bn_relu', activation=nn.Identity(), norm_layer=nn.Identity ) real_kwargs.update(kwargs) super().__init__(*args, **real_kwargs) class GlobalGeneratorFromSuperChannels(nn.Module): def __init__(self, input_nc, output_nc, n_downsampling, n_blocks, super_channels, norm_layer="bn", padding_type='reflect', add_out_act=True): super().__init__() self.n_downsampling = n_downsampling norm_layer = get_norm_layer(norm_layer) if type(norm_layer) == functools.partial: use_bias = (norm_layer.func == nn.InstanceNorm2d) else: use_bias = (norm_layer == nn.InstanceNorm2d) channels = self.convert_super_channels(super_channels) self.channels = channels model = [nn.ReflectionPad2d(3), nn.Conv2d(input_nc, channels[0], kernel_size=7, padding=0, bias=use_bias), norm_layer(channels[0]), nn.ReLU(True)] for i in range(n_downsampling): # add downsampling layers mult = 2 ** i model += [nn.Conv2d(channels[0+i], channels[1+i], kernel_size=3, stride=2, padding=1, bias=use_bias), norm_layer(channels[1+i]), nn.ReLU(True)] mult = 2 ** n_downsampling n_blocks1 = n_blocks // 3 n_blocks2 = n_blocks1 n_blocks3 = n_blocks - n_blocks1 - n_blocks2 for i in range(n_blocks1): c = n_downsampling dim = channels[c] model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer)] for i in range(n_blocks2): c = n_downsampling+1 dim = channels[c] kwargs = {} if i == 0: kwargs = {"in_dim": channels[c-1]} model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer, **kwargs)] for i in range(n_blocks3): c = n_downsampling+2 dim = channels[c] kwargs = {} if i == 0: kwargs = {"in_dim": channels[c-1]} model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer, **kwargs)] for i in range(n_downsampling): # add upsampling layers mult = 2 ** (n_downsampling - i) model += [nn.ConvTranspose2d(channels[n_downsampling+3+i], channels[n_downsampling+3+i+1], kernel_size=3, stride=2, padding=1, output_padding=1, bias=use_bias), norm_layer(channels[n_downsampling+3+i+1]), nn.ReLU(True)] model += [nn.ReflectionPad2d(3)] model += [nn.Conv2d(channels[2*n_downsampling+3], output_nc, kernel_size=7, padding=0)] if add_out_act: model.append(get_activation('tanh' if add_out_act is True else add_out_act)) self.model = nn.Sequential(*model) def convert_super_channels(self, super_channels): n_downsampling = self.n_downsampling result = [] cnt = 0 if n_downsampling == 2: N1 = 10 elif n_downsampling == 3: N1 = 13 else: raise NotImplementedError for i in range(0, N1): if i in [1,4,7,10]: channel = super_channels[cnt] * (2 ** cnt) config = {'channel': channel} result.append(channel) logging.info(f"Downsample channels {result[-1]}") cnt += 1 for i in range(3): for counter, j in enumerate(range(N1 + i * 3, N1 + 3 + i * 3)): if len(super_channels) == 6: channel = super_channels[3] * 4 else: channel = super_channels[i + 3] * 4 config = {'channel': channel} if counter == 0: result.append(channel) logging.info(f"Bottleneck channels {result[-1]}") cnt = 2 for i in range(N1+9, N1+21): if i in [22, 25,28]: cnt -= 1 if len(super_channels) == 6: channel = super_channels[5 - cnt] * (2 ** cnt) else: channel = super_channels[7 - cnt] * (2 ** cnt) result.append(int(channel)) logging.info(f"Upsample channels {result[-1]}") return result def forward(self, input): return self.model(input) # Defines the PatchGAN discriminator with the specified arguments. class NLayerDiscriminator(BaseDiscriminator): def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d,): super().__init__() self.n_layers = n_layers kw = 4 padw = int(np.ceil((kw-1.0)/2)) sequence = [[nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]] nf = ndf for n in range(1, n_layers): nf_prev = nf nf = min(nf * 2, 512) cur_model = [] cur_model += [ nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw), norm_layer(nf), nn.LeakyReLU(0.2, True) ] sequence.append(cur_model) nf_prev = nf nf = min(nf * 2, 512) cur_model = [] cur_model += [ nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw), norm_layer(nf), nn.LeakyReLU(0.2, True) ] sequence.append(cur_model) sequence += [[nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)]] for n in range(len(sequence)): setattr(self, 'model'+str(n), nn.Sequential(*sequence[n])) def get_all_activations(self, x): res = [x] for n in range(self.n_layers + 2): model = getattr(self, 'model' + str(n)) res.append(model(res[-1])) return res[1:] def forward(self, x): act = self.get_all_activations(x) return act[-1], act[:-1] class MultidilatedNLayerDiscriminator(BaseDiscriminator): def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d, multidilation_kwargs={}): super().__init__() self.n_layers = n_layers kw = 4 padw = int(np.ceil((kw-1.0)/2)) sequence = [[nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]] nf = ndf for n in range(1, n_layers): nf_prev = nf nf = min(nf * 2, 512) cur_model = [] cur_model += [ MultidilatedConv(nf_prev, nf, kernel_size=kw, stride=2, padding=[2, 3], **multidilation_kwargs), norm_layer(nf), nn.LeakyReLU(0.2, True) ] sequence.append(cur_model) nf_prev = nf nf = min(nf * 2, 512) cur_model = [] cur_model += [ nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw), norm_layer(nf), nn.LeakyReLU(0.2, True) ] sequence.append(cur_model) sequence += [[nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)]] for n in range(len(sequence)): setattr(self, 'model'+str(n), nn.Sequential(*sequence[n])) def get_all_activations(self, x): res = [x] for n in range(self.n_layers + 2): model = getattr(self, 'model' + str(n)) res.append(model(res[-1])) return res[1:] def forward(self, x): act = self.get_all_activations(x) return act[-1], act[:-1] class NLayerDiscriminatorAsGen(NLayerDiscriminator): def forward(self, x): return super().forward(x)[0] ================================================ FILE: annotator/lama/saicinpainting/training/modules/spatial_transform.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from kornia.geometry.transform import rotate class LearnableSpatialTransformWrapper(nn.Module): def __init__(self, impl, pad_coef=0.5, angle_init_range=80, train_angle=True): super().__init__() self.impl = impl self.angle = torch.rand(1) * angle_init_range if train_angle: self.angle = nn.Parameter(self.angle, requires_grad=True) self.pad_coef = pad_coef def forward(self, x): if torch.is_tensor(x): return self.inverse_transform(self.impl(self.transform(x)), x) elif isinstance(x, tuple): x_trans = tuple(self.transform(elem) for elem in x) y_trans = self.impl(x_trans) return tuple(self.inverse_transform(elem, orig_x) for elem, orig_x in zip(y_trans, x)) else: raise ValueError(f'Unexpected input type {type(x)}') def transform(self, x): height, width = x.shape[2:] pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect') x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded)) return x_padded_rotated def inverse_transform(self, y_padded_rotated, orig_x): height, width = orig_x.shape[2:] pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) y_padded = rotate(y_padded_rotated, angle=-self.angle.to(y_padded_rotated)) y_height, y_width = y_padded.shape[2:] y = y_padded[:, :, pad_h : y_height - pad_h, pad_w : y_width - pad_w] return y if __name__ == '__main__': layer = LearnableSpatialTransformWrapper(nn.Identity()) x = torch.arange(2* 3 * 15 * 15).view(2, 3, 15, 15).float() y = layer(x) assert x.shape == y.shape assert torch.allclose(x[:, :, 1:, 1:][:, :, :-1, :-1], y[:, :, 1:, 1:][:, :, :-1, :-1]) print('all ok') ================================================ FILE: annotator/lama/saicinpainting/training/modules/squeeze_excitation.py ================================================ import torch.nn as nn class SELayer(nn.Module): def __init__(self, channel, reduction=16): super(SELayer, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential( nn.Linear(channel, channel // reduction, bias=False), nn.ReLU(inplace=True), nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid() ) def forward(self, x): b, c, _, _ = x.size() y = self.avg_pool(x).view(b, c) y = self.fc(y).view(b, c, 1, 1) res = x * y.expand_as(x) return res ================================================ FILE: annotator/lama/saicinpainting/training/trainers/__init__.py ================================================ import logging import torch from annotator.lama.saicinpainting.training.trainers.default import DefaultInpaintingTrainingModule def get_training_model_class(kind): if kind == 'default': return DefaultInpaintingTrainingModule raise ValueError(f'Unknown trainer module {kind}') def make_training_model(config): kind = config.training_model.kind kwargs = dict(config.training_model) kwargs.pop('kind') kwargs['use_ddp'] = config.trainer.kwargs.get('accelerator', None) == 'ddp' logging.info(f'Make training model {kind}') cls = get_training_model_class(kind) return cls(config, **kwargs) def load_checkpoint(train_config, path, map_location='cuda', strict=True): model = make_training_model(train_config).generator state = torch.load(path, map_location=map_location) model.load_state_dict(state, strict=strict) return model ================================================ FILE: annotator/lama/saicinpainting/training/trainers/base.py ================================================ import copy import logging from typing import Dict, Tuple import pandas as pd import pytorch_lightning as ptl import torch import torch.nn as nn import torch.nn.functional as F # from torch.utils.data import DistributedSampler # from annotator.lama.saicinpainting.evaluation import make_evaluator # from annotator.lama.saicinpainting.training.data.datasets import make_default_train_dataloader, make_default_val_dataloader # from annotator.lama.saicinpainting.training.losses.adversarial import make_discrim_loss # from annotator.lama.saicinpainting.training.losses.perceptual import PerceptualLoss, ResNetPL from annotator.lama.saicinpainting.training.modules import make_generator #, make_discriminator # from annotator.lama.saicinpainting.training.visualizers import make_visualizer from annotator.lama.saicinpainting.utils import add_prefix_to_keys, average_dicts, set_requires_grad, flatten_dict, \ get_has_ddp_rank LOGGER = logging.getLogger(__name__) def make_optimizer(parameters, kind='adamw', **kwargs): if kind == 'adam': optimizer_class = torch.optim.Adam elif kind == 'adamw': optimizer_class = torch.optim.AdamW else: raise ValueError(f'Unknown optimizer kind {kind}') return optimizer_class(parameters, **kwargs) def update_running_average(result: nn.Module, new_iterate_model: nn.Module, decay=0.999): with torch.no_grad(): res_params = dict(result.named_parameters()) new_params = dict(new_iterate_model.named_parameters()) for k in res_params.keys(): res_params[k].data.mul_(decay).add_(new_params[k].data, alpha=1 - decay) def make_multiscale_noise(base_tensor, scales=6, scale_mode='bilinear'): batch_size, _, height, width = base_tensor.shape cur_height, cur_width = height, width result = [] align_corners = False if scale_mode in ('bilinear', 'bicubic') else None for _ in range(scales): cur_sample = torch.randn(batch_size, 1, cur_height, cur_width, device=base_tensor.device) cur_sample_scaled = F.interpolate(cur_sample, size=(height, width), mode=scale_mode, align_corners=align_corners) result.append(cur_sample_scaled) cur_height //= 2 cur_width //= 2 return torch.cat(result, dim=1) class BaseInpaintingTrainingModule(ptl.LightningModule): def __init__(self, config, use_ddp, *args, predict_only=False, visualize_each_iters=100, average_generator=False, generator_avg_beta=0.999, average_generator_start_step=30000, average_generator_period=10, store_discr_outputs_for_vis=False, **kwargs): super().__init__(*args, **kwargs) LOGGER.info('BaseInpaintingTrainingModule init called') self.config = config self.generator = make_generator(config, **self.config.generator) self.use_ddp = use_ddp if not get_has_ddp_rank(): LOGGER.info(f'Generator\n{self.generator}') # if not predict_only: # self.save_hyperparameters(self.config) # self.discriminator = make_discriminator(**self.config.discriminator) # self.adversarial_loss = make_discrim_loss(**self.config.losses.adversarial) # self.visualizer = make_visualizer(**self.config.visualizer) # self.val_evaluator = make_evaluator(**self.config.evaluator) # self.test_evaluator = make_evaluator(**self.config.evaluator) # # if not get_has_ddp_rank(): # LOGGER.info(f'Discriminator\n{self.discriminator}') # # extra_val = self.config.data.get('extra_val', ()) # if extra_val: # self.extra_val_titles = list(extra_val) # self.extra_evaluators = nn.ModuleDict({k: make_evaluator(**self.config.evaluator) # for k in extra_val}) # else: # self.extra_evaluators = {} # # self.average_generator = average_generator # self.generator_avg_beta = generator_avg_beta # self.average_generator_start_step = average_generator_start_step # self.average_generator_period = average_generator_period # self.generator_average = None # self.last_generator_averaging_step = -1 # self.store_discr_outputs_for_vis = store_discr_outputs_for_vis # # if self.config.losses.get("l1", {"weight_known": 0})['weight_known'] > 0: # self.loss_l1 = nn.L1Loss(reduction='none') # # if self.config.losses.get("mse", {"weight": 0})['weight'] > 0: # self.loss_mse = nn.MSELoss(reduction='none') # # if self.config.losses.perceptual.weight > 0: # self.loss_pl = PerceptualLoss() # # # if self.config.losses.get("resnet_pl", {"weight": 0})['weight'] > 0: # # self.loss_resnet_pl = ResNetPL(**self.config.losses.resnet_pl) # # else: # # self.loss_resnet_pl = None # # self.loss_resnet_pl = None self.visualize_each_iters = visualize_each_iters LOGGER.info('BaseInpaintingTrainingModule init done') def configure_optimizers(self): discriminator_params = list(self.discriminator.parameters()) return [ dict(optimizer=make_optimizer(self.generator.parameters(), **self.config.optimizers.generator)), dict(optimizer=make_optimizer(discriminator_params, **self.config.optimizers.discriminator)), ] def train_dataloader(self): kwargs = dict(self.config.data.train) if self.use_ddp: kwargs['ddp_kwargs'] = dict(num_replicas=self.trainer.num_nodes * self.trainer.num_processes, rank=self.trainer.global_rank, shuffle=True) dataloader = make_default_train_dataloader(**self.config.data.train) return dataloader def val_dataloader(self): res = [make_default_val_dataloader(**self.config.data.val)] if self.config.data.visual_test is not None: res = res + [make_default_val_dataloader(**self.config.data.visual_test)] else: res = res + res extra_val = self.config.data.get('extra_val', ()) if extra_val: res += [make_default_val_dataloader(**extra_val[k]) for k in self.extra_val_titles] return res def training_step(self, batch, batch_idx, optimizer_idx=None): self._is_training_step = True return self._do_step(batch, batch_idx, mode='train', optimizer_idx=optimizer_idx) def validation_step(self, batch, batch_idx, dataloader_idx): extra_val_key = None if dataloader_idx == 0: mode = 'val' elif dataloader_idx == 1: mode = 'test' else: mode = 'extra_val' extra_val_key = self.extra_val_titles[dataloader_idx - 2] self._is_training_step = False return self._do_step(batch, batch_idx, mode=mode, extra_val_key=extra_val_key) def training_step_end(self, batch_parts_outputs): if self.training and self.average_generator \ and self.global_step >= self.average_generator_start_step \ and self.global_step >= self.last_generator_averaging_step + self.average_generator_period: if self.generator_average is None: self.generator_average = copy.deepcopy(self.generator) else: update_running_average(self.generator_average, self.generator, decay=self.generator_avg_beta) self.last_generator_averaging_step = self.global_step full_loss = (batch_parts_outputs['loss'].mean() if torch.is_tensor(batch_parts_outputs['loss']) # loss is not tensor when no discriminator used else torch.tensor(batch_parts_outputs['loss']).float().requires_grad_(True)) log_info = {k: v.mean() for k, v in batch_parts_outputs['log_info'].items()} self.log_dict(log_info, on_step=True, on_epoch=False) return full_loss def validation_epoch_end(self, outputs): outputs = [step_out for out_group in outputs for step_out in out_group] averaged_logs = average_dicts(step_out['log_info'] for step_out in outputs) self.log_dict({k: v.mean() for k, v in averaged_logs.items()}) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) # standard validation val_evaluator_states = [s['val_evaluator_state'] for s in outputs if 'val_evaluator_state' in s] val_evaluator_res = self.val_evaluator.evaluation_end(states=val_evaluator_states) val_evaluator_res_df = pd.DataFrame(val_evaluator_res).stack(1).unstack(0) val_evaluator_res_df.dropna(axis=1, how='all', inplace=True) LOGGER.info(f'Validation metrics after epoch #{self.current_epoch}, ' f'total {self.global_step} iterations:\n{val_evaluator_res_df}') for k, v in flatten_dict(val_evaluator_res).items(): self.log(f'val_{k}', v) # standard visual test test_evaluator_states = [s['test_evaluator_state'] for s in outputs if 'test_evaluator_state' in s] test_evaluator_res = self.test_evaluator.evaluation_end(states=test_evaluator_states) test_evaluator_res_df = pd.DataFrame(test_evaluator_res).stack(1).unstack(0) test_evaluator_res_df.dropna(axis=1, how='all', inplace=True) LOGGER.info(f'Test metrics after epoch #{self.current_epoch}, ' f'total {self.global_step} iterations:\n{test_evaluator_res_df}') for k, v in flatten_dict(test_evaluator_res).items(): self.log(f'test_{k}', v) # extra validations if self.extra_evaluators: for cur_eval_title, cur_evaluator in self.extra_evaluators.items(): cur_state_key = f'extra_val_{cur_eval_title}_evaluator_state' cur_states = [s[cur_state_key] for s in outputs if cur_state_key in s] cur_evaluator_res = cur_evaluator.evaluation_end(states=cur_states) cur_evaluator_res_df = pd.DataFrame(cur_evaluator_res).stack(1).unstack(0) cur_evaluator_res_df.dropna(axis=1, how='all', inplace=True) LOGGER.info(f'Extra val {cur_eval_title} metrics after epoch #{self.current_epoch}, ' f'total {self.global_step} iterations:\n{cur_evaluator_res_df}') for k, v in flatten_dict(cur_evaluator_res).items(): self.log(f'extra_val_{cur_eval_title}_{k}', v) def _do_step(self, batch, batch_idx, mode='train', optimizer_idx=None, extra_val_key=None): if optimizer_idx == 0: # step for generator set_requires_grad(self.generator, True) set_requires_grad(self.discriminator, False) elif optimizer_idx == 1: # step for discriminator set_requires_grad(self.generator, False) set_requires_grad(self.discriminator, True) batch = self(batch) total_loss = 0 metrics = {} if optimizer_idx is None or optimizer_idx == 0: # step for generator total_loss, metrics = self.generator_loss(batch) elif optimizer_idx is None or optimizer_idx == 1: # step for discriminator if self.config.losses.adversarial.weight > 0: total_loss, metrics = self.discriminator_loss(batch) if self.get_ddp_rank() in (None, 0) and (batch_idx % self.visualize_each_iters == 0 or mode == 'test'): if self.config.losses.adversarial.weight > 0: if self.store_discr_outputs_for_vis: with torch.no_grad(): self.store_discr_outputs(batch) vis_suffix = f'_{mode}' if mode == 'extra_val': vis_suffix += f'_{extra_val_key}' self.visualizer(self.current_epoch, batch_idx, batch, suffix=vis_suffix) metrics_prefix = f'{mode}_' if mode == 'extra_val': metrics_prefix += f'{extra_val_key}_' result = dict(loss=total_loss, log_info=add_prefix_to_keys(metrics, metrics_prefix)) if mode == 'val': result['val_evaluator_state'] = self.val_evaluator.process_batch(batch) elif mode == 'test': result['test_evaluator_state'] = self.test_evaluator.process_batch(batch) elif mode == 'extra_val': result[f'extra_val_{extra_val_key}_evaluator_state'] = self.extra_evaluators[extra_val_key].process_batch(batch) return result def get_current_generator(self, no_average=False): if not no_average and not self.training and self.average_generator and self.generator_average is not None: return self.generator_average return self.generator def forward(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys""" raise NotImplementedError() def generator_loss(self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: raise NotImplementedError() def discriminator_loss(self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: raise NotImplementedError() def store_discr_outputs(self, batch): out_size = batch['image'].shape[2:] discr_real_out, _ = self.discriminator(batch['image']) discr_fake_out, _ = self.discriminator(batch['predicted_image']) batch['discr_output_real'] = F.interpolate(discr_real_out, size=out_size, mode='nearest') batch['discr_output_fake'] = F.interpolate(discr_fake_out, size=out_size, mode='nearest') batch['discr_output_diff'] = batch['discr_output_real'] - batch['discr_output_fake'] def get_ddp_rank(self): return self.trainer.global_rank if (self.trainer.num_nodes * self.trainer.num_processes) > 1 else None ================================================ FILE: annotator/lama/saicinpainting/training/trainers/default.py ================================================ import logging import torch import torch.nn.functional as F from omegaconf import OmegaConf # from annotator.lama.saicinpainting.training.data.datasets import make_constant_area_crop_params from annotator.lama.saicinpainting.training.losses.distance_weighting import make_mask_distance_weighter from annotator.lama.saicinpainting.training.losses.feature_matching import feature_matching_loss, masked_l1_loss # from annotator.lama.saicinpainting.training.modules.fake_fakes import FakeFakesGenerator from annotator.lama.saicinpainting.training.trainers.base import BaseInpaintingTrainingModule, make_multiscale_noise from annotator.lama.saicinpainting.utils import add_prefix_to_keys, get_ramp LOGGER = logging.getLogger(__name__) def make_constant_area_crop_batch(batch, **kwargs): crop_y, crop_x, crop_height, crop_width = make_constant_area_crop_params(img_height=batch['image'].shape[2], img_width=batch['image'].shape[3], **kwargs) batch['image'] = batch['image'][:, :, crop_y : crop_y + crop_height, crop_x : crop_x + crop_width] batch['mask'] = batch['mask'][:, :, crop_y: crop_y + crop_height, crop_x: crop_x + crop_width] return batch class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule): def __init__(self, *args, concat_mask=True, rescale_scheduler_kwargs=None, image_to_discriminator='predicted_image', add_noise_kwargs=None, noise_fill_hole=False, const_area_crop_kwargs=None, distance_weighter_kwargs=None, distance_weighted_mask_for_discr=False, fake_fakes_proba=0, fake_fakes_generator_kwargs=None, **kwargs): super().__init__(*args, **kwargs) self.concat_mask = concat_mask self.rescale_size_getter = get_ramp(**rescale_scheduler_kwargs) if rescale_scheduler_kwargs is not None else None self.image_to_discriminator = image_to_discriminator self.add_noise_kwargs = add_noise_kwargs self.noise_fill_hole = noise_fill_hole self.const_area_crop_kwargs = const_area_crop_kwargs self.refine_mask_for_losses = make_mask_distance_weighter(**distance_weighter_kwargs) \ if distance_weighter_kwargs is not None else None self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr self.fake_fakes_proba = fake_fakes_proba if self.fake_fakes_proba > 1e-3: self.fake_fakes_gen = FakeFakesGenerator(**(fake_fakes_generator_kwargs or {})) def forward(self, batch): if self.training and self.rescale_size_getter is not None: cur_size = self.rescale_size_getter(self.global_step) batch['image'] = F.interpolate(batch['image'], size=cur_size, mode='bilinear', align_corners=False) batch['mask'] = F.interpolate(batch['mask'], size=cur_size, mode='nearest') if self.training and self.const_area_crop_kwargs is not None: batch = make_constant_area_crop_batch(batch, **self.const_area_crop_kwargs) img = batch['image'] mask = batch['mask'] masked_img = img * (1 - mask) if self.add_noise_kwargs is not None: noise = make_multiscale_noise(masked_img, **self.add_noise_kwargs) if self.noise_fill_hole: masked_img = masked_img + mask * noise[:, :masked_img.shape[1]] masked_img = torch.cat([masked_img, noise], dim=1) if self.concat_mask: masked_img = torch.cat([masked_img, mask], dim=1) batch['predicted_image'] = self.generator(masked_img) batch['inpainted'] = mask * batch['predicted_image'] + (1 - mask) * batch['image'] if self.fake_fakes_proba > 1e-3: if self.training and torch.rand(1).item() < self.fake_fakes_proba: batch['fake_fakes'], batch['fake_fakes_masks'] = self.fake_fakes_gen(img, mask) batch['use_fake_fakes'] = True else: batch['fake_fakes'] = torch.zeros_like(img) batch['fake_fakes_masks'] = torch.zeros_like(mask) batch['use_fake_fakes'] = False batch['mask_for_losses'] = self.refine_mask_for_losses(img, batch['predicted_image'], mask) \ if self.refine_mask_for_losses is not None and self.training \ else mask return batch def generator_loss(self, batch): img = batch['image'] predicted_img = batch[self.image_to_discriminator] original_mask = batch['mask'] supervised_mask = batch['mask_for_losses'] # L1 l1_value = masked_l1_loss(predicted_img, img, supervised_mask, self.config.losses.l1.weight_known, self.config.losses.l1.weight_missing) total_loss = l1_value metrics = dict(gen_l1=l1_value) # vgg-based perceptual loss if self.config.losses.perceptual.weight > 0: pl_value = self.loss_pl(predicted_img, img, mask=supervised_mask).sum() * self.config.losses.perceptual.weight total_loss = total_loss + pl_value metrics['gen_pl'] = pl_value # discriminator # adversarial_loss calls backward by itself mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask self.adversarial_loss.pre_generator_step(real_batch=img, fake_batch=predicted_img, generator=self.generator, discriminator=self.discriminator) discr_real_pred, discr_real_features = self.discriminator(img) discr_fake_pred, discr_fake_features = self.discriminator(predicted_img) adv_gen_loss, adv_metrics = self.adversarial_loss.generator_loss(real_batch=img, fake_batch=predicted_img, discr_real_pred=discr_real_pred, discr_fake_pred=discr_fake_pred, mask=mask_for_discr) total_loss = total_loss + adv_gen_loss metrics['gen_adv'] = adv_gen_loss metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) # feature matching if self.config.losses.feature_matching.weight > 0: need_mask_in_fm = OmegaConf.to_container(self.config.losses.feature_matching).get('pass_mask', False) mask_for_fm = supervised_mask if need_mask_in_fm else None fm_value = feature_matching_loss(discr_fake_features, discr_real_features, mask=mask_for_fm) * self.config.losses.feature_matching.weight total_loss = total_loss + fm_value metrics['gen_fm'] = fm_value if self.loss_resnet_pl is not None: resnet_pl_value = self.loss_resnet_pl(predicted_img, img) total_loss = total_loss + resnet_pl_value metrics['gen_resnet_pl'] = resnet_pl_value return total_loss, metrics def discriminator_loss(self, batch): total_loss = 0 metrics = {} predicted_img = batch[self.image_to_discriminator].detach() self.adversarial_loss.pre_discriminator_step(real_batch=batch['image'], fake_batch=predicted_img, generator=self.generator, discriminator=self.discriminator) discr_real_pred, discr_real_features = self.discriminator(batch['image']) discr_fake_pred, discr_fake_features = self.discriminator(predicted_img) adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss(real_batch=batch['image'], fake_batch=predicted_img, discr_real_pred=discr_real_pred, discr_fake_pred=discr_fake_pred, mask=batch['mask']) total_loss = total_loss + adv_discr_loss metrics['discr_adv'] = adv_discr_loss metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) if batch.get('use_fake_fakes', False): fake_fakes = batch['fake_fakes'] self.adversarial_loss.pre_discriminator_step(real_batch=batch['image'], fake_batch=fake_fakes, generator=self.generator, discriminator=self.discriminator) discr_fake_fakes_pred, _ = self.discriminator(fake_fakes) fake_fakes_adv_discr_loss, fake_fakes_adv_metrics = self.adversarial_loss.discriminator_loss( real_batch=batch['image'], fake_batch=fake_fakes, discr_real_pred=discr_real_pred, discr_fake_pred=discr_fake_fakes_pred, mask=batch['mask'] ) total_loss = total_loss + fake_fakes_adv_discr_loss metrics['discr_adv_fake_fakes'] = fake_fakes_adv_discr_loss metrics.update(add_prefix_to_keys(fake_fakes_adv_metrics, 'adv_')) return total_loss, metrics ================================================ FILE: annotator/lama/saicinpainting/training/visualizers/__init__.py ================================================ import logging from annotator.lama.saicinpainting.training.visualizers.directory import DirectoryVisualizer from annotator.lama.saicinpainting.training.visualizers.noop import NoopVisualizer def make_visualizer(kind, **kwargs): logging.info(f'Make visualizer {kind}') if kind == 'directory': return DirectoryVisualizer(**kwargs) if kind == 'noop': return NoopVisualizer() raise ValueError(f'Unknown visualizer kind {kind}') ================================================ FILE: annotator/lama/saicinpainting/training/visualizers/base.py ================================================ import abc from typing import Dict, List import numpy as np import torch from skimage import color from skimage.segmentation import mark_boundaries from . import colors COLORS, _ = colors.generate_colors(151) # 151 - max classes for semantic segmentation class BaseVisualizer: @abc.abstractmethod def __call__(self, epoch_i, batch_i, batch, suffix='', rank=None): """ Take a batch, make an image from it and visualize """ raise NotImplementedError() def visualize_mask_and_images(images_dict: Dict[str, np.ndarray], keys: List[str], last_without_mask=True, rescale_keys=None, mask_only_first=None, black_mask=False) -> np.ndarray: mask = images_dict['mask'] > 0.5 result = [] for i, k in enumerate(keys): img = images_dict[k] img = np.transpose(img, (1, 2, 0)) if rescale_keys is not None and k in rescale_keys: img = img - img.min() img /= img.max() + 1e-5 if len(img.shape) == 2: img = np.expand_dims(img, 2) if img.shape[2] == 1: img = np.repeat(img, 3, axis=2) elif (img.shape[2] > 3): img_classes = img.argmax(2) img = color.label2rgb(img_classes, colors=COLORS) if mask_only_first: need_mark_boundaries = i == 0 else: need_mark_boundaries = i < len(keys) - 1 or not last_without_mask if need_mark_boundaries: if black_mask: img = img * (1 - mask[0][..., None]) img = mark_boundaries(img, mask[0], color=(1., 0., 0.), outline_color=(1., 1., 1.), mode='thick') result.append(img) return np.concatenate(result, axis=1) def visualize_mask_and_images_batch(batch: Dict[str, torch.Tensor], keys: List[str], max_items=10, last_without_mask=True, rescale_keys=None) -> np.ndarray: batch = {k: tens.detach().cpu().numpy() for k, tens in batch.items() if k in keys or k == 'mask'} batch_size = next(iter(batch.values())).shape[0] items_to_vis = min(batch_size, max_items) result = [] for i in range(items_to_vis): cur_dct = {k: tens[i] for k, tens in batch.items()} result.append(visualize_mask_and_images(cur_dct, keys, last_without_mask=last_without_mask, rescale_keys=rescale_keys)) return np.concatenate(result, axis=0) ================================================ FILE: annotator/lama/saicinpainting/training/visualizers/colors.py ================================================ import random import colorsys import numpy as np import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt from matplotlib.colors import LinearSegmentedColormap def generate_colors(nlabels, type='bright', first_color_black=False, last_color_black=True, verbose=False): # https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib """ Creates a random colormap to be used together with matplotlib. Useful for segmentation tasks :param nlabels: Number of labels (size of colormap) :param type: 'bright' for strong colors, 'soft' for pastel colors :param first_color_black: Option to use first color as black, True or False :param last_color_black: Option to use last color as black, True or False :param verbose: Prints the number of labels and shows the colormap. True or False :return: colormap for matplotlib """ if type not in ('bright', 'soft'): print ('Please choose "bright" or "soft" for type') return if verbose: print('Number of labels: ' + str(nlabels)) # Generate color map for bright colors, based on hsv if type == 'bright': randHSVcolors = [(np.random.uniform(low=0.0, high=1), np.random.uniform(low=0.2, high=1), np.random.uniform(low=0.9, high=1)) for i in range(nlabels)] # Convert HSV list to RGB randRGBcolors = [] for HSVcolor in randHSVcolors: randRGBcolors.append(colorsys.hsv_to_rgb(HSVcolor[0], HSVcolor[1], HSVcolor[2])) if first_color_black: randRGBcolors[0] = [0, 0, 0] if last_color_black: randRGBcolors[-1] = [0, 0, 0] random_colormap = LinearSegmentedColormap.from_list('new_map', randRGBcolors, N=nlabels) # Generate soft pastel colors, by limiting the RGB spectrum if type == 'soft': low = 0.6 high = 0.95 randRGBcolors = [(np.random.uniform(low=low, high=high), np.random.uniform(low=low, high=high), np.random.uniform(low=low, high=high)) for i in range(nlabels)] if first_color_black: randRGBcolors[0] = [0, 0, 0] if last_color_black: randRGBcolors[-1] = [0, 0, 0] random_colormap = LinearSegmentedColormap.from_list('new_map', randRGBcolors, N=nlabels) # Display colorbar if verbose: from matplotlib import colors, colorbar from matplotlib import pyplot as plt fig, ax = plt.subplots(1, 1, figsize=(15, 0.5)) bounds = np.linspace(0, nlabels, nlabels + 1) norm = colors.BoundaryNorm(bounds, nlabels) cb = colorbar.ColorbarBase(ax, cmap=random_colormap, norm=norm, spacing='proportional', ticks=None, boundaries=bounds, format='%1i', orientation=u'horizontal') return randRGBcolors, random_colormap ================================================ FILE: annotator/lama/saicinpainting/training/visualizers/directory.py ================================================ import os import cv2 import numpy as np from annotator.lama.saicinpainting.training.visualizers.base import BaseVisualizer, visualize_mask_and_images_batch from annotator.lama.saicinpainting.utils import check_and_warn_input_range class DirectoryVisualizer(BaseVisualizer): DEFAULT_KEY_ORDER = 'image predicted_image inpainted'.split(' ') def __init__(self, outdir, key_order=DEFAULT_KEY_ORDER, max_items_in_batch=10, last_without_mask=True, rescale_keys=None): self.outdir = outdir os.makedirs(self.outdir, exist_ok=True) self.key_order = key_order self.max_items_in_batch = max_items_in_batch self.last_without_mask = last_without_mask self.rescale_keys = rescale_keys def __call__(self, epoch_i, batch_i, batch, suffix='', rank=None): check_and_warn_input_range(batch['image'], 0, 1, 'DirectoryVisualizer target image') vis_img = visualize_mask_and_images_batch(batch, self.key_order, max_items=self.max_items_in_batch, last_without_mask=self.last_without_mask, rescale_keys=self.rescale_keys) vis_img = np.clip(vis_img * 255, 0, 255).astype('uint8') curoutdir = os.path.join(self.outdir, f'epoch{epoch_i:04d}{suffix}') os.makedirs(curoutdir, exist_ok=True) rank_suffix = f'_r{rank}' if rank is not None else '' out_fname = os.path.join(curoutdir, f'batch{batch_i:07d}{rank_suffix}.jpg') vis_img = cv2.cvtColor(vis_img, cv2.COLOR_RGB2BGR) cv2.imwrite(out_fname, vis_img) ================================================ FILE: annotator/lama/saicinpainting/training/visualizers/noop.py ================================================ from annotator.lama.saicinpainting.training.visualizers.base import BaseVisualizer class NoopVisualizer(BaseVisualizer): def __init__(self, *args, **kwargs): pass def __call__(self, epoch_i, batch_i, batch, suffix='', rank=None): pass ================================================ FILE: annotator/lama/saicinpainting/utils.py ================================================ import bisect import functools import logging import numbers import os import signal import sys import traceback import warnings import torch from pytorch_lightning import seed_everything LOGGER = logging.getLogger(__name__) def check_and_warn_input_range(tensor, min_value, max_value, name): actual_min = tensor.min() actual_max = tensor.max() if actual_min < min_value or actual_max > max_value: warnings.warn(f"{name} must be in {min_value}..{max_value} range, but it ranges {actual_min}..{actual_max}") def sum_dict_with_prefix(target, cur_dict, prefix, default=0): for k, v in cur_dict.items(): target_key = prefix + k target[target_key] = target.get(target_key, default) + v def average_dicts(dict_list): result = {} norm = 1e-3 for dct in dict_list: sum_dict_with_prefix(result, dct, '') norm += 1 for k in list(result): result[k] /= norm return result def add_prefix_to_keys(dct, prefix): return {prefix + k: v for k, v in dct.items()} def set_requires_grad(module, value): for param in module.parameters(): param.requires_grad = value def flatten_dict(dct): result = {} for k, v in dct.items(): if isinstance(k, tuple): k = '_'.join(k) if isinstance(v, dict): for sub_k, sub_v in flatten_dict(v).items(): result[f'{k}_{sub_k}'] = sub_v else: result[k] = v return result class LinearRamp: def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): self.start_value = start_value self.end_value = end_value self.start_iter = start_iter self.end_iter = end_iter def __call__(self, i): if i < self.start_iter: return self.start_value if i >= self.end_iter: return self.end_value part = (i - self.start_iter) / (self.end_iter - self.start_iter) return self.start_value * (1 - part) + self.end_value * part class LadderRamp: def __init__(self, start_iters, values): self.start_iters = start_iters self.values = values assert len(values) == len(start_iters) + 1, (len(values), len(start_iters)) def __call__(self, i): segment_i = bisect.bisect_right(self.start_iters, i) return self.values[segment_i] def get_ramp(kind='ladder', **kwargs): if kind == 'linear': return LinearRamp(**kwargs) if kind == 'ladder': return LadderRamp(**kwargs) raise ValueError(f'Unexpected ramp kind: {kind}') def print_traceback_handler(sig, frame): LOGGER.warning(f'Received signal {sig}') bt = ''.join(traceback.format_stack()) LOGGER.warning(f'Requested stack trace:\n{bt}') def register_debug_signal_handlers(sig=None, handler=print_traceback_handler): LOGGER.warning(f'Setting signal {sig} handler {handler}') signal.signal(sig, handler) def handle_deterministic_config(config): seed = dict(config).get('seed', None) if seed is None: return False seed_everything(seed) return True def get_shape(t): if torch.is_tensor(t): return tuple(t.shape) elif isinstance(t, dict): return {n: get_shape(q) for n, q in t.items()} elif isinstance(t, (list, tuple)): return [get_shape(q) for q in t] elif isinstance(t, numbers.Number): return type(t) else: raise ValueError('unexpected type {}'.format(type(t))) def get_has_ddp_rank(): master_port = os.environ.get('MASTER_PORT', None) node_rank = os.environ.get('NODE_RANK', None) local_rank = os.environ.get('LOCAL_RANK', None) world_size = os.environ.get('WORLD_SIZE', None) has_rank = master_port is not None or node_rank is not None or local_rank is not None or world_size is not None return has_rank def handle_ddp_subprocess(): def main_decorator(main_func): @functools.wraps(main_func) def new_main(*args, **kwargs): # Trainer sets MASTER_PORT, NODE_RANK, LOCAL_RANK, WORLD_SIZE parent_cwd = os.environ.get('TRAINING_PARENT_WORK_DIR', None) has_parent = parent_cwd is not None has_rank = get_has_ddp_rank() assert has_parent == has_rank, f'Inconsistent state: has_parent={has_parent}, has_rank={has_rank}' if has_parent: # we are in the worker sys.argv.extend([ f'hydra.run.dir={parent_cwd}', # 'hydra/hydra_logging=disabled', # 'hydra/job_logging=disabled' ]) # do nothing if this is a top-level process # TRAINING_PARENT_WORK_DIR is set in handle_ddp_parent_process after hydra initialization main_func(*args, **kwargs) return new_main return main_decorator def handle_ddp_parent_process(): parent_cwd = os.environ.get('TRAINING_PARENT_WORK_DIR', None) has_parent = parent_cwd is not None has_rank = get_has_ddp_rank() assert has_parent == has_rank, f'Inconsistent state: has_parent={has_parent}, has_rank={has_rank}' if parent_cwd is None: os.environ['TRAINING_PARENT_WORK_DIR'] = os.getcwd() return has_parent ================================================ FILE: annotator/leres/__init__.py ================================================ import cv2 import numpy as np import torch import os from modules import devices, shared from annotator.annotator_path import models_path from torchvision.transforms import transforms # AdelaiDepth/LeReS imports from .leres.depthmap import estimateleres, estimateboost from .leres.multi_depth_model_woauxi import RelDepthModel from .leres.net_tools import strip_prefix_if_present # pix2pix/merge net imports from .pix2pix.options.test_options import TestOptions from .pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel base_model_path = os.path.join(models_path, "leres") old_modeldir = os.path.dirname(os.path.realpath(__file__)) remote_model_path_leres = "https://huggingface.co/lllyasviel/Annotators/resolve/main/res101.pth" remote_model_path_pix2pix = "https://huggingface.co/lllyasviel/Annotators/resolve/main/latest_net_G.pth" model = None pix2pixmodel = None def unload_leres_model(): global model, pix2pixmodel if model is not None: model = model.cpu() if pix2pixmodel is not None: pix2pixmodel = pix2pixmodel.unload_network('G') def apply_leres(input_image, thr_a, thr_b, boost=False): global model, pix2pixmodel if model is None: model_path = os.path.join(base_model_path, "res101.pth") old_model_path = os.path.join(old_modeldir, "res101.pth") if os.path.exists(old_model_path): model_path = old_model_path elif not os.path.exists(model_path): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path_leres, model_dir=base_model_path) if torch.cuda.is_available(): checkpoint = torch.load(model_path) else: checkpoint = torch.load(model_path, map_location=torch.device('cpu')) model = RelDepthModel(backbone='resnext101') model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True) del checkpoint if boost and pix2pixmodel is None: pix2pixmodel_path = os.path.join(base_model_path, "latest_net_G.pth") if not os.path.exists(pix2pixmodel_path): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path_pix2pix, model_dir=base_model_path) opt = TestOptions().parse() if not torch.cuda.is_available(): opt.gpu_ids = [] # cpu mode pix2pixmodel = Pix2Pix4DepthModel(opt) pix2pixmodel.save_dir = base_model_path pix2pixmodel.load_networks('latest') pix2pixmodel.eval() if devices.get_device_for("controlnet").type != 'mps': model = model.to(devices.get_device_for("controlnet")) assert input_image.ndim == 3 height, width, dim = input_image.shape with torch.no_grad(): if boost: depth = estimateboost(input_image, model, 0, pix2pixmodel, max(width, height)) else: depth = estimateleres(input_image, model, width, height) numbytes=2 depth_min = depth.min() depth_max = depth.max() max_val = (2**(8*numbytes))-1 # check output before normalizing and mapping to 16 bit if depth_max - depth_min > np.finfo("float").eps: out = max_val * (depth - depth_min) / (depth_max - depth_min) else: out = np.zeros(depth.shape) # single channel, 16 bit image depth_image = out.astype("uint16") # convert to uint8 depth_image = cv2.convertScaleAbs(depth_image, alpha=(255.0/65535.0)) # remove near if thr_a != 0: thr_a = ((thr_a/100)*255) depth_image = cv2.threshold(depth_image, thr_a, 255, cv2.THRESH_TOZERO)[1] # invert image depth_image = cv2.bitwise_not(depth_image) # remove bg if thr_b != 0: thr_b = ((thr_b/100)*255) depth_image = cv2.threshold(depth_image, thr_b, 255, cv2.THRESH_TOZERO)[1] return depth_image ================================================ FILE: annotator/leres/leres/LICENSE ================================================ https://github.com/thygate/stable-diffusion-webui-depthmap-script MIT License Copyright (c) 2023 Bob Thiry Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/leres/leres/Resnet.py ================================================ import torch.nn as nn import torch.nn as NN __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'] model_urls = { 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', } def conv3x3(in_planes, out_planes, stride=1): """3x3 convolution with padding""" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None): super(BasicBlock, self).__init__() self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = NN.BatchNorm2d(planes) #NN.BatchNorm2d self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = NN.BatchNorm2d(planes) #NN.BatchNorm2d self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None): super(Bottleneck, self).__init__() self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) self.bn1 = NN.BatchNorm2d(planes) #NN.BatchNorm2d self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) self.bn2 = NN.BatchNorm2d(planes) #NN.BatchNorm2d self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False) self.bn3 = NN.BatchNorm2d(planes * self.expansion) #NN.BatchNorm2d self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class ResNet(nn.Module): def __init__(self, block, layers, num_classes=1000): self.inplanes = 64 super(ResNet, self).__init__() self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = NN.BatchNorm2d(64) #NN.BatchNorm2d self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) #self.avgpool = nn.AvgPool2d(7, stride=1) #self.fc = nn.Linear(512 * block.expansion, num_classes) for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), NN.BatchNorm2d(planes * block.expansion), #NN.BatchNorm2d ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample)) self.inplanes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.inplanes, planes)) return nn.Sequential(*layers) def forward(self, x): features = [] x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) features.append(x) x = self.layer2(x) features.append(x) x = self.layer3(x) features.append(x) x = self.layer4(x) features.append(x) return features def resnet18(pretrained=True, **kwargs): """Constructs a ResNet-18 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) return model def resnet34(pretrained=True, **kwargs): """Constructs a ResNet-34 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) return model def resnet50(pretrained=True, **kwargs): """Constructs a ResNet-50 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) return model def resnet101(pretrained=True, **kwargs): """Constructs a ResNet-101 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) return model def resnet152(pretrained=True, **kwargs): """Constructs a ResNet-152 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) return model ================================================ FILE: annotator/leres/leres/Resnext_torch.py ================================================ #!/usr/bin/env python # coding: utf-8 import torch.nn as nn try: from urllib import urlretrieve except ImportError: from urllib.request import urlretrieve __all__ = ['resnext101_32x8d'] model_urls = { 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', } def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): """3x3 convolution with padding""" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation) def conv1x1(in_planes, out_planes, stride=1): """1x1 convolution""" return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(BasicBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d if groups != 1 or base_width != 64: raise ValueError('BasicBlock only supports groups=1 and base_width=64') if dilation > 1: raise NotImplementedError("Dilation > 1 not supported in BasicBlock") # Both self.conv1 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = norm_layer(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class Bottleneck(nn.Module): # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) # while original implementation places the stride at the first 1x1 convolution(self.conv1) # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. # This variant is also known as ResNet V1.5 and improves accuracy according to # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(Bottleneck, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d width = int(planes * (base_width / 64.)) * groups # Both self.conv2 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv1x1(inplanes, width) self.bn1 = norm_layer(width) self.conv2 = conv3x3(width, width, stride, groups, dilation) self.bn2 = norm_layer(width) self.conv3 = conv1x1(width, planes * self.expansion) self.bn3 = norm_layer(planes * self.expansion) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class ResNet(nn.Module): def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, groups=1, width_per_group=64, replace_stride_with_dilation=None, norm_layer=None): super(ResNet, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d self._norm_layer = norm_layer self.inplanes = 64 self.dilation = 1 if replace_stride_with_dilation is None: # each element in the tuple indicates if we should replace # the 2x2 stride with a dilated convolution instead replace_stride_with_dilation = [False, False, False] if len(replace_stride_with_dilation) != 3: raise ValueError("replace_stride_with_dilation should be None " "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) self.groups = groups self.base_width = width_per_group self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = norm_layer(self.inplanes) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]) #self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) #self.fc = nn.Linear(512 * block.expansion, num_classes) for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) # Zero-initialize the last BN in each residual branch, # so that the residual branch starts with zeros, and each residual block behaves like an identity. # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 if zero_init_residual: for m in self.modules(): if isinstance(m, Bottleneck): nn.init.constant_(m.bn3.weight, 0) elif isinstance(m, BasicBlock): nn.init.constant_(m.bn2.weight, 0) def _make_layer(self, block, planes, blocks, stride=1, dilate=False): norm_layer = self._norm_layer downsample = None previous_dilation = self.dilation if dilate: self.dilation *= stride stride = 1 if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( conv1x1(self.inplanes, planes * block.expansion, stride), norm_layer(planes * block.expansion), ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) self.inplanes = planes * block.expansion for _ in range(1, blocks): layers.append(block(self.inplanes, planes, groups=self.groups, base_width=self.base_width, dilation=self.dilation, norm_layer=norm_layer)) return nn.Sequential(*layers) def _forward_impl(self, x): # See note [TorchScript super()] features = [] x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) features.append(x) x = self.layer2(x) features.append(x) x = self.layer3(x) features.append(x) x = self.layer4(x) features.append(x) #x = self.avgpool(x) #x = torch.flatten(x, 1) #x = self.fc(x) return features def forward(self, x): return self._forward_impl(x) def resnext101_32x8d(pretrained=True, **kwargs): """Constructs a ResNet-152 model. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet """ kwargs['groups'] = 32 kwargs['width_per_group'] = 8 model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) return model ================================================ FILE: annotator/leres/leres/depthmap.py ================================================ # Author: thygate # https://github.com/thygate/stable-diffusion-webui-depthmap-script from modules import devices from modules.shared import opts from torchvision.transforms import transforms from operator import getitem import torch, gc import cv2 import numpy as np import skimage.measure whole_size_threshold = 1600 # R_max from the paper pix2pixsize = 1024 def scale_torch(img): """ Scale the image and output it in torch.tensor. :param img: input rgb is in shape [H, W, C], input depth/disp is in shape [H, W] :param scale: the scale factor. float :return: img. [C, H, W] """ if len(img.shape) == 2: img = img[np.newaxis, :, :] if img.shape[2] == 3: transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406) , (0.229, 0.224, 0.225) )]) img = transform(img.astype(np.float32)) else: img = img.astype(np.float32) img = torch.from_numpy(img) return img def estimateleres(img, model, w, h): # leres transform input rgb_c = img[:, :, ::-1].copy() A_resize = cv2.resize(rgb_c, (w, h)) img_torch = scale_torch(A_resize)[None, :, :, :] # compute with torch.no_grad(): img_torch = img_torch.to(devices.get_device_for("controlnet")) prediction = model.depth_model(img_torch) prediction = prediction.squeeze().cpu().numpy() prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) return prediction def generatemask(size): # Generates a Guassian mask mask = np.zeros(size, dtype=np.float32) sigma = int(size[0]/16) k_size = int(2 * np.ceil(2 * int(size[0]/16)) + 1) mask[int(0.15*size[0]):size[0] - int(0.15*size[0]), int(0.15*size[1]): size[1] - int(0.15*size[1])] = 1 mask = cv2.GaussianBlur(mask, (int(k_size), int(k_size)), sigma) mask = (mask - mask.min()) / (mask.max() - mask.min()) mask = mask.astype(np.float32) return mask def resizewithpool(img, size): i_size = img.shape[0] n = int(np.floor(i_size/size)) out = skimage.measure.block_reduce(img, (n, n), np.max) return out def rgb2gray(rgb): # Converts rgb to gray return np.dot(rgb[..., :3], [0.2989, 0.5870, 0.1140]) def calculateprocessingres(img, basesize, confidence=0.1, scale_threshold=3, whole_size_threshold=3000): # Returns the R_x resolution described in section 5 of the main paper. # Parameters: # img :input rgb image # basesize : size the dilation kernel which is equal to receptive field of the network. # confidence: value of x in R_x; allowed percentage of pixels that are not getting any contextual cue. # scale_threshold: maximum allowed upscaling on the input image ; it has been set to 3. # whole_size_threshold: maximum allowed resolution. (R_max from section 6 of the main paper) # Returns: # outputsize_scale*speed_scale :The computed R_x resolution # patch_scale: K parameter from section 6 of the paper # speed scale parameter is to process every image in a smaller size to accelerate the R_x resolution search speed_scale = 32 image_dim = int(min(img.shape[0:2])) gray = rgb2gray(img) grad = np.abs(cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)) + np.abs(cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)) grad = cv2.resize(grad, (image_dim, image_dim), cv2.INTER_AREA) # thresholding the gradient map to generate the edge-map as a proxy of the contextual cues m = grad.min() M = grad.max() middle = m + (0.4 * (M - m)) grad[grad < middle] = 0 grad[grad >= middle] = 1 # dilation kernel with size of the receptive field kernel = np.ones((int(basesize/speed_scale), int(basesize/speed_scale)), float) # dilation kernel with size of the a quarter of receptive field used to compute k # as described in section 6 of main paper kernel2 = np.ones((int(basesize / (4*speed_scale)), int(basesize / (4*speed_scale))), float) # Output resolution limit set by the whole_size_threshold and scale_threshold. threshold = min(whole_size_threshold, scale_threshold * max(img.shape[:2])) outputsize_scale = basesize / speed_scale for p_size in range(int(basesize/speed_scale), int(threshold/speed_scale), int(basesize / (2*speed_scale))): grad_resized = resizewithpool(grad, p_size) grad_resized = cv2.resize(grad_resized, (p_size, p_size), cv2.INTER_NEAREST) grad_resized[grad_resized >= 0.5] = 1 grad_resized[grad_resized < 0.5] = 0 dilated = cv2.dilate(grad_resized, kernel, iterations=1) meanvalue = (1-dilated).mean() if meanvalue > confidence: break else: outputsize_scale = p_size grad_region = cv2.dilate(grad_resized, kernel2, iterations=1) patch_scale = grad_region.mean() return int(outputsize_scale*speed_scale), patch_scale # Generate a double-input depth estimation def doubleestimate(img, size1, size2, pix2pixsize, model, net_type, pix2pixmodel): # Generate the low resolution estimation estimate1 = singleestimate(img, size1, model, net_type) # Resize to the inference size of merge network. estimate1 = cv2.resize(estimate1, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) # Generate the high resolution estimation estimate2 = singleestimate(img, size2, model, net_type) # Resize to the inference size of merge network. estimate2 = cv2.resize(estimate2, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) # Inference on the merge model pix2pixmodel.set_input(estimate1, estimate2) pix2pixmodel.test() visuals = pix2pixmodel.get_current_visuals() prediction_mapped = visuals['fake_B'] prediction_mapped = (prediction_mapped+1)/2 prediction_mapped = (prediction_mapped - torch.min(prediction_mapped)) / ( torch.max(prediction_mapped) - torch.min(prediction_mapped)) prediction_mapped = prediction_mapped.squeeze().cpu().numpy() return prediction_mapped # Generate a single-input depth estimation def singleestimate(img, msize, model, net_type): # if net_type == 0: return estimateleres(img, model, msize, msize) # else: # return estimatemidasBoost(img, model, msize, msize) def applyGridpatch(blsize, stride, img, box): # Extract a simple grid patch. counter1 = 0 patch_bound_list = {} for k in range(blsize, img.shape[1] - blsize, stride): for j in range(blsize, img.shape[0] - blsize, stride): patch_bound_list[str(counter1)] = {} patchbounds = [j - blsize, k - blsize, j - blsize + 2 * blsize, k - blsize + 2 * blsize] patch_bound = [box[0] + patchbounds[1], box[1] + patchbounds[0], patchbounds[3] - patchbounds[1], patchbounds[2] - patchbounds[0]] patch_bound_list[str(counter1)]['rect'] = patch_bound patch_bound_list[str(counter1)]['size'] = patch_bound[2] counter1 = counter1 + 1 return patch_bound_list # Generating local patches to perform the local refinement described in section 6 of the main paper. def generatepatchs(img, base_size): # Compute the gradients as a proxy of the contextual cues. img_gray = rgb2gray(img) whole_grad = np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)) +\ np.abs(cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)) threshold = whole_grad[whole_grad > 0].mean() whole_grad[whole_grad < threshold] = 0 # We use the integral image to speed-up the evaluation of the amount of gradients for each patch. gf = whole_grad.sum()/len(whole_grad.reshape(-1)) grad_integral_image = cv2.integral(whole_grad) # Variables are selected such that the initial patch size would be the receptive field size # and the stride is set to 1/3 of the receptive field size. blsize = int(round(base_size/2)) stride = int(round(blsize*0.75)) # Get initial Grid patch_bound_list = applyGridpatch(blsize, stride, img, [0, 0, 0, 0]) # Refine initial Grid of patches by discarding the flat (in terms of gradients of the rgb image) ones. Refine # each patch size to ensure that there will be enough depth cues for the network to generate a consistent depth map. print("Selecting patches ...") patch_bound_list = adaptiveselection(grad_integral_image, patch_bound_list, gf) # Sort the patch list to make sure the merging operation will be done with the correct order: starting from biggest # patch patchset = sorted(patch_bound_list.items(), key=lambda x: getitem(x[1], 'size'), reverse=True) return patchset def getGF_fromintegral(integralimage, rect): # Computes the gradient density of a given patch from the gradient integral image. x1 = rect[1] x2 = rect[1]+rect[3] y1 = rect[0] y2 = rect[0]+rect[2] value = integralimage[x2, y2]-integralimage[x1, y2]-integralimage[x2, y1]+integralimage[x1, y1] return value # Adaptively select patches def adaptiveselection(integral_grad, patch_bound_list, gf): patchlist = {} count = 0 height, width = integral_grad.shape search_step = int(32/factor) # Go through all patches for c in range(len(patch_bound_list)): # Get patch bbox = patch_bound_list[str(c)]['rect'] # Compute the amount of gradients present in the patch from the integral image. cgf = getGF_fromintegral(integral_grad, bbox)/(bbox[2]*bbox[3]) # Check if patching is beneficial by comparing the gradient density of the patch to # the gradient density of the whole image if cgf >= gf: bbox_test = bbox.copy() patchlist[str(count)] = {} # Enlarge each patch until the gradient density of the patch is equal # to the whole image gradient density while True: bbox_test[0] = bbox_test[0] - int(search_step/2) bbox_test[1] = bbox_test[1] - int(search_step/2) bbox_test[2] = bbox_test[2] + search_step bbox_test[3] = bbox_test[3] + search_step # Check if we are still within the image if bbox_test[0] < 0 or bbox_test[1] < 0 or bbox_test[1] + bbox_test[3] >= height \ or bbox_test[0] + bbox_test[2] >= width: break # Compare gradient density cgf = getGF_fromintegral(integral_grad, bbox_test)/(bbox_test[2]*bbox_test[3]) if cgf < gf: break bbox = bbox_test.copy() # Add patch to selected patches patchlist[str(count)]['rect'] = bbox patchlist[str(count)]['size'] = bbox[2] count = count + 1 # Return selected patches return patchlist def impatch(image, rect): # Extract the given patch pixels from a given image. w1 = rect[0] h1 = rect[1] w2 = w1 + rect[2] h2 = h1 + rect[3] image_patch = image[h1:h2, w1:w2] return image_patch class ImageandPatchs: def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1): self.root_dir = root_dir self.patchsinfo = patchsinfo self.name = name self.patchs = patchsinfo self.scale = scale self.rgb_image = cv2.resize(rgb_image, (round(rgb_image.shape[1]*scale), round(rgb_image.shape[0]*scale)), interpolation=cv2.INTER_CUBIC) self.do_have_estimate = False self.estimation_updated_image = None self.estimation_base_image = None def __len__(self): return len(self.patchs) def set_base_estimate(self, est): self.estimation_base_image = est if self.estimation_updated_image is not None: self.do_have_estimate = True def set_updated_estimate(self, est): self.estimation_updated_image = est if self.estimation_base_image is not None: self.do_have_estimate = True def __getitem__(self, index): patch_id = int(self.patchs[index][0]) rect = np.array(self.patchs[index][1]['rect']) msize = self.patchs[index][1]['size'] ## applying scale to rect: rect = np.round(rect * self.scale) rect = rect.astype('int') msize = round(msize * self.scale) patch_rgb = impatch(self.rgb_image, rect) if self.do_have_estimate: patch_whole_estimate_base = impatch(self.estimation_base_image, rect) patch_whole_estimate_updated = impatch(self.estimation_updated_image, rect) return {'patch_rgb': patch_rgb, 'patch_whole_estimate_base': patch_whole_estimate_base, 'patch_whole_estimate_updated': patch_whole_estimate_updated, 'rect': rect, 'size': msize, 'id': patch_id} else: return {'patch_rgb': patch_rgb, 'rect': rect, 'size': msize, 'id': patch_id} def print_options(self, opt): """Print and save options It will print both current options and default values(if different). It will save options into a text file / [checkpoints_dir] / opt.txt """ message = '' message += '----------------- Options ---------------\n' for k, v in sorted(vars(opt).items()): comment = '' default = self.parser.get_default(k) if v != default: comment = '\t[default: %s]' % str(default) message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment) message += '----------------- End -------------------' print(message) # save to the disk """ expr_dir = os.path.join(opt.checkpoints_dir, opt.name) util.mkdirs(expr_dir) file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase)) with open(file_name, 'wt') as opt_file: opt_file.write(message) opt_file.write('\n') """ def parse(self): """Parse our options, create checkpoints directory suffix, and set up gpu device.""" opt = self.gather_options() opt.isTrain = self.isTrain # train or test # process opt.suffix if opt.suffix: suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else '' opt.name = opt.name + suffix #self.print_options(opt) # set gpu ids str_ids = opt.gpu_ids.split(',') opt.gpu_ids = [] for str_id in str_ids: id = int(str_id) if id >= 0: opt.gpu_ids.append(id) #if len(opt.gpu_ids) > 0: # torch.cuda.set_device(opt.gpu_ids[0]) self.opt = opt return self.opt def estimateboost(img, model, model_type, pix2pixmodel, max_res=512): global whole_size_threshold # get settings if hasattr(opts, 'depthmap_script_boost_rmax'): whole_size_threshold = opts.depthmap_script_boost_rmax if model_type == 0: #leres net_receptive_field_size = 448 patch_netsize = 2 * net_receptive_field_size elif model_type == 1: #dpt_beit_large_512 net_receptive_field_size = 512 patch_netsize = 2 * net_receptive_field_size else: #other midas net_receptive_field_size = 384 patch_netsize = 2 * net_receptive_field_size gc.collect() devices.torch_gc() # Generate mask used to smoothly blend the local pathc estimations to the base estimate. # It is arbitrarily large to avoid artifacts during rescaling for each crop. mask_org = generatemask((3000, 3000)) mask = mask_org.copy() # Value x of R_x defined in the section 5 of the main paper. r_threshold_value = 0.2 #if R0: # r_threshold_value = 0 input_resolution = img.shape scale_threshold = 3 # Allows up-scaling with a scale up to 3 # Find the best input resolution R-x. The resolution search described in section 5-double estimation of the main paper and section B of the # supplementary material. whole_image_optimal_size, patch_scale = calculateprocessingres(img, net_receptive_field_size, r_threshold_value, scale_threshold, whole_size_threshold) # print('wholeImage being processed in :', whole_image_optimal_size) # Generate the base estimate using the double estimation. whole_estimate = doubleestimate(img, net_receptive_field_size, whole_image_optimal_size, pix2pixsize, model, model_type, pix2pixmodel) # Compute the multiplier described in section 6 of the main paper to make sure our initial patch can select # small high-density regions of the image. global factor factor = max(min(1, 4 * patch_scale * whole_image_optimal_size / whole_size_threshold), 0.2) # print('Adjust factor is:', 1/factor) # Check if Local boosting is beneficial. if max_res < whole_image_optimal_size: # print("No Local boosting. Specified Max Res is smaller than R20, Returning doubleestimate result") return cv2.resize(whole_estimate, (input_resolution[1], input_resolution[0]), interpolation=cv2.INTER_CUBIC) # Compute the default target resolution. if img.shape[0] > img.shape[1]: a = 2 * whole_image_optimal_size b = round(2 * whole_image_optimal_size * img.shape[1] / img.shape[0]) else: a = round(2 * whole_image_optimal_size * img.shape[0] / img.shape[1]) b = 2 * whole_image_optimal_size b = int(round(b / factor)) a = int(round(a / factor)) """ # recompute a, b and saturate to max res. if max(a,b) > max_res: print('Default Res is higher than max-res: Reducing final resolution') if img.shape[0] > img.shape[1]: a = max_res b = round(max_res * img.shape[1] / img.shape[0]) else: a = round(max_res * img.shape[0] / img.shape[1]) b = max_res b = int(b) a = int(a) """ img = cv2.resize(img, (b, a), interpolation=cv2.INTER_CUBIC) # Extract selected patches for local refinement base_size = net_receptive_field_size * 2 patchset = generatepatchs(img, base_size) # print('Target resolution: ', img.shape) # Computing a scale in case user prompted to generate the results as the same resolution of the input. # Notice that our method output resolution is independent of the input resolution and this parameter will only # enable a scaling operation during the local patch merge implementation to generate results with the same resolution # as the input. """ if output_resolution == 1: mergein_scale = input_resolution[0] / img.shape[0] print('Dynamicly change merged-in resolution; scale:', mergein_scale) else: mergein_scale = 1 """ # always rescale to input res for now mergein_scale = input_resolution[0] / img.shape[0] imageandpatchs = ImageandPatchs('', '', patchset, img, mergein_scale) whole_estimate_resized = cv2.resize(whole_estimate, (round(img.shape[1]*mergein_scale), round(img.shape[0]*mergein_scale)), interpolation=cv2.INTER_CUBIC) imageandpatchs.set_base_estimate(whole_estimate_resized.copy()) imageandpatchs.set_updated_estimate(whole_estimate_resized.copy()) print('Resulting depthmap resolution will be :', whole_estimate_resized.shape[:2]) print('Patches to process: '+str(len(imageandpatchs))) # Enumerate through all patches, generate their estimations and refining the base estimate. for patch_ind in range(len(imageandpatchs)): # Get patch information patch = imageandpatchs[patch_ind] # patch object patch_rgb = patch['patch_rgb'] # rgb patch patch_whole_estimate_base = patch['patch_whole_estimate_base'] # corresponding patch from base rect = patch['rect'] # patch size and location patch_id = patch['id'] # patch ID org_size = patch_whole_estimate_base.shape # the original size from the unscaled input print('\t Processing patch', patch_ind, '/', len(imageandpatchs)-1, '|', rect) # We apply double estimation for patches. The high resolution value is fixed to twice the receptive # field size of the network for patches to accelerate the process. patch_estimation = doubleestimate(patch_rgb, net_receptive_field_size, patch_netsize, pix2pixsize, model, model_type, pix2pixmodel) patch_estimation = cv2.resize(patch_estimation, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) patch_whole_estimate_base = cv2.resize(patch_whole_estimate_base, (pix2pixsize, pix2pixsize), interpolation=cv2.INTER_CUBIC) # Merging the patch estimation into the base estimate using our merge network: # We feed the patch estimation and the same region from the updated base estimate to the merge network # to generate the target estimate for the corresponding region. pix2pixmodel.set_input(patch_whole_estimate_base, patch_estimation) # Run merging network pix2pixmodel.test() visuals = pix2pixmodel.get_current_visuals() prediction_mapped = visuals['fake_B'] prediction_mapped = (prediction_mapped+1)/2 prediction_mapped = prediction_mapped.squeeze().cpu().numpy() mapped = prediction_mapped # We use a simple linear polynomial to make sure the result of the merge network would match the values of # base estimate p_coef = np.polyfit(mapped.reshape(-1), patch_whole_estimate_base.reshape(-1), deg=1) merged = np.polyval(p_coef, mapped.reshape(-1)).reshape(mapped.shape) merged = cv2.resize(merged, (org_size[1],org_size[0]), interpolation=cv2.INTER_CUBIC) # Get patch size and location w1 = rect[0] h1 = rect[1] w2 = w1 + rect[2] h2 = h1 + rect[3] # To speed up the implementation, we only generate the Gaussian mask once with a sufficiently large size # and resize it to our needed size while merging the patches. if mask.shape != org_size: mask = cv2.resize(mask_org, (org_size[1],org_size[0]), interpolation=cv2.INTER_LINEAR) tobemergedto = imageandpatchs.estimation_updated_image # Update the whole estimation: # We use a simple Gaussian mask to blend the merged patch region with the base estimate to ensure seamless # blending at the boundaries of the patch region. tobemergedto[h1:h2, w1:w2] = np.multiply(tobemergedto[h1:h2, w1:w2], 1 - mask) + np.multiply(merged, mask) imageandpatchs.set_updated_estimate(tobemergedto) # output return cv2.resize(imageandpatchs.estimation_updated_image, (input_resolution[1], input_resolution[0]), interpolation=cv2.INTER_CUBIC) ================================================ FILE: annotator/leres/leres/multi_depth_model_woauxi.py ================================================ from . import network_auxi as network from .net_tools import get_func import torch import torch.nn as nn from modules import devices class RelDepthModel(nn.Module): def __init__(self, backbone='resnet50'): super(RelDepthModel, self).__init__() if backbone == 'resnet50': encoder = 'resnet50_stride32' elif backbone == 'resnext101': encoder = 'resnext101_stride32x8d' self.depth_model = DepthModel(encoder) def inference(self, rgb): with torch.no_grad(): input = rgb.to(self.depth_model.device) depth = self.depth_model(input) #pred_depth_out = depth - depth.min() + 0.01 return depth #pred_depth_out class DepthModel(nn.Module): def __init__(self, encoder): super(DepthModel, self).__init__() backbone = network.__name__.split('.')[-1] + '.' + encoder self.encoder_modules = get_func(backbone)() self.decoder_modules = network.Decoder() def forward(self, x): lateral_out = self.encoder_modules(x) out_logit = self.decoder_modules(lateral_out) return out_logit ================================================ FILE: annotator/leres/leres/net_tools.py ================================================ import importlib import torch import os from collections import OrderedDict def get_func(func_name): """Helper to return a function object by name. func_name must identify a function in this module or the path to a function relative to the base 'modeling' module. """ if func_name == '': return None try: parts = func_name.split('.') # Refers to a function in this module if len(parts) == 1: return globals()[parts[0]] # Otherwise, assume we're referencing a module under modeling module_name = 'annotator.leres.leres.' + '.'.join(parts[:-1]) module = importlib.import_module(module_name) return getattr(module, parts[-1]) except Exception: print('Failed to f1ind function: %s', func_name) raise def load_ckpt(args, depth_model, shift_model, focal_model): """ Load checkpoint. """ if os.path.isfile(args.load_ckpt): print("loading checkpoint %s" % args.load_ckpt) checkpoint = torch.load(args.load_ckpt) if shift_model is not None: shift_model.load_state_dict(strip_prefix_if_present(checkpoint['shift_model'], 'module.'), strict=True) if focal_model is not None: focal_model.load_state_dict(strip_prefix_if_present(checkpoint['focal_model'], 'module.'), strict=True) depth_model.load_state_dict(strip_prefix_if_present(checkpoint['depth_model'], "module."), strict=True) del checkpoint if torch.cuda.is_available(): torch.cuda.empty_cache() def strip_prefix_if_present(state_dict, prefix): keys = sorted(state_dict.keys()) if not all(key.startswith(prefix) for key in keys): return state_dict stripped_state_dict = OrderedDict() for key, value in state_dict.items(): stripped_state_dict[key.replace(prefix, "")] = value return stripped_state_dict ================================================ FILE: annotator/leres/leres/network_auxi.py ================================================ import torch import torch.nn as nn import torch.nn.init as init from . import Resnet, Resnext_torch def resnet50_stride32(): return DepthNet(backbone='resnet', depth=50, upfactors=[2, 2, 2, 2]) def resnext101_stride32x8d(): return DepthNet(backbone='resnext101_32x8d', depth=101, upfactors=[2, 2, 2, 2]) class Decoder(nn.Module): def __init__(self): super(Decoder, self).__init__() self.inchannels = [256, 512, 1024, 2048] self.midchannels = [256, 256, 256, 512] self.upfactors = [2,2,2,2] self.outchannels = 1 self.conv = FTB(inchannels=self.inchannels[3], midchannels=self.midchannels[3]) self.conv1 = nn.Conv2d(in_channels=self.midchannels[3], out_channels=self.midchannels[2], kernel_size=3, padding=1, stride=1, bias=True) self.upsample = nn.Upsample(scale_factor=self.upfactors[3], mode='bilinear', align_corners=True) self.ffm2 = FFM(inchannels=self.inchannels[2], midchannels=self.midchannels[2], outchannels = self.midchannels[2], upfactor=self.upfactors[2]) self.ffm1 = FFM(inchannels=self.inchannels[1], midchannels=self.midchannels[1], outchannels = self.midchannels[1], upfactor=self.upfactors[1]) self.ffm0 = FFM(inchannels=self.inchannels[0], midchannels=self.midchannels[0], outchannels = self.midchannels[0], upfactor=self.upfactors[0]) self.outconv = AO(inchannels=self.midchannels[0], outchannels=self.outchannels, upfactor=2) self._init_params() def _init_params(self): for m in self.modules(): if isinstance(m, nn.Conv2d): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.ConvTranspose2d): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): #NN.BatchNorm2d init.constant_(m.weight, 1) init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) def forward(self, features): x_32x = self.conv(features[3]) # 1/32 x_32 = self.conv1(x_32x) x_16 = self.upsample(x_32) # 1/16 x_8 = self.ffm2(features[2], x_16) # 1/8 x_4 = self.ffm1(features[1], x_8) # 1/4 x_2 = self.ffm0(features[0], x_4) # 1/2 #----------------------------------------- x = self.outconv(x_2) # original size return x class DepthNet(nn.Module): __factory = { 18: Resnet.resnet18, 34: Resnet.resnet34, 50: Resnet.resnet50, 101: Resnet.resnet101, 152: Resnet.resnet152 } def __init__(self, backbone='resnet', depth=50, upfactors=[2, 2, 2, 2]): super(DepthNet, self).__init__() self.backbone = backbone self.depth = depth self.pretrained = False self.inchannels = [256, 512, 1024, 2048] self.midchannels = [256, 256, 256, 512] self.upfactors = upfactors self.outchannels = 1 # Build model if self.backbone == 'resnet': if self.depth not in DepthNet.__factory: raise KeyError("Unsupported depth:", self.depth) self.encoder = DepthNet.__factory[depth](pretrained=self.pretrained) elif self.backbone == 'resnext101_32x8d': self.encoder = Resnext_torch.resnext101_32x8d(pretrained=self.pretrained) else: self.encoder = Resnext_torch.resnext101(pretrained=self.pretrained) def forward(self, x): x = self.encoder(x) # 1/32, 1/16, 1/8, 1/4 return x class FTB(nn.Module): def __init__(self, inchannels, midchannels=512): super(FTB, self).__init__() self.in1 = inchannels self.mid = midchannels self.conv1 = nn.Conv2d(in_channels=self.in1, out_channels=self.mid, kernel_size=3, padding=1, stride=1, bias=True) # NN.BatchNorm2d self.conv_branch = nn.Sequential(nn.ReLU(inplace=True), \ nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3, padding=1, stride=1, bias=True), \ nn.BatchNorm2d(num_features=self.mid), \ nn.ReLU(inplace=True), \ nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3, padding=1, stride=1, bias=True)) self.relu = nn.ReLU(inplace=True) self.init_params() def forward(self, x): x = self.conv1(x) x = x + self.conv_branch(x) x = self.relu(x) return x def init_params(self): for m in self.modules(): if isinstance(m, nn.Conv2d): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.ConvTranspose2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d init.constant_(m.weight, 1) init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) class ATA(nn.Module): def __init__(self, inchannels, reduction=8): super(ATA, self).__init__() self.inchannels = inchannels self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential(nn.Linear(self.inchannels * 2, self.inchannels // reduction), nn.ReLU(inplace=True), nn.Linear(self.inchannels // reduction, self.inchannels), nn.Sigmoid()) self.init_params() def forward(self, low_x, high_x): n, c, _, _ = low_x.size() x = torch.cat([low_x, high_x], 1) x = self.avg_pool(x) x = x.view(n, -1) x = self.fc(x).view(n, c, 1, 1) x = low_x * x + high_x return x def init_params(self): for m in self.modules(): if isinstance(m, nn.Conv2d): # init.kaiming_normal_(m.weight, mode='fan_out') # init.normal(m.weight, std=0.01) init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.ConvTranspose2d): # init.kaiming_normal_(m.weight, mode='fan_out') # init.normal_(m.weight, std=0.01) init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d init.constant_(m.weight, 1) init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) class FFM(nn.Module): def __init__(self, inchannels, midchannels, outchannels, upfactor=2): super(FFM, self).__init__() self.inchannels = inchannels self.midchannels = midchannels self.outchannels = outchannels self.upfactor = upfactor self.ftb1 = FTB(inchannels=self.inchannels, midchannels=self.midchannels) # self.ata = ATA(inchannels = self.midchannels) self.ftb2 = FTB(inchannels=self.midchannels, midchannels=self.outchannels) self.upsample = nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True) self.init_params() def forward(self, low_x, high_x): x = self.ftb1(low_x) x = x + high_x x = self.ftb2(x) x = self.upsample(x) return x def init_params(self): for m in self.modules(): if isinstance(m, nn.Conv2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.ConvTranspose2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): # NN.Batchnorm2d init.constant_(m.weight, 1) init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) class AO(nn.Module): # Adaptive output module def __init__(self, inchannels, outchannels, upfactor=2): super(AO, self).__init__() self.inchannels = inchannels self.outchannels = outchannels self.upfactor = upfactor self.adapt_conv = nn.Sequential( nn.Conv2d(in_channels=self.inchannels, out_channels=self.inchannels // 2, kernel_size=3, padding=1, stride=1, bias=True), \ nn.BatchNorm2d(num_features=self.inchannels // 2), \ nn.ReLU(inplace=True), \ nn.Conv2d(in_channels=self.inchannels // 2, out_channels=self.outchannels, kernel_size=3, padding=1, stride=1, bias=True), \ nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True)) self.init_params() def forward(self, x): x = self.adapt_conv(x) return x def init_params(self): for m in self.modules(): if isinstance(m, nn.Conv2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.ConvTranspose2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): # NN.Batchnorm2d init.constant_(m.weight, 1) init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) # ============================================================================================================== class ResidualConv(nn.Module): def __init__(self, inchannels): super(ResidualConv, self).__init__() # NN.BatchNorm2d self.conv = nn.Sequential( # nn.BatchNorm2d(num_features=inchannels), nn.ReLU(inplace=False), # nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=3, padding=1, stride=1, groups=inchannels,bias=True), # nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=1, padding=0, stride=1, groups=1,bias=True) nn.Conv2d(in_channels=inchannels, out_channels=inchannels / 2, kernel_size=3, padding=1, stride=1, bias=False), nn.BatchNorm2d(num_features=inchannels / 2), nn.ReLU(inplace=False), nn.Conv2d(in_channels=inchannels / 2, out_channels=inchannels, kernel_size=3, padding=1, stride=1, bias=False) ) self.init_params() def forward(self, x): x = self.conv(x) + x return x def init_params(self): for m in self.modules(): if isinstance(m, nn.Conv2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.ConvTranspose2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d init.constant_(m.weight, 1) init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) class FeatureFusion(nn.Module): def __init__(self, inchannels, outchannels): super(FeatureFusion, self).__init__() self.conv = ResidualConv(inchannels=inchannels) # NN.BatchNorm2d self.up = nn.Sequential(ResidualConv(inchannels=inchannels), nn.ConvTranspose2d(in_channels=inchannels, out_channels=outchannels, kernel_size=3, stride=2, padding=1, output_padding=1), nn.BatchNorm2d(num_features=outchannels), nn.ReLU(inplace=True)) def forward(self, lowfeat, highfeat): return self.up(highfeat + self.conv(lowfeat)) def init_params(self): for m in self.modules(): if isinstance(m, nn.Conv2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.ConvTranspose2d): # init.kaiming_normal_(m.weight, mode='fan_out') init.normal_(m.weight, std=0.01) # init.xavier_normal_(m.weight) if m.bias is not None: init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): # NN.BatchNorm2d init.constant_(m.weight, 1) init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): init.normal_(m.weight, std=0.01) if m.bias is not None: init.constant_(m.bias, 0) class SenceUnderstand(nn.Module): def __init__(self, channels): super(SenceUnderstand, self).__init__() self.channels = channels self.conv1 = nn.Sequential(nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1), nn.ReLU(inplace=True)) self.pool = nn.AdaptiveAvgPool2d(8) self.fc = nn.Sequential(nn.Linear(512 * 8 * 8, self.channels), nn.ReLU(inplace=True)) self.conv2 = nn.Sequential( nn.Conv2d(in_channels=self.channels, out_channels=self.channels, kernel_size=1, padding=0), nn.ReLU(inplace=True)) self.initial_params() def forward(self, x): n, c, h, w = x.size() x = self.conv1(x) x = self.pool(x) x = x.view(n, -1) x = self.fc(x) x = x.view(n, self.channels, 1, 1) x = self.conv2(x) x = x.repeat(1, 1, h, w) return x def initial_params(self, dev=0.01): for m in self.modules(): if isinstance(m, nn.Conv2d): # print torch.sum(m.weight) m.weight.data.normal_(0, dev) if m.bias is not None: m.bias.data.fill_(0) elif isinstance(m, nn.ConvTranspose2d): # print torch.sum(m.weight) m.weight.data.normal_(0, dev) if m.bias is not None: m.bias.data.fill_(0) elif isinstance(m, nn.Linear): m.weight.data.normal_(0, dev) if __name__ == '__main__': net = DepthNet(depth=50, pretrained=True) print(net) inputs = torch.ones(4,3,128,128) out = net(inputs) print(out.size()) ================================================ FILE: annotator/leres/pix2pix/LICENSE ================================================ https://github.com/compphoto/BoostingMonocularDepth Copyright 2021, Seyed Mahdi Hosseini Miangoleh, Sebastian Dille, Computational Photography Laboratory. All rights reserved. This software is for academic use only. A redistribution of this software, with or without modifications, has to be for academic use only, while giving the appropriate credit to the original authors of the software. The methods implemented as a part of this software may be covered under patents or patent applications. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: annotator/leres/pix2pix/models/__init__.py ================================================ """This package contains modules related to objective functions, optimizations, and network architectures. To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel. You need to implement the following five functions: -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). -- : unpack data from dataset and apply preprocessing. -- : produce intermediate results. -- : calculate loss, gradients, and update network weights. -- : (optionally) add model-specific options and set default options. In the function <__init__>, you need to define four lists: -- self.loss_names (str list): specify the training losses that you want to plot and save. -- self.model_names (str list): define networks used in our training. -- self.visual_names (str list): specify the images that you want to display and save. -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an usage. Now you can use the model class by specifying flag '--model dummy'. See our template model class 'template_model.py' for more details. """ import importlib from .base_model import BaseModel def find_model_using_name(model_name): """Import the module "models/[model_name]_model.py". In the file, the class called DatasetNameModel() will be instantiated. It has to be a subclass of BaseModel, and it is case-insensitive. """ model_filename = "annotator.leres.pix2pix.models." + model_name + "_model" modellib = importlib.import_module(model_filename) model = None target_model_name = model_name.replace('_', '') + 'model' for name, cls in modellib.__dict__.items(): if name.lower() == target_model_name.lower() \ and issubclass(cls, BaseModel): model = cls if model is None: print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) exit(0) return model def get_option_setter(model_name): """Return the static method of the model class.""" model_class = find_model_using_name(model_name) return model_class.modify_commandline_options def create_model(opt): """Create a model given the option. This function warps the class CustomDatasetDataLoader. This is the main interface between this package and 'train.py'/'test.py' Example: >>> from models import create_model >>> model = create_model(opt) """ model = find_model_using_name(opt.model) instance = model(opt) print("model [%s] was created" % type(instance).__name__) return instance ================================================ FILE: annotator/leres/pix2pix/models/base_model.py ================================================ import os import torch, gc from modules import devices from collections import OrderedDict from abc import ABC, abstractmethod from . import networks class BaseModel(ABC): """This class is an abstract base class (ABC) for models. To create a subclass, you need to implement the following five functions: -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). -- : unpack data from dataset and apply preprocessing. -- : produce intermediate results. -- : calculate losses, gradients, and update network weights. -- : (optionally) add model-specific options and set default options. """ def __init__(self, opt): """Initialize the BaseModel class. Parameters: opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions When creating your custom class, you need to implement your own initialization. In this function, you should first call Then, you need to define four lists: -- self.loss_names (str list): specify the training losses that you want to plot and save. -- self.model_names (str list): define networks used in our training. -- self.visual_names (str list): specify the images that you want to display and save. -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example. """ self.opt = opt self.gpu_ids = opt.gpu_ids self.isTrain = opt.isTrain self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') # get device name: CPU or GPU self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) # save all the checkpoints to save_dir if opt.preprocess != 'scale_width': # with [scale_width], input images might have different sizes, which hurts the performance of cudnn.benchmark. torch.backends.cudnn.benchmark = True self.loss_names = [] self.model_names = [] self.visual_names = [] self.optimizers = [] self.image_paths = [] self.metric = 0 # used for learning rate policy 'plateau' @staticmethod def modify_commandline_options(parser, is_train): """Add new model-specific options, and rewrite default values for existing options. Parameters: parser -- original option parser is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options. Returns: the modified parser. """ return parser @abstractmethod def set_input(self, input): """Unpack input data from the dataloader and perform necessary pre-processing steps. Parameters: input (dict): includes the data itself and its metadata information. """ pass @abstractmethod def forward(self): """Run forward pass; called by both functions and .""" pass @abstractmethod def optimize_parameters(self): """Calculate losses, gradients, and update network weights; called in every training iteration""" pass def setup(self, opt): """Load and print networks; create schedulers Parameters: opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions """ if self.isTrain: self.schedulers = [networks.get_scheduler(optimizer, opt) for optimizer in self.optimizers] if not self.isTrain or opt.continue_train: load_suffix = 'iter_%d' % opt.load_iter if opt.load_iter > 0 else opt.epoch self.load_networks(load_suffix) self.print_networks(opt.verbose) def eval(self): """Make models eval mode during test time""" for name in self.model_names: if isinstance(name, str): net = getattr(self, 'net' + name) net.eval() def test(self): """Forward function used in test time. This function wraps function in no_grad() so we don't save intermediate steps for backprop It also calls to produce additional visualization results """ with torch.no_grad(): self.forward() self.compute_visuals() def compute_visuals(self): """Calculate additional output images for visdom and HTML visualization""" pass def get_image_paths(self): """ Return image paths that are used to load current data""" return self.image_paths def update_learning_rate(self): """Update learning rates for all the networks; called at the end of every epoch""" old_lr = self.optimizers[0].param_groups[0]['lr'] for scheduler in self.schedulers: if self.opt.lr_policy == 'plateau': scheduler.step(self.metric) else: scheduler.step() lr = self.optimizers[0].param_groups[0]['lr'] print('learning rate %.7f -> %.7f' % (old_lr, lr)) def get_current_visuals(self): """Return visualization images. train.py will display these images with visdom, and save the images to a HTML""" visual_ret = OrderedDict() for name in self.visual_names: if isinstance(name, str): visual_ret[name] = getattr(self, name) return visual_ret def get_current_losses(self): """Return traning losses / errors. train.py will print out these errors on console, and save them to a file""" errors_ret = OrderedDict() for name in self.loss_names: if isinstance(name, str): errors_ret[name] = float(getattr(self, 'loss_' + name)) # float(...) works for both scalar tensor and float number return errors_ret def save_networks(self, epoch): """Save all the networks to the disk. Parameters: epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) """ for name in self.model_names: if isinstance(name, str): save_filename = '%s_net_%s.pth' % (epoch, name) save_path = os.path.join(self.save_dir, save_filename) net = getattr(self, 'net' + name) if len(self.gpu_ids) > 0 and torch.cuda.is_available(): torch.save(net.module.cpu().state_dict(), save_path) net.cuda(self.gpu_ids[0]) else: torch.save(net.cpu().state_dict(), save_path) def unload_network(self, name): """Unload network and gc. """ if isinstance(name, str): net = getattr(self, 'net' + name) del net gc.collect() devices.torch_gc() return None def __patch_instance_norm_state_dict(self, state_dict, module, keys, i=0): """Fix InstanceNorm checkpoints incompatibility (prior to 0.4)""" key = keys[i] if i + 1 == len(keys): # at the end, pointing to a parameter/buffer if module.__class__.__name__.startswith('InstanceNorm') and \ (key == 'running_mean' or key == 'running_var'): if getattr(module, key) is None: state_dict.pop('.'.join(keys)) if module.__class__.__name__.startswith('InstanceNorm') and \ (key == 'num_batches_tracked'): state_dict.pop('.'.join(keys)) else: self.__patch_instance_norm_state_dict(state_dict, getattr(module, key), keys, i + 1) def load_networks(self, epoch): """Load all the networks from the disk. Parameters: epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) """ for name in self.model_names: if isinstance(name, str): load_filename = '%s_net_%s.pth' % (epoch, name) load_path = os.path.join(self.save_dir, load_filename) net = getattr(self, 'net' + name) if isinstance(net, torch.nn.DataParallel): net = net.module # print('Loading depth boost model from %s' % load_path) # if you are using PyTorch newer than 0.4 (e.g., built from # GitHub source), you can remove str() on self.device state_dict = torch.load(load_path, map_location=str(self.device)) if hasattr(state_dict, '_metadata'): del state_dict._metadata # patch InstanceNorm checkpoints prior to 0.4 for key in list(state_dict.keys()): # need to copy keys here because we mutate in loop self.__patch_instance_norm_state_dict(state_dict, net, key.split('.')) net.load_state_dict(state_dict) def print_networks(self, verbose): """Print the total number of parameters in the network and (if verbose) network architecture Parameters: verbose (bool) -- if verbose: print the network architecture """ print('---------- Networks initialized -------------') for name in self.model_names: if isinstance(name, str): net = getattr(self, 'net' + name) num_params = 0 for param in net.parameters(): num_params += param.numel() if verbose: print(net) print('[Network %s] Total number of parameters : %.3f M' % (name, num_params / 1e6)) print('-----------------------------------------------') def set_requires_grad(self, nets, requires_grad=False): """Set requies_grad=Fasle for all the networks to avoid unnecessary computations Parameters: nets (network list) -- a list of networks requires_grad (bool) -- whether the networks require gradients or not """ if not isinstance(nets, list): nets = [nets] for net in nets: if net is not None: for param in net.parameters(): param.requires_grad = requires_grad ================================================ FILE: annotator/leres/pix2pix/models/base_model_hg.py ================================================ import os import torch class BaseModelHG(): def name(self): return 'BaseModel' def initialize(self, opt): self.opt = opt self.gpu_ids = opt.gpu_ids self.isTrain = opt.isTrain self.Tensor = torch.cuda.FloatTensor if self.gpu_ids else torch.Tensor self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) def set_input(self, input): self.input = input def forward(self): pass # used in test time, no backprop def test(self): pass def get_image_paths(self): pass def optimize_parameters(self): pass def get_current_visuals(self): return self.input def get_current_errors(self): return {} def save(self, label): pass # helper saving function that can be used by subclasses def save_network(self, network, network_label, epoch_label, gpu_ids): save_filename = '_%s_net_%s.pth' % (epoch_label, network_label) save_path = os.path.join(self.save_dir, save_filename) torch.save(network.cpu().state_dict(), save_path) if len(gpu_ids) and torch.cuda.is_available(): network.cuda(device_id=gpu_ids[0]) # helper loading function that can be used by subclasses def load_network(self, network, network_label, epoch_label): save_filename = '%s_net_%s.pth' % (epoch_label, network_label) save_path = os.path.join(self.save_dir, save_filename) print(save_path) model = torch.load(save_path) return model # network.load_state_dict(torch.load(save_path)) def update_learning_rate(): pass ================================================ FILE: annotator/leres/pix2pix/models/networks.py ================================================ import torch import torch.nn as nn from torch.nn import init import functools from torch.optim import lr_scheduler ############################################################################### # Helper Functions ############################################################################### class Identity(nn.Module): def forward(self, x): return x def get_norm_layer(norm_type='instance'): """Return a normalization layer Parameters: norm_type (str) -- the name of the normalization layer: batch | instance | none For BatchNorm, we use learnable affine parameters and track running statistics (mean/stddev). For InstanceNorm, we do not use learnable affine parameters. We do not track running statistics. """ if norm_type == 'batch': norm_layer = functools.partial(nn.BatchNorm2d, affine=True, track_running_stats=True) elif norm_type == 'instance': norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False) elif norm_type == 'none': def norm_layer(x): return Identity() else: raise NotImplementedError('normalization layer [%s] is not found' % norm_type) return norm_layer def get_scheduler(optimizer, opt): """Return a learning rate scheduler Parameters: optimizer -- the optimizer of the network opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.  opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine For 'linear', we keep the same learning rate for the first epochs and linearly decay the rate to zero over the next epochs. For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers. See https://pytorch.org/docs/stable/optim.html for more details. """ if opt.lr_policy == 'linear': def lambda_rule(epoch): lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.n_epochs) / float(opt.n_epochs_decay + 1) return lr_l scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule) elif opt.lr_policy == 'step': scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1) elif opt.lr_policy == 'plateau': scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5) elif opt.lr_policy == 'cosine': scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.n_epochs, eta_min=0) else: return NotImplementedError('learning rate policy [%s] is not implemented', opt.lr_policy) return scheduler def init_weights(net, init_type='normal', init_gain=0.02): """Initialize network weights. Parameters: net (network) -- network to be initialized init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal init_gain (float) -- scaling factor for normal, xavier and orthogonal. We use 'normal' in the original pix2pix and CycleGAN paper. But xavier and kaiming might work better for some applications. Feel free to try yourself. """ def init_func(m): # define the initialization function classname = m.__class__.__name__ if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1): if init_type == 'normal': init.normal_(m.weight.data, 0.0, init_gain) elif init_type == 'xavier': init.xavier_normal_(m.weight.data, gain=init_gain) elif init_type == 'kaiming': init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') elif init_type == 'orthogonal': init.orthogonal_(m.weight.data, gain=init_gain) else: raise NotImplementedError('initialization method [%s] is not implemented' % init_type) if hasattr(m, 'bias') and m.bias is not None: init.constant_(m.bias.data, 0.0) elif classname.find('BatchNorm2d') != -1: # BatchNorm Layer's weight is not a matrix; only normal distribution applies. init.normal_(m.weight.data, 1.0, init_gain) init.constant_(m.bias.data, 0.0) # print('initialize network with %s' % init_type) net.apply(init_func) # apply the initialization function def init_net(net, init_type='normal', init_gain=0.02, gpu_ids=[]): """Initialize a network: 1. register CPU/GPU device (with multi-GPU support); 2. initialize the network weights Parameters: net (network) -- the network to be initialized init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal gain (float) -- scaling factor for normal, xavier and orthogonal. gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2 Return an initialized network. """ if len(gpu_ids) > 0: assert(torch.cuda.is_available()) net.to(gpu_ids[0]) net = torch.nn.DataParallel(net, gpu_ids) # multi-GPUs init_weights(net, init_type, init_gain=init_gain) return net def define_G(input_nc, output_nc, ngf, netG, norm='batch', use_dropout=False, init_type='normal', init_gain=0.02, gpu_ids=[]): """Create a generator Parameters: input_nc (int) -- the number of channels in input images output_nc (int) -- the number of channels in output images ngf (int) -- the number of filters in the last conv layer netG (str) -- the architecture's name: resnet_9blocks | resnet_6blocks | unet_256 | unet_128 norm (str) -- the name of normalization layers used in the network: batch | instance | none use_dropout (bool) -- if use dropout layers. init_type (str) -- the name of our initialization method. init_gain (float) -- scaling factor for normal, xavier and orthogonal. gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2 Returns a generator Our current implementation provides two types of generators: U-Net: [unet_128] (for 128x128 input images) and [unet_256] (for 256x256 input images) The original U-Net paper: https://arxiv.org/abs/1505.04597 Resnet-based generator: [resnet_6blocks] (with 6 Resnet blocks) and [resnet_9blocks] (with 9 Resnet blocks) Resnet-based generator consists of several Resnet blocks between a few downsampling/upsampling operations. We adapt Torch code from Justin Johnson's neural style transfer project (https://github.com/jcjohnson/fast-neural-style). The generator has been initialized by . It uses RELU for non-linearity. """ net = None norm_layer = get_norm_layer(norm_type=norm) if netG == 'resnet_9blocks': net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=9) elif netG == 'resnet_6blocks': net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=6) elif netG == 'resnet_12blocks': net = ResnetGenerator(input_nc, output_nc, ngf, norm_layer=norm_layer, use_dropout=use_dropout, n_blocks=12) elif netG == 'unet_128': net = UnetGenerator(input_nc, output_nc, 7, ngf, norm_layer=norm_layer, use_dropout=use_dropout) elif netG == 'unet_256': net = UnetGenerator(input_nc, output_nc, 8, ngf, norm_layer=norm_layer, use_dropout=use_dropout) elif netG == 'unet_672': net = UnetGenerator(input_nc, output_nc, 5, ngf, norm_layer=norm_layer, use_dropout=use_dropout) elif netG == 'unet_960': net = UnetGenerator(input_nc, output_nc, 6, ngf, norm_layer=norm_layer, use_dropout=use_dropout) elif netG == 'unet_1024': net = UnetGenerator(input_nc, output_nc, 10, ngf, norm_layer=norm_layer, use_dropout=use_dropout) else: raise NotImplementedError('Generator model name [%s] is not recognized' % netG) return init_net(net, init_type, init_gain, gpu_ids) def define_D(input_nc, ndf, netD, n_layers_D=3, norm='batch', init_type='normal', init_gain=0.02, gpu_ids=[]): """Create a discriminator Parameters: input_nc (int) -- the number of channels in input images ndf (int) -- the number of filters in the first conv layer netD (str) -- the architecture's name: basic | n_layers | pixel n_layers_D (int) -- the number of conv layers in the discriminator; effective when netD=='n_layers' norm (str) -- the type of normalization layers used in the network. init_type (str) -- the name of the initialization method. init_gain (float) -- scaling factor for normal, xavier and orthogonal. gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2 Returns a discriminator Our current implementation provides three types of discriminators: [basic]: 'PatchGAN' classifier described in the original pix2pix paper. It can classify whether 70×70 overlapping patches are real or fake. Such a patch-level discriminator architecture has fewer parameters than a full-image discriminator and can work on arbitrarily-sized images in a fully convolutional fashion. [n_layers]: With this mode, you can specify the number of conv layers in the discriminator with the parameter (default=3 as used in [basic] (PatchGAN).) [pixel]: 1x1 PixelGAN discriminator can classify whether a pixel is real or not. It encourages greater color diversity but has no effect on spatial statistics. The discriminator has been initialized by . It uses Leakly RELU for non-linearity. """ net = None norm_layer = get_norm_layer(norm_type=norm) if netD == 'basic': # default PatchGAN classifier net = NLayerDiscriminator(input_nc, ndf, n_layers=3, norm_layer=norm_layer) elif netD == 'n_layers': # more options net = NLayerDiscriminator(input_nc, ndf, n_layers_D, norm_layer=norm_layer) elif netD == 'pixel': # classify if each pixel is real or fake net = PixelDiscriminator(input_nc, ndf, norm_layer=norm_layer) else: raise NotImplementedError('Discriminator model name [%s] is not recognized' % netD) return init_net(net, init_type, init_gain, gpu_ids) ############################################################################## # Classes ############################################################################## class GANLoss(nn.Module): """Define different GAN objectives. The GANLoss class abstracts away the need to create the target label tensor that has the same size as the input. """ def __init__(self, gan_mode, target_real_label=1.0, target_fake_label=0.0): """ Initialize the GANLoss class. Parameters: gan_mode (str) - - the type of GAN objective. It currently supports vanilla, lsgan, and wgangp. target_real_label (bool) - - label for a real image target_fake_label (bool) - - label of a fake image Note: Do not use sigmoid as the last layer of Discriminator. LSGAN needs no sigmoid. vanilla GANs will handle it with BCEWithLogitsLoss. """ super(GANLoss, self).__init__() self.register_buffer('real_label', torch.tensor(target_real_label)) self.register_buffer('fake_label', torch.tensor(target_fake_label)) self.gan_mode = gan_mode if gan_mode == 'lsgan': self.loss = nn.MSELoss() elif gan_mode == 'vanilla': self.loss = nn.BCEWithLogitsLoss() elif gan_mode in ['wgangp']: self.loss = None else: raise NotImplementedError('gan mode %s not implemented' % gan_mode) def get_target_tensor(self, prediction, target_is_real): """Create label tensors with the same size as the input. Parameters: prediction (tensor) - - tpyically the prediction from a discriminator target_is_real (bool) - - if the ground truth label is for real images or fake images Returns: A label tensor filled with ground truth label, and with the size of the input """ if target_is_real: target_tensor = self.real_label else: target_tensor = self.fake_label return target_tensor.expand_as(prediction) def __call__(self, prediction, target_is_real): """Calculate loss given Discriminator's output and grount truth labels. Parameters: prediction (tensor) - - tpyically the prediction output from a discriminator target_is_real (bool) - - if the ground truth label is for real images or fake images Returns: the calculated loss. """ if self.gan_mode in ['lsgan', 'vanilla']: target_tensor = self.get_target_tensor(prediction, target_is_real) loss = self.loss(prediction, target_tensor) elif self.gan_mode == 'wgangp': if target_is_real: loss = -prediction.mean() else: loss = prediction.mean() return loss def cal_gradient_penalty(netD, real_data, fake_data, device, type='mixed', constant=1.0, lambda_gp=10.0): """Calculate the gradient penalty loss, used in WGAN-GP paper https://arxiv.org/abs/1704.00028 Arguments: netD (network) -- discriminator network real_data (tensor array) -- real images fake_data (tensor array) -- generated images from the generator device (str) -- GPU / CPU: from torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') type (str) -- if we mix real and fake data or not [real | fake | mixed]. constant (float) -- the constant used in formula ( ||gradient||_2 - constant)^2 lambda_gp (float) -- weight for this loss Returns the gradient penalty loss """ if lambda_gp > 0.0: if type == 'real': # either use real images, fake images, or a linear interpolation of two. interpolatesv = real_data elif type == 'fake': interpolatesv = fake_data elif type == 'mixed': alpha = torch.rand(real_data.shape[0], 1, device=device) alpha = alpha.expand(real_data.shape[0], real_data.nelement() // real_data.shape[0]).contiguous().view(*real_data.shape) interpolatesv = alpha * real_data + ((1 - alpha) * fake_data) else: raise NotImplementedError('{} not implemented'.format(type)) interpolatesv.requires_grad_(True) disc_interpolates = netD(interpolatesv) gradients = torch.autograd.grad(outputs=disc_interpolates, inputs=interpolatesv, grad_outputs=torch.ones(disc_interpolates.size()).to(device), create_graph=True, retain_graph=True, only_inputs=True) gradients = gradients[0].view(real_data.size(0), -1) # flat the data gradient_penalty = (((gradients + 1e-16).norm(2, dim=1) - constant) ** 2).mean() * lambda_gp # added eps return gradient_penalty, gradients else: return 0.0, None class ResnetGenerator(nn.Module): """Resnet-based generator that consists of Resnet blocks between a few downsampling/upsampling operations. We adapt Torch code and idea from Justin Johnson's neural style transfer project(https://github.com/jcjohnson/fast-neural-style) """ def __init__(self, input_nc, output_nc, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False, n_blocks=6, padding_type='reflect'): """Construct a Resnet-based generator Parameters: input_nc (int) -- the number of channels in input images output_nc (int) -- the number of channels in output images ngf (int) -- the number of filters in the last conv layer norm_layer -- normalization layer use_dropout (bool) -- if use dropout layers n_blocks (int) -- the number of ResNet blocks padding_type (str) -- the name of padding layer in conv layers: reflect | replicate | zero """ assert(n_blocks >= 0) super(ResnetGenerator, self).__init__() if type(norm_layer) == functools.partial: use_bias = norm_layer.func == nn.InstanceNorm2d else: use_bias = norm_layer == nn.InstanceNorm2d model = [nn.ReflectionPad2d(3), nn.Conv2d(input_nc, ngf, kernel_size=7, padding=0, bias=use_bias), norm_layer(ngf), nn.ReLU(True)] n_downsampling = 2 for i in range(n_downsampling): # add downsampling layers mult = 2 ** i model += [nn.Conv2d(ngf * mult, ngf * mult * 2, kernel_size=3, stride=2, padding=1, bias=use_bias), norm_layer(ngf * mult * 2), nn.ReLU(True)] mult = 2 ** n_downsampling for i in range(n_blocks): # add ResNet blocks model += [ResnetBlock(ngf * mult, padding_type=padding_type, norm_layer=norm_layer, use_dropout=use_dropout, use_bias=use_bias)] for i in range(n_downsampling): # add upsampling layers mult = 2 ** (n_downsampling - i) model += [nn.ConvTranspose2d(ngf * mult, int(ngf * mult / 2), kernel_size=3, stride=2, padding=1, output_padding=1, bias=use_bias), norm_layer(int(ngf * mult / 2)), nn.ReLU(True)] model += [nn.ReflectionPad2d(3)] model += [nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)] model += [nn.Tanh()] self.model = nn.Sequential(*model) def forward(self, input): """Standard forward""" return self.model(input) class ResnetBlock(nn.Module): """Define a Resnet block""" def __init__(self, dim, padding_type, norm_layer, use_dropout, use_bias): """Initialize the Resnet block A resnet block is a conv block with skip connections We construct a conv block with build_conv_block function, and implement skip connections in function. Original Resnet paper: https://arxiv.org/pdf/1512.03385.pdf """ super(ResnetBlock, self).__init__() self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, use_dropout, use_bias) def build_conv_block(self, dim, padding_type, norm_layer, use_dropout, use_bias): """Construct a convolutional block. Parameters: dim (int) -- the number of channels in the conv layer. padding_type (str) -- the name of padding layer: reflect | replicate | zero norm_layer -- normalization layer use_dropout (bool) -- if use dropout layers. use_bias (bool) -- if the conv layer uses bias or not Returns a conv block (with a conv layer, a normalization layer, and a non-linearity layer (ReLU)) """ conv_block = [] p = 0 if padding_type == 'reflect': conv_block += [nn.ReflectionPad2d(1)] elif padding_type == 'replicate': conv_block += [nn.ReplicationPad2d(1)] elif padding_type == 'zero': p = 1 else: raise NotImplementedError('padding [%s] is not implemented' % padding_type) conv_block += [nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias), norm_layer(dim), nn.ReLU(True)] if use_dropout: conv_block += [nn.Dropout(0.5)] p = 0 if padding_type == 'reflect': conv_block += [nn.ReflectionPad2d(1)] elif padding_type == 'replicate': conv_block += [nn.ReplicationPad2d(1)] elif padding_type == 'zero': p = 1 else: raise NotImplementedError('padding [%s] is not implemented' % padding_type) conv_block += [nn.Conv2d(dim, dim, kernel_size=3, padding=p, bias=use_bias), norm_layer(dim)] return nn.Sequential(*conv_block) def forward(self, x): """Forward function (with skip connections)""" out = x + self.conv_block(x) # add skip connections return out class UnetGenerator(nn.Module): """Create a Unet-based generator""" def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False): """Construct a Unet generator Parameters: input_nc (int) -- the number of channels in input images output_nc (int) -- the number of channels in output images num_downs (int) -- the number of downsamplings in UNet. For example, # if |num_downs| == 7, image of size 128x128 will become of size 1x1 # at the bottleneck ngf (int) -- the number of filters in the last conv layer norm_layer -- normalization layer We construct the U-Net from the innermost layer to the outermost layer. It is a recursive process. """ super(UnetGenerator, self).__init__() # construct unet structure unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=None, norm_layer=norm_layer, innermost=True) # add the innermost layer for i in range(num_downs - 5): # add intermediate layers with ngf * 8 filters unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, use_dropout=use_dropout) # gradually reduce the number of filters from ngf * 8 to ngf unet_block = UnetSkipConnectionBlock(ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer) unet_block = UnetSkipConnectionBlock(ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer) unet_block = UnetSkipConnectionBlock(ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer) self.model = UnetSkipConnectionBlock(output_nc, ngf, input_nc=input_nc, submodule=unet_block, outermost=True, norm_layer=norm_layer) # add the outermost layer def forward(self, input): """Standard forward""" return self.model(input) class UnetSkipConnectionBlock(nn.Module): """Defines the Unet submodule with skip connection. X -------------------identity---------------------- |-- downsampling -- |submodule| -- upsampling --| """ def __init__(self, outer_nc, inner_nc, input_nc=None, submodule=None, outermost=False, innermost=False, norm_layer=nn.BatchNorm2d, use_dropout=False): """Construct a Unet submodule with skip connections. Parameters: outer_nc (int) -- the number of filters in the outer conv layer inner_nc (int) -- the number of filters in the inner conv layer input_nc (int) -- the number of channels in input images/features submodule (UnetSkipConnectionBlock) -- previously defined submodules outermost (bool) -- if this module is the outermost module innermost (bool) -- if this module is the innermost module norm_layer -- normalization layer use_dropout (bool) -- if use dropout layers. """ super(UnetSkipConnectionBlock, self).__init__() self.outermost = outermost if type(norm_layer) == functools.partial: use_bias = norm_layer.func == nn.InstanceNorm2d else: use_bias = norm_layer == nn.InstanceNorm2d if input_nc is None: input_nc = outer_nc downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=4, stride=2, padding=1, bias=use_bias) downrelu = nn.LeakyReLU(0.2, True) downnorm = norm_layer(inner_nc) uprelu = nn.ReLU(True) upnorm = norm_layer(outer_nc) if outermost: upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc, kernel_size=4, stride=2, padding=1) down = [downconv] up = [uprelu, upconv, nn.Tanh()] model = down + [submodule] + up elif innermost: upconv = nn.ConvTranspose2d(inner_nc, outer_nc, kernel_size=4, stride=2, padding=1, bias=use_bias) down = [downrelu, downconv] up = [uprelu, upconv, upnorm] model = down + up else: upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc, kernel_size=4, stride=2, padding=1, bias=use_bias) down = [downrelu, downconv, downnorm] up = [uprelu, upconv, upnorm] if use_dropout: model = down + [submodule] + up + [nn.Dropout(0.5)] else: model = down + [submodule] + up self.model = nn.Sequential(*model) def forward(self, x): if self.outermost: return self.model(x) else: # add skip connections return torch.cat([x, self.model(x)], 1) class NLayerDiscriminator(nn.Module): """Defines a PatchGAN discriminator""" def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d): """Construct a PatchGAN discriminator Parameters: input_nc (int) -- the number of channels in input images ndf (int) -- the number of filters in the last conv layer n_layers (int) -- the number of conv layers in the discriminator norm_layer -- normalization layer """ super(NLayerDiscriminator, self).__init__() if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters use_bias = norm_layer.func == nn.InstanceNorm2d else: use_bias = norm_layer == nn.InstanceNorm2d kw = 4 padw = 1 sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)] nf_mult = 1 nf_mult_prev = 1 for n in range(1, n_layers): # gradually increase the number of filters nf_mult_prev = nf_mult nf_mult = min(2 ** n, 8) sequence += [ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias), norm_layer(ndf * nf_mult), nn.LeakyReLU(0.2, True) ] nf_mult_prev = nf_mult nf_mult = min(2 ** n_layers, 8) sequence += [ nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias), norm_layer(ndf * nf_mult), nn.LeakyReLU(0.2, True) ] sequence += [nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map self.model = nn.Sequential(*sequence) def forward(self, input): """Standard forward.""" return self.model(input) class PixelDiscriminator(nn.Module): """Defines a 1x1 PatchGAN discriminator (pixelGAN)""" def __init__(self, input_nc, ndf=64, norm_layer=nn.BatchNorm2d): """Construct a 1x1 PatchGAN discriminator Parameters: input_nc (int) -- the number of channels in input images ndf (int) -- the number of filters in the last conv layer norm_layer -- normalization layer """ super(PixelDiscriminator, self).__init__() if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters use_bias = norm_layer.func == nn.InstanceNorm2d else: use_bias = norm_layer == nn.InstanceNorm2d self.net = [ nn.Conv2d(input_nc, ndf, kernel_size=1, stride=1, padding=0), nn.LeakyReLU(0.2, True), nn.Conv2d(ndf, ndf * 2, kernel_size=1, stride=1, padding=0, bias=use_bias), norm_layer(ndf * 2), nn.LeakyReLU(0.2, True), nn.Conv2d(ndf * 2, 1, kernel_size=1, stride=1, padding=0, bias=use_bias)] self.net = nn.Sequential(*self.net) def forward(self, input): """Standard forward.""" return self.net(input) ================================================ FILE: annotator/leres/pix2pix/models/pix2pix4depth_model.py ================================================ import torch from .base_model import BaseModel from . import networks class Pix2Pix4DepthModel(BaseModel): """ This class implements the pix2pix model, for learning a mapping from input images to output images given paired data. The model training requires '--dataset_mode aligned' dataset. By default, it uses a '--netG unet256' U-Net generator, a '--netD basic' discriminator (PatchGAN), and a '--gan_mode' vanilla GAN loss (the cross-entropy objective used in the orignal GAN paper). pix2pix paper: https://arxiv.org/pdf/1611.07004.pdf """ @staticmethod def modify_commandline_options(parser, is_train=True): """Add new dataset-specific options, and rewrite default values for existing options. Parameters: parser -- original option parser is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options. Returns: the modified parser. For pix2pix, we do not use image buffer The training objective is: GAN Loss + lambda_L1 * ||G(A)-B||_1 By default, we use vanilla GAN loss, UNet with batchnorm, and aligned datasets. """ # changing the default values to match the pix2pix paper (https://phillipi.github.io/pix2pix/) parser.set_defaults(input_nc=2,output_nc=1,norm='none', netG='unet_1024', dataset_mode='depthmerge') if is_train: parser.set_defaults(pool_size=0, gan_mode='vanilla',) parser.add_argument('--lambda_L1', type=float, default=1000, help='weight for L1 loss') return parser def __init__(self, opt): """Initialize the pix2pix class. Parameters: opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions """ BaseModel.__init__(self, opt) # specify the training losses you want to print out. The training/test scripts will call self.loss_names = ['G_GAN', 'G_L1', 'D_real', 'D_fake'] # self.loss_names = ['G_L1'] # specify the images you want to save/display. The training/test scripts will call if self.isTrain: self.visual_names = ['outer','inner', 'fake_B', 'real_B'] else: self.visual_names = ['fake_B'] # specify the models you want to save to the disk. The training/test scripts will call and if self.isTrain: self.model_names = ['G','D'] else: # during test time, only load G self.model_names = ['G'] # define networks (both generator and discriminator) self.netG = networks.define_G(opt.input_nc, opt.output_nc, 64, 'unet_1024', 'none', False, 'normal', 0.02, self.gpu_ids) if self.isTrain: # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc self.netD = networks.define_D(opt.input_nc + opt.output_nc, opt.ndf, opt.netD, opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids) if self.isTrain: # define loss functions self.criterionGAN = networks.GANLoss(opt.gan_mode).to(self.device) self.criterionL1 = torch.nn.L1Loss() # initialize optimizers; schedulers will be automatically created by function . self.optimizer_G = torch.optim.Adam(self.netG.parameters(), lr=1e-4, betas=(opt.beta1, 0.999)) self.optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=2e-06, betas=(opt.beta1, 0.999)) self.optimizers.append(self.optimizer_G) self.optimizers.append(self.optimizer_D) def set_input_train(self, input): self.outer = input['data_outer'].to(self.device) self.outer = torch.nn.functional.interpolate(self.outer,(1024,1024),mode='bilinear',align_corners=False) self.inner = input['data_inner'].to(self.device) self.inner = torch.nn.functional.interpolate(self.inner,(1024,1024),mode='bilinear',align_corners=False) self.image_paths = input['image_path'] if self.isTrain: self.gtfake = input['data_gtfake'].to(self.device) self.gtfake = torch.nn.functional.interpolate(self.gtfake, (1024, 1024), mode='bilinear', align_corners=False) self.real_B = self.gtfake self.real_A = torch.cat((self.outer, self.inner), 1) def set_input(self, outer, inner): inner = torch.from_numpy(inner).unsqueeze(0).unsqueeze(0) outer = torch.from_numpy(outer).unsqueeze(0).unsqueeze(0) inner = (inner - torch.min(inner))/(torch.max(inner)-torch.min(inner)) outer = (outer - torch.min(outer))/(torch.max(outer)-torch.min(outer)) inner = self.normalize(inner) outer = self.normalize(outer) self.real_A = torch.cat((outer, inner), 1).to(self.device) def normalize(self, input): input = input * 2 input = input - 1 return input def forward(self): """Run forward pass; called by both functions and .""" self.fake_B = self.netG(self.real_A) # G(A) def backward_D(self): """Calculate GAN loss for the discriminator""" # Fake; stop backprop to the generator by detaching fake_B fake_AB = torch.cat((self.real_A, self.fake_B), 1) # we use conditional GANs; we need to feed both input and output to the discriminator pred_fake = self.netD(fake_AB.detach()) self.loss_D_fake = self.criterionGAN(pred_fake, False) # Real real_AB = torch.cat((self.real_A, self.real_B), 1) pred_real = self.netD(real_AB) self.loss_D_real = self.criterionGAN(pred_real, True) # combine loss and calculate gradients self.loss_D = (self.loss_D_fake + self.loss_D_real) * 0.5 self.loss_D.backward() def backward_G(self): """Calculate GAN and L1 loss for the generator""" # First, G(A) should fake the discriminator fake_AB = torch.cat((self.real_A, self.fake_B), 1) pred_fake = self.netD(fake_AB) self.loss_G_GAN = self.criterionGAN(pred_fake, True) # Second, G(A) = B self.loss_G_L1 = self.criterionL1(self.fake_B, self.real_B) * self.opt.lambda_L1 # combine loss and calculate gradients self.loss_G = self.loss_G_L1 + self.loss_G_GAN self.loss_G.backward() def optimize_parameters(self): self.forward() # compute fake images: G(A) # update D self.set_requires_grad(self.netD, True) # enable backprop for D self.optimizer_D.zero_grad() # set D's gradients to zero self.backward_D() # calculate gradients for D self.optimizer_D.step() # update D's weights # update G self.set_requires_grad(self.netD, False) # D requires no gradients when optimizing G self.optimizer_G.zero_grad() # set G's gradients to zero self.backward_G() # calculate graidents for G self.optimizer_G.step() # udpate G's weights ================================================ FILE: annotator/leres/pix2pix/options/__init__.py ================================================ """This package options includes option modules: training options, test options, and basic options (used in both training and test).""" ================================================ FILE: annotator/leres/pix2pix/options/base_options.py ================================================ import argparse import os from ...pix2pix.util import util # import torch from ...pix2pix import models # import pix2pix.data import numpy as np class BaseOptions(): """This class defines options used during both training and test time. It also implements several helper functions such as parsing, printing, and saving the options. It also gathers additional options defined in functions in both dataset class and model class. """ def __init__(self): """Reset the class; indicates the class hasn't been initailized""" self.initialized = False def initialize(self, parser): """Define the common options that are used in both training and test.""" # basic parameters parser.add_argument('--dataroot', help='path to images (should have subfolders trainA, trainB, valA, valB, etc)') parser.add_argument('--name', type=str, default='void', help='mahdi_unet_new, scaled_unet') parser.add_argument('--gpu_ids', type=str, default='0', help='gpu ids: e.g. 0 0,1,2, 0,2. use -1 for CPU') parser.add_argument('--checkpoints_dir', type=str, default='./pix2pix/checkpoints', help='models are saved here') # model parameters parser.add_argument('--model', type=str, default='cycle_gan', help='chooses which model to use. [cycle_gan | pix2pix | test | colorization]') parser.add_argument('--input_nc', type=int, default=2, help='# of input image channels: 3 for RGB and 1 for grayscale') parser.add_argument('--output_nc', type=int, default=1, help='# of output image channels: 3 for RGB and 1 for grayscale') parser.add_argument('--ngf', type=int, default=64, help='# of gen filters in the last conv layer') parser.add_argument('--ndf', type=int, default=64, help='# of discrim filters in the first conv layer') parser.add_argument('--netD', type=str, default='basic', help='specify discriminator architecture [basic | n_layers | pixel]. The basic model is a 70x70 PatchGAN. n_layers allows you to specify the layers in the discriminator') parser.add_argument('--netG', type=str, default='resnet_9blocks', help='specify generator architecture [resnet_9blocks | resnet_6blocks | unet_256 | unet_128]') parser.add_argument('--n_layers_D', type=int, default=3, help='only used if netD==n_layers') parser.add_argument('--norm', type=str, default='instance', help='instance normalization or batch normalization [instance | batch | none]') parser.add_argument('--init_type', type=str, default='normal', help='network initialization [normal | xavier | kaiming | orthogonal]') parser.add_argument('--init_gain', type=float, default=0.02, help='scaling factor for normal, xavier and orthogonal.') parser.add_argument('--no_dropout', action='store_true', help='no dropout for the generator') # dataset parameters parser.add_argument('--dataset_mode', type=str, default='unaligned', help='chooses how datasets are loaded. [unaligned | aligned | single | colorization]') parser.add_argument('--direction', type=str, default='AtoB', help='AtoB or BtoA') parser.add_argument('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly') parser.add_argument('--num_threads', default=4, type=int, help='# threads for loading data') parser.add_argument('--batch_size', type=int, default=1, help='input batch size') parser.add_argument('--load_size', type=int, default=672, help='scale images to this size') parser.add_argument('--crop_size', type=int, default=672, help='then crop to this size') parser.add_argument('--max_dataset_size', type=int, default=10000, help='Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded.') parser.add_argument('--preprocess', type=str, default='resize_and_crop', help='scaling and cropping of images at load time [resize_and_crop | crop | scale_width | scale_width_and_crop | none]') parser.add_argument('--no_flip', action='store_true', help='if specified, do not flip the images for data augmentation') parser.add_argument('--display_winsize', type=int, default=256, help='display window size for both visdom and HTML') # additional parameters parser.add_argument('--epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model') parser.add_argument('--load_iter', type=int, default='0', help='which iteration to load? if load_iter > 0, the code will load models by iter_[load_iter]; otherwise, the code will load models by [epoch]') parser.add_argument('--verbose', action='store_true', help='if specified, print more debugging information') parser.add_argument('--suffix', default='', type=str, help='customized suffix: opt.name = opt.name + suffix: e.g., {model}_{netG}_size{load_size}') parser.add_argument('--data_dir', type=str, required=False, help='input files directory images can be .png .jpg .tiff') parser.add_argument('--output_dir', type=str, required=False, help='result dir. result depth will be png. vides are JMPG as avi') parser.add_argument('--savecrops', type=int, required=False) parser.add_argument('--savewholeest', type=int, required=False) parser.add_argument('--output_resolution', type=int, required=False, help='0 for no restriction 1 for resize to input size') parser.add_argument('--net_receptive_field_size', type=int, required=False) parser.add_argument('--pix2pixsize', type=int, required=False) parser.add_argument('--generatevideo', type=int, required=False) parser.add_argument('--depthNet', type=int, required=False, help='0: midas 1:strurturedRL') parser.add_argument('--R0', action='store_true') parser.add_argument('--R20', action='store_true') parser.add_argument('--Final', action='store_true') parser.add_argument('--colorize_results', action='store_true') parser.add_argument('--max_res', type=float, default=np.inf) self.initialized = True return parser def gather_options(self): """Initialize our parser with basic options(only once). Add additional model-specific and dataset-specific options. These options are defined in the function in model and dataset classes. """ if not self.initialized: # check if it has been initialized parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = self.initialize(parser) # get the basic options opt, _ = parser.parse_known_args() # modify model-related parser options model_name = opt.model model_option_setter = models.get_option_setter(model_name) parser = model_option_setter(parser, self.isTrain) opt, _ = parser.parse_known_args() # parse again with new defaults # modify dataset-related parser options # dataset_name = opt.dataset_mode # dataset_option_setter = pix2pix.data.get_option_setter(dataset_name) # parser = dataset_option_setter(parser, self.isTrain) # save and return the parser self.parser = parser #return parser.parse_args() #EVIL return opt def print_options(self, opt): """Print and save options It will print both current options and default values(if different). It will save options into a text file / [checkpoints_dir] / opt.txt """ message = '' message += '----------------- Options ---------------\n' for k, v in sorted(vars(opt).items()): comment = '' default = self.parser.get_default(k) if v != default: comment = '\t[default: %s]' % str(default) message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment) message += '----------------- End -------------------' print(message) # save to the disk expr_dir = os.path.join(opt.checkpoints_dir, opt.name) util.mkdirs(expr_dir) file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase)) with open(file_name, 'wt') as opt_file: opt_file.write(message) opt_file.write('\n') def parse(self): """Parse our options, create checkpoints directory suffix, and set up gpu device.""" opt = self.gather_options() opt.isTrain = self.isTrain # train or test # process opt.suffix if opt.suffix: suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else '' opt.name = opt.name + suffix #self.print_options(opt) # set gpu ids str_ids = opt.gpu_ids.split(',') opt.gpu_ids = [] for str_id in str_ids: id = int(str_id) if id >= 0: opt.gpu_ids.append(id) #if len(opt.gpu_ids) > 0: # torch.cuda.set_device(opt.gpu_ids[0]) self.opt = opt return self.opt ================================================ FILE: annotator/leres/pix2pix/options/test_options.py ================================================ from .base_options import BaseOptions class TestOptions(BaseOptions): """This class includes test options. It also includes shared options defined in BaseOptions. """ def initialize(self, parser): parser = BaseOptions.initialize(self, parser) # define shared options parser.add_argument('--aspect_ratio', type=float, default=1.0, help='aspect ratio of result images') parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') # Dropout and Batchnorm has different behavioir during training and test. parser.add_argument('--eval', action='store_true', help='use eval mode during test time.') parser.add_argument('--num_test', type=int, default=50, help='how many test images to run') # rewrite devalue values parser.set_defaults(model='pix2pix4depth') # To avoid cropping, the load_size should be the same as crop_size parser.set_defaults(load_size=parser.get_default('crop_size')) self.isTrain = False return parser ================================================ FILE: annotator/leres/pix2pix/util/__init__.py ================================================ """This package includes a miscellaneous collection of useful helper functions.""" ================================================ FILE: annotator/leres/pix2pix/util/get_data.py ================================================ from __future__ import print_function import os import tarfile import requests from warnings import warn from zipfile import ZipFile from bs4 import BeautifulSoup from os.path import abspath, isdir, join, basename class GetData(object): """A Python script for downloading CycleGAN or pix2pix datasets. Parameters: technique (str) -- One of: 'cyclegan' or 'pix2pix'. verbose (bool) -- If True, print additional information. Examples: >>> from util.get_data import GetData >>> gd = GetData(technique='cyclegan') >>> new_data_path = gd.get(save_path='./datasets') # options will be displayed. Alternatively, You can use bash scripts: 'scripts/download_pix2pix_model.sh' and 'scripts/download_cyclegan_model.sh'. """ def __init__(self, technique='cyclegan', verbose=True): url_dict = { 'pix2pix': 'http://efrosgans.eecs.berkeley.edu/pix2pix/datasets/', 'cyclegan': 'https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets' } self.url = url_dict.get(technique.lower()) self._verbose = verbose def _print(self, text): if self._verbose: print(text) @staticmethod def _get_options(r): soup = BeautifulSoup(r.text, 'lxml') options = [h.text for h in soup.find_all('a', href=True) if h.text.endswith(('.zip', 'tar.gz'))] return options def _present_options(self): r = requests.get(self.url) options = self._get_options(r) print('Options:\n') for i, o in enumerate(options): print("{0}: {1}".format(i, o)) choice = input("\nPlease enter the number of the " "dataset above you wish to download:") return options[int(choice)] def _download_data(self, dataset_url, save_path): if not isdir(save_path): os.makedirs(save_path) base = basename(dataset_url) temp_save_path = join(save_path, base) with open(temp_save_path, "wb") as f: r = requests.get(dataset_url) f.write(r.content) if base.endswith('.tar.gz'): obj = tarfile.open(temp_save_path) elif base.endswith('.zip'): obj = ZipFile(temp_save_path, 'r') else: raise ValueError("Unknown File Type: {0}.".format(base)) self._print("Unpacking Data...") obj.extractall(save_path) obj.close() os.remove(temp_save_path) def get(self, save_path, dataset=None): """ Download a dataset. Parameters: save_path (str) -- A directory to save the data to. dataset (str) -- (optional). A specific dataset to download. Note: this must include the file extension. If None, options will be presented for you to choose from. Returns: save_path_full (str) -- the absolute path to the downloaded data. """ if dataset is None: selected_dataset = self._present_options() else: selected_dataset = dataset save_path_full = join(save_path, selected_dataset.split('.')[0]) if isdir(save_path_full): warn("\n'{0}' already exists. Voiding Download.".format( save_path_full)) else: self._print('Downloading Data...') url = "{0}/{1}".format(self.url, selected_dataset) self._download_data(url, save_path=save_path) return abspath(save_path_full) ================================================ FILE: annotator/leres/pix2pix/util/guidedfilter.py ================================================ import numpy as np class GuidedFilter(): def __init__(self, source, reference, r=64, eps= 0.05**2): self.source = source; self.reference = reference; self.r = r self.eps = eps self.smooth = self.guidedfilter(self.source,self.reference,self.r,self.eps) def boxfilter(self,img, r): (rows, cols) = img.shape imDst = np.zeros_like(img) imCum = np.cumsum(img, 0) imDst[0 : r+1, :] = imCum[r : 2*r+1, :] imDst[r+1 : rows-r, :] = imCum[2*r+1 : rows, :] - imCum[0 : rows-2*r-1, :] imDst[rows-r: rows, :] = np.tile(imCum[rows-1, :], [r, 1]) - imCum[rows-2*r-1 : rows-r-1, :] imCum = np.cumsum(imDst, 1) imDst[:, 0 : r+1] = imCum[:, r : 2*r+1] imDst[:, r+1 : cols-r] = imCum[:, 2*r+1 : cols] - imCum[:, 0 : cols-2*r-1] imDst[:, cols-r: cols] = np.tile(imCum[:, cols-1], [r, 1]).T - imCum[:, cols-2*r-1 : cols-r-1] return imDst def guidedfilter(self,I, p, r, eps): (rows, cols) = I.shape N = self.boxfilter(np.ones([rows, cols]), r) meanI = self.boxfilter(I, r) / N meanP = self.boxfilter(p, r) / N meanIp = self.boxfilter(I * p, r) / N covIp = meanIp - meanI * meanP meanII = self.boxfilter(I * I, r) / N varI = meanII - meanI * meanI a = covIp / (varI + eps) b = meanP - a * meanI meanA = self.boxfilter(a, r) / N meanB = self.boxfilter(b, r) / N q = meanA * I + meanB return q ================================================ FILE: annotator/leres/pix2pix/util/html.py ================================================ import dominate from dominate.tags import meta, h3, table, tr, td, p, a, img, br import os class HTML: """This HTML class allows us to save images and write texts into a single HTML file. It consists of functions such as (add a text header to the HTML file), (add a row of images to the HTML file), and (save the HTML to the disk). It is based on Python library 'dominate', a Python library for creating and manipulating HTML documents using a DOM API. """ def __init__(self, web_dir, title, refresh=0): """Initialize the HTML classes Parameters: web_dir (str) -- a directory that stores the webpage. HTML file will be created at /index.html; images will be saved at 0: with self.doc.head: meta(http_equiv="refresh", content=str(refresh)) def get_image_dir(self): """Return the directory that stores images""" return self.img_dir def add_header(self, text): """Insert a header to the HTML file Parameters: text (str) -- the header text """ with self.doc: h3(text) def add_images(self, ims, txts, links, width=400): """add images to the HTML file Parameters: ims (str list) -- a list of image paths txts (str list) -- a list of image names shown on the website links (str list) -- a list of hyperref links; when you click an image, it will redirect you to a new page """ self.t = table(border=1, style="table-layout: fixed;") # Insert a table self.doc.add(self.t) with self.t: with tr(): for im, txt, link in zip(ims, txts, links): with td(style="word-wrap: break-word;", halign="center", valign="top"): with p(): with a(href=os.path.join('images', link)): img(style="width:%dpx" % width, src=os.path.join('images', im)) br() p(txt) def save(self): """save the current content to the HMTL file""" html_file = '%s/index.html' % self.web_dir f = open(html_file, 'wt') f.write(self.doc.render()) f.close() if __name__ == '__main__': # we show an example usage here. html = HTML('web/', 'test_html') html.add_header('hello world') ims, txts, links = [], [], [] for n in range(4): ims.append('image_%d.png' % n) txts.append('text_%d' % n) links.append('image_%d.png' % n) html.add_images(ims, txts, links) html.save() ================================================ FILE: annotator/leres/pix2pix/util/image_pool.py ================================================ import random import torch class ImagePool(): """This class implements an image buffer that stores previously generated images. This buffer enables us to update discriminators using a history of generated images rather than the ones produced by the latest generators. """ def __init__(self, pool_size): """Initialize the ImagePool class Parameters: pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created """ self.pool_size = pool_size if self.pool_size > 0: # create an empty pool self.num_imgs = 0 self.images = [] def query(self, images): """Return an image from the pool. Parameters: images: the latest generated images from the generator Returns images from the buffer. By 50/100, the buffer will return input images. By 50/100, the buffer will return images previously stored in the buffer, and insert the current images to the buffer. """ if self.pool_size == 0: # if the buffer size is 0, do nothing return images return_images = [] for image in images: image = torch.unsqueeze(image.data, 0) if self.num_imgs < self.pool_size: # if the buffer is not full; keep inserting current images to the buffer self.num_imgs = self.num_imgs + 1 self.images.append(image) return_images.append(image) else: p = random.uniform(0, 1) if p > 0.5: # by 50% chance, the buffer will return a previously stored image, and insert the current image into the buffer random_id = random.randint(0, self.pool_size - 1) # randint is inclusive tmp = self.images[random_id].clone() self.images[random_id] = image return_images.append(tmp) else: # by another 50% chance, the buffer will return the current image return_images.append(image) return_images = torch.cat(return_images, 0) # collect all the images and return return return_images ================================================ FILE: annotator/leres/pix2pix/util/util.py ================================================ """This module contains simple helper functions """ from __future__ import print_function import torch import numpy as np from PIL import Image import os def tensor2im(input_image, imtype=np.uint16): """"Converts a Tensor array into a numpy image array. Parameters: input_image (tensor) -- the input image tensor array imtype (type) -- the desired type of the converted numpy array """ if not isinstance(input_image, np.ndarray): if isinstance(input_image, torch.Tensor): # get the data from a variable image_tensor = input_image.data else: return input_image image_numpy = torch.squeeze(image_tensor).cpu().numpy() # convert it into a numpy array image_numpy = (image_numpy + 1) / 2.0 * (2**16-1) # else: # if it is a numpy array, do nothing image_numpy = input_image return image_numpy.astype(imtype) def diagnose_network(net, name='network'): """Calculate and print the mean of average absolute(gradients) Parameters: net (torch network) -- Torch network name (str) -- the name of the network """ mean = 0.0 count = 0 for param in net.parameters(): if param.grad is not None: mean += torch.mean(torch.abs(param.grad.data)) count += 1 if count > 0: mean = mean / count print(name) print(mean) def save_image(image_numpy, image_path, aspect_ratio=1.0): """Save a numpy image to the disk Parameters: image_numpy (numpy array) -- input numpy array image_path (str) -- the path of the image """ image_pil = Image.fromarray(image_numpy) image_pil = image_pil.convert('I;16') # image_pil = Image.fromarray(image_numpy) # h, w, _ = image_numpy.shape # # if aspect_ratio > 1.0: # image_pil = image_pil.resize((h, int(w * aspect_ratio)), Image.BICUBIC) # if aspect_ratio < 1.0: # image_pil = image_pil.resize((int(h / aspect_ratio), w), Image.BICUBIC) image_pil.save(image_path) def print_numpy(x, val=True, shp=False): """Print the mean, min, max, median, std, and size of a numpy array Parameters: val (bool) -- if print the values of the numpy array shp (bool) -- if print the shape of the numpy array """ x = x.astype(np.float64) if shp: print('shape,', x.shape) if val: x = x.flatten() print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % ( np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x))) def mkdirs(paths): """create empty directories if they don't exist Parameters: paths (str list) -- a list of directory paths """ if isinstance(paths, list) and not isinstance(paths, str): for path in paths: mkdir(path) else: mkdir(paths) def mkdir(path): """create a single empty directory if it didn't exist Parameters: path (str) -- a single directory path """ if not os.path.exists(path): os.makedirs(path) ================================================ FILE: annotator/leres/pix2pix/util/visualizer.py ================================================ import numpy as np import os import sys import ntpath import time from . import util, html from subprocess import Popen, PIPE import torch if sys.version_info[0] == 2: VisdomExceptionBase = Exception else: VisdomExceptionBase = ConnectionError def save_images(webpage, visuals, image_path, aspect_ratio=1.0, width=256): """Save images to the disk. Parameters: webpage (the HTML class) -- the HTML webpage class that stores these imaegs (see html.py for more details) visuals (OrderedDict) -- an ordered dictionary that stores (name, images (either tensor or numpy) ) pairs image_path (str) -- the string is used to create image paths aspect_ratio (float) -- the aspect ratio of saved images width (int) -- the images will be resized to width x width This function will save images stored in 'visuals' to the HTML file specified by 'webpage'. """ image_dir = webpage.get_image_dir() short_path = ntpath.basename(image_path[0]) name = os.path.splitext(short_path)[0] webpage.add_header(name) ims, txts, links = [], [], [] for label, im_data in visuals.items(): im = util.tensor2im(im_data) image_name = '%s_%s.png' % (name, label) save_path = os.path.join(image_dir, image_name) util.save_image(im, save_path, aspect_ratio=aspect_ratio) ims.append(image_name) txts.append(label) links.append(image_name) webpage.add_images(ims, txts, links, width=width) class Visualizer(): """This class includes several functions that can display/save images and print/save logging information. It uses a Python library 'visdom' for display, and a Python library 'dominate' (wrapped in 'HTML') for creating HTML files with images. """ def __init__(self, opt): """Initialize the Visualizer class Parameters: opt -- stores all the experiment flags; needs to be a subclass of BaseOptions Step 1: Cache the training/test options Step 2: connect to a visdom server Step 3: create an HTML object for saveing HTML filters Step 4: create a logging file to store training losses """ self.opt = opt # cache the option self.display_id = opt.display_id self.use_html = opt.isTrain and not opt.no_html self.win_size = opt.display_winsize self.name = opt.name self.port = opt.display_port self.saved = False if self.use_html: # create an HTML object at /web/; images will be saved under /web/images/ self.web_dir = os.path.join(opt.checkpoints_dir, opt.name, 'web') self.img_dir = os.path.join(self.web_dir, 'images') print('create web directory %s...' % self.web_dir) util.mkdirs([self.web_dir, self.img_dir]) # create a logging file to store training losses self.log_name = os.path.join(opt.checkpoints_dir, opt.name, 'loss_log.txt') with open(self.log_name, "a") as log_file: now = time.strftime("%c") log_file.write('================ Training Loss (%s) ================\n' % now) def reset(self): """Reset the self.saved status""" self.saved = False def create_visdom_connections(self): """If the program could not connect to Visdom server, this function will start a new server at port < self.port > """ cmd = sys.executable + ' -m visdom.server -p %d &>/dev/null &' % self.port print('\n\nCould not connect to Visdom server. \n Trying to start a server....') print('Command: %s' % cmd) Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) def display_current_results(self, visuals, epoch, save_result): """Display current results on visdom; save current results to an HTML file. Parameters: visuals (OrderedDict) - - dictionary of images to display or save epoch (int) - - the current epoch save_result (bool) - - if save the current results to an HTML file """ if self.use_html and (save_result or not self.saved): # save images to an HTML file if they haven't been saved. self.saved = True # save images to the disk for label, image in visuals.items(): image_numpy = util.tensor2im(image) img_path = os.path.join(self.img_dir, 'epoch%.3d_%s.png' % (epoch, label)) util.save_image(image_numpy, img_path) # update website webpage = html.HTML(self.web_dir, 'Experiment name = %s' % self.name, refresh=1) for n in range(epoch, 0, -1): webpage.add_header('epoch [%d]' % n) ims, txts, links = [], [], [] for label, image_numpy in visuals.items(): # image_numpy = util.tensor2im(image) img_path = 'epoch%.3d_%s.png' % (n, label) ims.append(img_path) txts.append(label) links.append(img_path) webpage.add_images(ims, txts, links, width=self.win_size) webpage.save() # def plot_current_losses(self, epoch, counter_ratio, losses): # """display the current losses on visdom display: dictionary of error labels and values # # Parameters: # epoch (int) -- current epoch # counter_ratio (float) -- progress (percentage) in the current epoch, between 0 to 1 # losses (OrderedDict) -- training losses stored in the format of (name, float) pairs # """ # if not hasattr(self, 'plot_data'): # self.plot_data = {'X': [], 'Y': [], 'legend': list(losses.keys())} # self.plot_data['X'].append(epoch + counter_ratio) # self.plot_data['Y'].append([losses[k] for k in self.plot_data['legend']]) # try: # self.vis.line( # X=np.stack([np.array(self.plot_data['X'])] * len(self.plot_data['legend']), 1), # Y=np.array(self.plot_data['Y']), # opts={ # 'title': self.name + ' loss over time', # 'legend': self.plot_data['legend'], # 'xlabel': 'epoch', # 'ylabel': 'loss'}, # win=self.display_id) # except VisdomExceptionBase: # self.create_visdom_connections() # losses: same format as |losses| of plot_current_losses def print_current_losses(self, epoch, iters, losses, t_comp, t_data): """print current losses on console; also save the losses to the disk Parameters: epoch (int) -- current epoch iters (int) -- current training iteration during this epoch (reset to 0 at the end of every epoch) losses (OrderedDict) -- training losses stored in the format of (name, float) pairs t_comp (float) -- computational time per data point (normalized by batch_size) t_data (float) -- data loading time per data point (normalized by batch_size) """ message = '(epoch: %d, iters: %d, time: %.3f, data: %.3f) ' % (epoch, iters, t_comp, t_data) for k, v in losses.items(): message += '%s: %.3f ' % (k, v) print(message) # print the message with open(self.log_name, "a") as log_file: log_file.write('%s\n' % message) # save the message ================================================ FILE: annotator/lineart/LICENSE ================================================ MIT License Copyright (c) 2022 Caroline Chan Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/lineart/__init__.py ================================================ import os import cv2 import torch import numpy as np import torch.nn as nn from einops import rearrange from modules import devices from annotator.annotator_path import models_path norm_layer = nn.InstanceNorm2d class ResidualBlock(nn.Module): def __init__(self, in_features): super(ResidualBlock, self).__init__() conv_block = [ nn.ReflectionPad2d(1), nn.Conv2d(in_features, in_features, 3), norm_layer(in_features), nn.ReLU(inplace=True), nn.ReflectionPad2d(1), nn.Conv2d(in_features, in_features, 3), norm_layer(in_features) ] self.conv_block = nn.Sequential(*conv_block) def forward(self, x): return x + self.conv_block(x) class Generator(nn.Module): def __init__(self, input_nc, output_nc, n_residual_blocks=9, sigmoid=True): super(Generator, self).__init__() # Initial convolution block model0 = [ nn.ReflectionPad2d(3), nn.Conv2d(input_nc, 64, 7), norm_layer(64), nn.ReLU(inplace=True) ] self.model0 = nn.Sequential(*model0) # Downsampling model1 = [] in_features = 64 out_features = in_features*2 for _ in range(2): model1 += [ nn.Conv2d(in_features, out_features, 3, stride=2, padding=1), norm_layer(out_features), nn.ReLU(inplace=True) ] in_features = out_features out_features = in_features*2 self.model1 = nn.Sequential(*model1) model2 = [] # Residual blocks for _ in range(n_residual_blocks): model2 += [ResidualBlock(in_features)] self.model2 = nn.Sequential(*model2) # Upsampling model3 = [] out_features = in_features//2 for _ in range(2): model3 += [ nn.ConvTranspose2d(in_features, out_features, 3, stride=2, padding=1, output_padding=1), norm_layer(out_features), nn.ReLU(inplace=True) ] in_features = out_features out_features = in_features//2 self.model3 = nn.Sequential(*model3) # Output layer model4 = [ nn.ReflectionPad2d(3), nn.Conv2d(64, output_nc, 7)] if sigmoid: model4 += [nn.Sigmoid()] self.model4 = nn.Sequential(*model4) def forward(self, x, cond=None): out = self.model0(x) out = self.model1(out) out = self.model2(out) out = self.model3(out) out = self.model4(out) return out class LineartDetector: model_dir = os.path.join(models_path, "lineart") model_default = 'sk_model.pth' model_coarse = 'sk_model2.pth' def __init__(self, model_name): self.model = None self.model_name = model_name self.device = devices.get_device_for("controlnet") def load_model(self, name): remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/" + name model_path = os.path.join(self.model_dir, name) if not os.path.exists(model_path): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) model = Generator(3, 1, 3) model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu'))) model.eval() self.model = model.to(self.device) def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, input_image): if self.model is None: self.load_model(self.model_name) self.model.to(self.device) assert input_image.ndim == 3 image = input_image with torch.no_grad(): image = torch.from_numpy(image).float().to(self.device) image = image / 255.0 image = rearrange(image, 'h w c -> 1 c h w') line = self.model(image)[0][0] line = line.cpu().numpy() line = (line * 255.0).clip(0, 255).astype(np.uint8) return line ================================================ FILE: annotator/lineart_anime/LICENSE ================================================ MIT License Copyright (c) 2022 Caroline Chan Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/lineart_anime/__init__.py ================================================ import numpy as np import torch import torch.nn as nn import functools import os import cv2 from einops import rearrange from modules import devices from annotator.annotator_path import models_path class UnetGenerator(nn.Module): """Create a Unet-based generator""" def __init__(self, input_nc, output_nc, num_downs, ngf=64, norm_layer=nn.BatchNorm2d, use_dropout=False): """Construct a Unet generator Parameters: input_nc (int) -- the number of channels in input images output_nc (int) -- the number of channels in output images num_downs (int) -- the number of downsamplings in UNet. For example, # if |num_downs| == 7, image of size 128x128 will become of size 1x1 # at the bottleneck ngf (int) -- the number of filters in the last conv layer norm_layer -- normalization layer We construct the U-Net from the innermost layer to the outermost layer. It is a recursive process. """ super(UnetGenerator, self).__init__() # construct unet structure unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=None, norm_layer=norm_layer, innermost=True) # add the innermost layer for _ in range(num_downs - 5): # add intermediate layers with ngf * 8 filters unet_block = UnetSkipConnectionBlock(ngf * 8, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer, use_dropout=use_dropout) # gradually reduce the number of filters from ngf * 8 to ngf unet_block = UnetSkipConnectionBlock(ngf * 4, ngf * 8, input_nc=None, submodule=unet_block, norm_layer=norm_layer) unet_block = UnetSkipConnectionBlock(ngf * 2, ngf * 4, input_nc=None, submodule=unet_block, norm_layer=norm_layer) unet_block = UnetSkipConnectionBlock(ngf, ngf * 2, input_nc=None, submodule=unet_block, norm_layer=norm_layer) self.model = UnetSkipConnectionBlock(output_nc, ngf, input_nc=input_nc, submodule=unet_block, outermost=True, norm_layer=norm_layer) # add the outermost layer def forward(self, input): """Standard forward""" return self.model(input) class UnetSkipConnectionBlock(nn.Module): """Defines the Unet submodule with skip connection. X -------------------identity---------------------- |-- downsampling -- |submodule| -- upsampling --| """ def __init__(self, outer_nc, inner_nc, input_nc=None, submodule=None, outermost=False, innermost=False, norm_layer=nn.BatchNorm2d, use_dropout=False): """Construct a Unet submodule with skip connections. Parameters: outer_nc (int) -- the number of filters in the outer conv layer inner_nc (int) -- the number of filters in the inner conv layer input_nc (int) -- the number of channels in input images/features submodule (UnetSkipConnectionBlock) -- previously defined submodules outermost (bool) -- if this module is the outermost module innermost (bool) -- if this module is the innermost module norm_layer -- normalization layer use_dropout (bool) -- if use dropout layers. """ super(UnetSkipConnectionBlock, self).__init__() self.outermost = outermost if type(norm_layer) == functools.partial: use_bias = norm_layer.func == nn.InstanceNorm2d else: use_bias = norm_layer == nn.InstanceNorm2d if input_nc is None: input_nc = outer_nc downconv = nn.Conv2d(input_nc, inner_nc, kernel_size=4, stride=2, padding=1, bias=use_bias) downrelu = nn.LeakyReLU(0.2, True) downnorm = norm_layer(inner_nc) uprelu = nn.ReLU(True) upnorm = norm_layer(outer_nc) if outermost: upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc, kernel_size=4, stride=2, padding=1) down = [downconv] up = [uprelu, upconv, nn.Tanh()] model = down + [submodule] + up elif innermost: upconv = nn.ConvTranspose2d(inner_nc, outer_nc, kernel_size=4, stride=2, padding=1, bias=use_bias) down = [downrelu, downconv] up = [uprelu, upconv, upnorm] model = down + up else: upconv = nn.ConvTranspose2d(inner_nc * 2, outer_nc, kernel_size=4, stride=2, padding=1, bias=use_bias) down = [downrelu, downconv, downnorm] up = [uprelu, upconv, upnorm] if use_dropout: model = down + [submodule] + up + [nn.Dropout(0.5)] else: model = down + [submodule] + up self.model = nn.Sequential(*model) def forward(self, x): if self.outermost: return self.model(x) else: # add skip connections return torch.cat([x, self.model(x)], 1) class LineartAnimeDetector: model_dir = os.path.join(models_path, "lineart_anime") def __init__(self): self.model = None self.device = devices.get_device_for("controlnet") def load_model(self): remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/netG.pth" modelpath = os.path.join(self.model_dir, "netG.pth") if not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False) net = UnetGenerator(3, 1, 8, 64, norm_layer=norm_layer, use_dropout=False) ckpt = torch.load(modelpath) for key in list(ckpt.keys()): if 'module.' in key: ckpt[key.replace('module.', '')] = ckpt[key] del ckpt[key] net.load_state_dict(ckpt) net.eval() self.model = net.to(self.device) def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, input_image): if self.model is None: self.load_model() self.model.to(self.device) H, W, C = input_image.shape Hn = 256 * int(np.ceil(float(H) / 256.0)) Wn = 256 * int(np.ceil(float(W) / 256.0)) img = cv2.resize(input_image, (Wn, Hn), interpolation=cv2.INTER_CUBIC) with torch.no_grad(): image_feed = torch.from_numpy(img).float().to(self.device) image_feed = image_feed / 127.5 - 1.0 image_feed = rearrange(image_feed, 'h w c -> 1 c h w') line = self.model(image_feed)[0, 0] * 127.5 + 127.5 line = line.cpu().numpy() line = cv2.resize(line, (W, H), interpolation=cv2.INTER_CUBIC) line = line.clip(0, 255).astype(np.uint8) return line ================================================ FILE: annotator/manga_line/LICENSE ================================================ MIT License Copyright (c) 2021 Miaomiao Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/manga_line/__init__.py ================================================ import os import torch import torch.nn as nn from PIL import Image import fnmatch import cv2 import sys import numpy as np from einops import rearrange from modules import devices from annotator.annotator_path import models_path class _bn_relu_conv(nn.Module): def __init__(self, in_filters, nb_filters, fw, fh, subsample=1): super(_bn_relu_conv, self).__init__() self.model = nn.Sequential( nn.BatchNorm2d(in_filters, eps=1e-3), nn.LeakyReLU(0.2), nn.Conv2d(in_filters, nb_filters, (fw, fh), stride=subsample, padding=(fw//2, fh//2), padding_mode='zeros') ) def forward(self, x): return self.model(x) # the following are for debugs print("****", np.max(x.cpu().numpy()), np.min(x.cpu().numpy()), np.mean(x.cpu().numpy()), np.std(x.cpu().numpy()), x.shape) for i,layer in enumerate(self.model): if i != 2: x = layer(x) else: x = layer(x) #x = nn.functional.pad(x, (1, 1, 1, 1), mode='constant', value=0) print("____", np.max(x.cpu().numpy()), np.min(x.cpu().numpy()), np.mean(x.cpu().numpy()), np.std(x.cpu().numpy()), x.shape) print(x[0]) return x class _u_bn_relu_conv(nn.Module): def __init__(self, in_filters, nb_filters, fw, fh, subsample=1): super(_u_bn_relu_conv, self).__init__() self.model = nn.Sequential( nn.BatchNorm2d(in_filters, eps=1e-3), nn.LeakyReLU(0.2), nn.Conv2d(in_filters, nb_filters, (fw, fh), stride=subsample, padding=(fw//2, fh//2)), nn.Upsample(scale_factor=2, mode='nearest') ) def forward(self, x): return self.model(x) class _shortcut(nn.Module): def __init__(self, in_filters, nb_filters, subsample=1): super(_shortcut, self).__init__() self.process = False self.model = None if in_filters != nb_filters or subsample != 1: self.process = True self.model = nn.Sequential( nn.Conv2d(in_filters, nb_filters, (1, 1), stride=subsample) ) def forward(self, x, y): #print(x.size(), y.size(), self.process) if self.process: y0 = self.model(x) #print("merge+", torch.max(y0+y), torch.min(y0+y),torch.mean(y0+y), torch.std(y0+y), y0.shape) return y0 + y else: #print("merge", torch.max(x+y), torch.min(x+y),torch.mean(x+y), torch.std(x+y), y.shape) return x + y class _u_shortcut(nn.Module): def __init__(self, in_filters, nb_filters, subsample): super(_u_shortcut, self).__init__() self.process = False self.model = None if in_filters != nb_filters: self.process = True self.model = nn.Sequential( nn.Conv2d(in_filters, nb_filters, (1, 1), stride=subsample, padding_mode='zeros'), nn.Upsample(scale_factor=2, mode='nearest') ) def forward(self, x, y): if self.process: return self.model(x) + y else: return x + y class basic_block(nn.Module): def __init__(self, in_filters, nb_filters, init_subsample=1): super(basic_block, self).__init__() self.conv1 = _bn_relu_conv(in_filters, nb_filters, 3, 3, subsample=init_subsample) self.residual = _bn_relu_conv(nb_filters, nb_filters, 3, 3) self.shortcut = _shortcut(in_filters, nb_filters, subsample=init_subsample) def forward(self, x): x1 = self.conv1(x) x2 = self.residual(x1) return self.shortcut(x, x2) class _u_basic_block(nn.Module): def __init__(self, in_filters, nb_filters, init_subsample=1): super(_u_basic_block, self).__init__() self.conv1 = _u_bn_relu_conv(in_filters, nb_filters, 3, 3, subsample=init_subsample) self.residual = _bn_relu_conv(nb_filters, nb_filters, 3, 3) self.shortcut = _u_shortcut(in_filters, nb_filters, subsample=init_subsample) def forward(self, x): y = self.residual(self.conv1(x)) return self.shortcut(x, y) class _residual_block(nn.Module): def __init__(self, in_filters, nb_filters, repetitions, is_first_layer=False): super(_residual_block, self).__init__() layers = [] for i in range(repetitions): init_subsample = 1 if i == repetitions - 1 and not is_first_layer: init_subsample = 2 if i == 0: l = basic_block(in_filters=in_filters, nb_filters=nb_filters, init_subsample=init_subsample) else: l = basic_block(in_filters=nb_filters, nb_filters=nb_filters, init_subsample=init_subsample) layers.append(l) self.model = nn.Sequential(*layers) def forward(self, x): return self.model(x) class _upsampling_residual_block(nn.Module): def __init__(self, in_filters, nb_filters, repetitions): super(_upsampling_residual_block, self).__init__() layers = [] for i in range(repetitions): l = None if i == 0: l = _u_basic_block(in_filters=in_filters, nb_filters=nb_filters)#(input) else: l = basic_block(in_filters=nb_filters, nb_filters=nb_filters)#(input) layers.append(l) self.model = nn.Sequential(*layers) def forward(self, x): return self.model(x) class res_skip(nn.Module): def __init__(self): super(res_skip, self).__init__() self.block0 = _residual_block(in_filters=1, nb_filters=24, repetitions=2, is_first_layer=True)#(input) self.block1 = _residual_block(in_filters=24, nb_filters=48, repetitions=3)#(block0) self.block2 = _residual_block(in_filters=48, nb_filters=96, repetitions=5)#(block1) self.block3 = _residual_block(in_filters=96, nb_filters=192, repetitions=7)#(block2) self.block4 = _residual_block(in_filters=192, nb_filters=384, repetitions=12)#(block3) self.block5 = _upsampling_residual_block(in_filters=384, nb_filters=192, repetitions=7)#(block4) self.res1 = _shortcut(in_filters=192, nb_filters=192)#(block3, block5, subsample=(1,1)) self.block6 = _upsampling_residual_block(in_filters=192, nb_filters=96, repetitions=5)#(res1) self.res2 = _shortcut(in_filters=96, nb_filters=96)#(block2, block6, subsample=(1,1)) self.block7 = _upsampling_residual_block(in_filters=96, nb_filters=48, repetitions=3)#(res2) self.res3 = _shortcut(in_filters=48, nb_filters=48)#(block1, block7, subsample=(1,1)) self.block8 = _upsampling_residual_block(in_filters=48, nb_filters=24, repetitions=2)#(res3) self.res4 = _shortcut(in_filters=24, nb_filters=24)#(block0,block8, subsample=(1,1)) self.block9 = _residual_block(in_filters=24, nb_filters=16, repetitions=2, is_first_layer=True)#(res4) self.conv15 = _bn_relu_conv(in_filters=16, nb_filters=1, fh=1, fw=1, subsample=1)#(block7) def forward(self, x): x0 = self.block0(x) x1 = self.block1(x0) x2 = self.block2(x1) x3 = self.block3(x2) x4 = self.block4(x3) x5 = self.block5(x4) res1 = self.res1(x3, x5) x6 = self.block6(res1) res2 = self.res2(x2, x6) x7 = self.block7(res2) res3 = self.res3(x1, x7) x8 = self.block8(res3) res4 = self.res4(x0, x8) x9 = self.block9(res4) y = self.conv15(x9) return y class MangaLineExtration: model_dir = os.path.join(models_path, "manga_line") def __init__(self): self.model = None self.device = devices.get_device_for("controlnet") def load_model(self): remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/erika.pth" modelpath = os.path.join(self.model_dir, "erika.pth") if not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) #norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False) net = res_skip() ckpt = torch.load(modelpath) for key in list(ckpt.keys()): if 'module.' in key: ckpt[key.replace('module.', '')] = ckpt[key] del ckpt[key] net.load_state_dict(ckpt) net.eval() self.model = net.to(self.device) def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, input_image): if self.model is None: self.load_model() self.model.to(self.device) img = cv2.cvtColor(input_image, cv2.COLOR_RGB2GRAY) img = np.ascontiguousarray(img.copy()).copy() with torch.no_grad(): image_feed = torch.from_numpy(img).float().to(self.device) image_feed = rearrange(image_feed, 'h w -> 1 1 h w') line = self.model(image_feed) line = 255 - line.cpu().numpy()[0, 0] return line.clip(0, 255).astype(np.uint8) ================================================ FILE: annotator/mediapipe_face/__init__.py ================================================ from .mediapipe_face_common import generate_annotation def apply_mediapipe_face(image, max_faces: int = 1, min_confidence: float = 0.5): return generate_annotation(image, max_faces, min_confidence) ================================================ FILE: annotator/mediapipe_face/mediapipe_face_common.py ================================================ from typing import Mapping import mediapipe as mp import numpy mp_drawing = mp.solutions.drawing_utils mp_drawing_styles = mp.solutions.drawing_styles mp_face_detection = mp.solutions.face_detection # Only for counting faces. mp_face_mesh = mp.solutions.face_mesh mp_face_connections = mp.solutions.face_mesh_connections.FACEMESH_TESSELATION mp_hand_connections = mp.solutions.hands_connections.HAND_CONNECTIONS mp_body_connections = mp.solutions.pose_connections.POSE_CONNECTIONS DrawingSpec = mp.solutions.drawing_styles.DrawingSpec PoseLandmark = mp.solutions.drawing_styles.PoseLandmark min_face_size_pixels: int = 64 f_thick = 2 f_rad = 1 right_iris_draw = DrawingSpec(color=(10, 200, 250), thickness=f_thick, circle_radius=f_rad) right_eye_draw = DrawingSpec(color=(10, 200, 180), thickness=f_thick, circle_radius=f_rad) right_eyebrow_draw = DrawingSpec(color=(10, 220, 180), thickness=f_thick, circle_radius=f_rad) left_iris_draw = DrawingSpec(color=(250, 200, 10), thickness=f_thick, circle_radius=f_rad) left_eye_draw = DrawingSpec(color=(180, 200, 10), thickness=f_thick, circle_radius=f_rad) left_eyebrow_draw = DrawingSpec(color=(180, 220, 10), thickness=f_thick, circle_radius=f_rad) mouth_draw = DrawingSpec(color=(10, 180, 10), thickness=f_thick, circle_radius=f_rad) head_draw = DrawingSpec(color=(10, 200, 10), thickness=f_thick, circle_radius=f_rad) # mp_face_mesh.FACEMESH_CONTOURS has all the items we care about. face_connection_spec = {} for edge in mp_face_mesh.FACEMESH_FACE_OVAL: face_connection_spec[edge] = head_draw for edge in mp_face_mesh.FACEMESH_LEFT_EYE: face_connection_spec[edge] = left_eye_draw for edge in mp_face_mesh.FACEMESH_LEFT_EYEBROW: face_connection_spec[edge] = left_eyebrow_draw # for edge in mp_face_mesh.FACEMESH_LEFT_IRIS: # face_connection_spec[edge] = left_iris_draw for edge in mp_face_mesh.FACEMESH_RIGHT_EYE: face_connection_spec[edge] = right_eye_draw for edge in mp_face_mesh.FACEMESH_RIGHT_EYEBROW: face_connection_spec[edge] = right_eyebrow_draw # for edge in mp_face_mesh.FACEMESH_RIGHT_IRIS: # face_connection_spec[edge] = right_iris_draw for edge in mp_face_mesh.FACEMESH_LIPS: face_connection_spec[edge] = mouth_draw iris_landmark_spec = {468: right_iris_draw, 473: left_iris_draw} def draw_pupils(image, landmark_list, drawing_spec, halfwidth: int = 2): """We have a custom function to draw the pupils because the mp.draw_landmarks method requires a parameter for all landmarks. Until our PR is merged into mediapipe, we need this separate method.""" if len(image.shape) != 3: raise ValueError("Input image must be H,W,C.") image_rows, image_cols, image_channels = image.shape if image_channels != 3: # BGR channels raise ValueError('Input image must contain three channel bgr data.') for idx, landmark in enumerate(landmark_list.landmark): if ( (landmark.HasField('visibility') and landmark.visibility < 0.9) or (landmark.HasField('presence') and landmark.presence < 0.5) ): continue if landmark.x >= 1.0 or landmark.x < 0 or landmark.y >= 1.0 or landmark.y < 0: continue image_x = int(image_cols*landmark.x) image_y = int(image_rows*landmark.y) draw_color = None if isinstance(drawing_spec, Mapping): if drawing_spec.get(idx) is None: continue else: draw_color = drawing_spec[idx].color elif isinstance(drawing_spec, DrawingSpec): draw_color = drawing_spec.color image[image_y-halfwidth:image_y+halfwidth, image_x-halfwidth:image_x+halfwidth, :] = draw_color def reverse_channels(image): """Given a numpy array in RGB form, convert to BGR. Will also convert from BGR to RGB.""" # im[:,:,::-1] is a neat hack to convert BGR to RGB by reversing the indexing order. # im[:,:,::[2,1,0]] would also work but makes a copy of the data. return image[:, :, ::-1] def generate_annotation( img_rgb, max_faces: int, min_confidence: float ): """ Find up to 'max_faces' inside the provided input image. If min_face_size_pixels is provided and nonzero it will be used to filter faces that occupy less than this many pixels in the image. """ with mp_face_mesh.FaceMesh( static_image_mode=True, max_num_faces=max_faces, refine_landmarks=True, min_detection_confidence=min_confidence, ) as facemesh: img_height, img_width, img_channels = img_rgb.shape assert(img_channels == 3) results = facemesh.process(img_rgb).multi_face_landmarks if results is None: print("No faces detected in controlnet image for Mediapipe face annotator.") return numpy.zeros_like(img_rgb) # Filter faces that are too small filtered_landmarks = [] for lm in results: landmarks = lm.landmark face_rect = [ landmarks[0].x, landmarks[0].y, landmarks[0].x, landmarks[0].y, ] # Left, up, right, down. for i in range(len(landmarks)): face_rect[0] = min(face_rect[0], landmarks[i].x) face_rect[1] = min(face_rect[1], landmarks[i].y) face_rect[2] = max(face_rect[2], landmarks[i].x) face_rect[3] = max(face_rect[3], landmarks[i].y) if min_face_size_pixels > 0: face_width = abs(face_rect[2] - face_rect[0]) face_height = abs(face_rect[3] - face_rect[1]) face_width_pixels = face_width * img_width face_height_pixels = face_height * img_height face_size = min(face_width_pixels, face_height_pixels) if face_size >= min_face_size_pixels: filtered_landmarks.append(lm) else: filtered_landmarks.append(lm) # Annotations are drawn in BGR for some reason, but we don't need to flip a zero-filled image at the start. empty = numpy.zeros_like(img_rgb) # Draw detected faces: for face_landmarks in filtered_landmarks: mp_drawing.draw_landmarks( empty, face_landmarks, connections=face_connection_spec.keys(), landmark_drawing_spec=None, connection_drawing_spec=face_connection_spec ) draw_pupils(empty, face_landmarks, iris_landmark_spec, 2) # Flip BGR back to RGB. empty = reverse_channels(empty).copy() return empty ================================================ FILE: annotator/midas/LICENSE ================================================ MIT License Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/midas/__init__.py ================================================ import cv2 import numpy as np import torch from einops import rearrange from .api import MiDaSInference from modules import devices model = None def unload_midas_model(): global model if model is not None: model = model.cpu() def apply_midas(input_image, a=np.pi * 2.0, bg_th=0.1): global model if model is None: model = MiDaSInference(model_type="dpt_hybrid") if devices.get_device_for("controlnet").type != 'mps': model = model.to(devices.get_device_for("controlnet")) assert input_image.ndim == 3 image_depth = input_image with torch.no_grad(): image_depth = torch.from_numpy(image_depth).float() if devices.get_device_for("controlnet").type != 'mps': image_depth = image_depth.to(devices.get_device_for("controlnet")) image_depth = image_depth / 127.5 - 1.0 image_depth = rearrange(image_depth, 'h w c -> 1 c h w') depth = model(image_depth)[0] depth_pt = depth.clone() depth_pt -= torch.min(depth_pt) depth_pt /= torch.max(depth_pt) depth_pt = depth_pt.cpu().numpy() depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8) depth_np = depth.cpu().numpy() x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3) y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3) z = np.ones_like(x) * a x[depth_pt < bg_th] = 0 y[depth_pt < bg_th] = 0 normal = np.stack([x, y, z], axis=2) normal /= np.sum(normal ** 2.0, axis=2, keepdims=True) ** 0.5 normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8)[:, :, ::-1] return depth_image, normal_image ================================================ FILE: annotator/midas/api.py ================================================ # based on https://github.com/isl-org/MiDaS import cv2 import torch import torch.nn as nn import os from annotator.annotator_path import models_path from torchvision.transforms import Compose from .midas.dpt_depth import DPTDepthModel from .midas.midas_net import MidasNet from .midas.midas_net_custom import MidasNet_small from .midas.transforms import Resize, NormalizeImage, PrepareForNet base_model_path = os.path.join(models_path, "midas") old_modeldir = os.path.dirname(os.path.realpath(__file__)) remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt" ISL_PATHS = { "dpt_large": os.path.join(base_model_path, "dpt_large-midas-2f21e586.pt"), "dpt_hybrid": os.path.join(base_model_path, "dpt_hybrid-midas-501f0c75.pt"), "midas_v21": "", "midas_v21_small": "", } OLD_ISL_PATHS = { "dpt_large": os.path.join(old_modeldir, "dpt_large-midas-2f21e586.pt"), "dpt_hybrid": os.path.join(old_modeldir, "dpt_hybrid-midas-501f0c75.pt"), "midas_v21": "", "midas_v21_small": "", } def disabled_train(self, mode=True): """Overwrite model.train with this function to make sure train/eval mode does not change anymore.""" return self def load_midas_transform(model_type): # https://github.com/isl-org/MiDaS/blob/master/run.py # load transform only if model_type == "dpt_large": # DPT-Large net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_hybrid": # DPT-Hybrid net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "midas_v21": net_w, net_h = 384, 384 resize_mode = "upper_bound" normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) elif model_type == "midas_v21_small": net_w, net_h = 256, 256 resize_mode = "upper_bound" normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) else: assert False, f"model_type '{model_type}' not implemented, use: --model_type large" transform = Compose( [ Resize( net_w, net_h, resize_target=None, keep_aspect_ratio=True, ensure_multiple_of=32, resize_method=resize_mode, image_interpolation_method=cv2.INTER_CUBIC, ), normalization, PrepareForNet(), ] ) return transform def load_model(model_type): # https://github.com/isl-org/MiDaS/blob/master/run.py # load network model_path = ISL_PATHS[model_type] old_model_path = OLD_ISL_PATHS[model_type] if model_type == "dpt_large": # DPT-Large model = DPTDepthModel( path=model_path, backbone="vitl16_384", non_negative=True, ) net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_hybrid": # DPT-Hybrid if os.path.exists(old_model_path): model_path = old_model_path elif not os.path.exists(model_path): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=base_model_path) model = DPTDepthModel( path=model_path, backbone="vitb_rn50_384", non_negative=True, ) net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "midas_v21": model = MidasNet(model_path, non_negative=True) net_w, net_h = 384, 384 resize_mode = "upper_bound" normalization = NormalizeImage( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) elif model_type == "midas_v21_small": model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, non_negative=True, blocks={'expand': True}) net_w, net_h = 256, 256 resize_mode = "upper_bound" normalization = NormalizeImage( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) else: print(f"model_type '{model_type}' not implemented, use: --model_type large") assert False transform = Compose( [ Resize( net_w, net_h, resize_target=None, keep_aspect_ratio=True, ensure_multiple_of=32, resize_method=resize_mode, image_interpolation_method=cv2.INTER_CUBIC, ), normalization, PrepareForNet(), ] ) return model.eval(), transform class MiDaSInference(nn.Module): MODEL_TYPES_TORCH_HUB = [ "DPT_Large", "DPT_Hybrid", "MiDaS_small" ] MODEL_TYPES_ISL = [ "dpt_large", "dpt_hybrid", "midas_v21", "midas_v21_small", ] def __init__(self, model_type): super().__init__() assert (model_type in self.MODEL_TYPES_ISL) model, _ = load_model(model_type) self.model = model self.model.train = disabled_train def forward(self, x): with torch.no_grad(): prediction = self.model(x) return prediction ================================================ FILE: annotator/midas/midas/__init__.py ================================================ ================================================ FILE: annotator/midas/midas/base_model.py ================================================ import torch class BaseModel(torch.nn.Module): def load(self, path): """Load model from file. Args: path (str): file path """ parameters = torch.load(path, map_location=torch.device('cpu')) if "optimizer" in parameters: parameters = parameters["model"] self.load_state_dict(parameters) ================================================ FILE: annotator/midas/midas/blocks.py ================================================ import torch import torch.nn as nn from .vit import ( _make_pretrained_vitb_rn50_384, _make_pretrained_vitl16_384, _make_pretrained_vitb16_384, forward_vit, ) def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",): if backbone == "vitl16_384": pretrained = _make_pretrained_vitl16_384( use_pretrained, hooks=hooks, use_readout=use_readout ) scratch = _make_scratch( [256, 512, 1024, 1024], features, groups=groups, expand=expand ) # ViT-L/16 - 85.0% Top1 (backbone) elif backbone == "vitb_rn50_384": pretrained = _make_pretrained_vitb_rn50_384( use_pretrained, hooks=hooks, use_vit_only=use_vit_only, use_readout=use_readout, ) scratch = _make_scratch( [256, 512, 768, 768], features, groups=groups, expand=expand ) # ViT-H/16 - 85.0% Top1 (backbone) elif backbone == "vitb16_384": pretrained = _make_pretrained_vitb16_384( use_pretrained, hooks=hooks, use_readout=use_readout ) scratch = _make_scratch( [96, 192, 384, 768], features, groups=groups, expand=expand ) # ViT-B/16 - 84.6% Top1 (backbone) elif backbone == "resnext101_wsl": pretrained = _make_pretrained_resnext101_wsl(use_pretrained) scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3 elif backbone == "efficientnet_lite3": pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable) scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3 else: print(f"Backbone '{backbone}' not implemented") assert False return pretrained, scratch def _make_scratch(in_shape, out_shape, groups=1, expand=False): scratch = nn.Module() out_shape1 = out_shape out_shape2 = out_shape out_shape3 = out_shape out_shape4 = out_shape if expand==True: out_shape1 = out_shape out_shape2 = out_shape*2 out_shape3 = out_shape*4 out_shape4 = out_shape*8 scratch.layer1_rn = nn.Conv2d( in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups ) scratch.layer2_rn = nn.Conv2d( in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups ) scratch.layer3_rn = nn.Conv2d( in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups ) scratch.layer4_rn = nn.Conv2d( in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups ) return scratch def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): efficientnet = torch.hub.load( "rwightman/gen-efficientnet-pytorch", "tf_efficientnet_lite3", pretrained=use_pretrained, exportable=exportable ) return _make_efficientnet_backbone(efficientnet) def _make_efficientnet_backbone(effnet): pretrained = nn.Module() pretrained.layer1 = nn.Sequential( effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2] ) pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) return pretrained def _make_resnet_backbone(resnet): pretrained = nn.Module() pretrained.layer1 = nn.Sequential( resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1 ) pretrained.layer2 = resnet.layer2 pretrained.layer3 = resnet.layer3 pretrained.layer4 = resnet.layer4 return pretrained def _make_pretrained_resnext101_wsl(use_pretrained): resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl") return _make_resnet_backbone(resnet) class Interpolate(nn.Module): """Interpolation module. """ def __init__(self, scale_factor, mode, align_corners=False): """Init. Args: scale_factor (float): scaling mode (str): interpolation mode """ super(Interpolate, self).__init__() self.interp = nn.functional.interpolate self.scale_factor = scale_factor self.mode = mode self.align_corners = align_corners def forward(self, x): """Forward pass. Args: x (tensor): input Returns: tensor: interpolated data """ x = self.interp( x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners ) return x class ResidualConvUnit(nn.Module): """Residual convolution module. """ def __init__(self, features): """Init. Args: features (int): number of features """ super().__init__() self.conv1 = nn.Conv2d( features, features, kernel_size=3, stride=1, padding=1, bias=True ) self.conv2 = nn.Conv2d( features, features, kernel_size=3, stride=1, padding=1, bias=True ) self.relu = nn.ReLU(inplace=True) def forward(self, x): """Forward pass. Args: x (tensor): input Returns: tensor: output """ out = self.relu(x) out = self.conv1(out) out = self.relu(out) out = self.conv2(out) return out + x class FeatureFusionBlock(nn.Module): """Feature fusion block. """ def __init__(self, features): """Init. Args: features (int): number of features """ super(FeatureFusionBlock, self).__init__() self.resConfUnit1 = ResidualConvUnit(features) self.resConfUnit2 = ResidualConvUnit(features) def forward(self, *xs): """Forward pass. Returns: tensor: output """ output = xs[0] if len(xs) == 2: output += self.resConfUnit1(xs[1]) output = self.resConfUnit2(output) output = nn.functional.interpolate( output, scale_factor=2, mode="bilinear", align_corners=True ) return output class ResidualConvUnit_custom(nn.Module): """Residual convolution module. """ def __init__(self, features, activation, bn): """Init. Args: features (int): number of features """ super().__init__() self.bn = bn self.groups=1 self.conv1 = nn.Conv2d( features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups ) self.conv2 = nn.Conv2d( features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups ) if self.bn==True: self.bn1 = nn.BatchNorm2d(features) self.bn2 = nn.BatchNorm2d(features) self.activation = activation self.skip_add = nn.quantized.FloatFunctional() def forward(self, x): """Forward pass. Args: x (tensor): input Returns: tensor: output """ out = self.activation(x) out = self.conv1(out) if self.bn==True: out = self.bn1(out) out = self.activation(out) out = self.conv2(out) if self.bn==True: out = self.bn2(out) if self.groups > 1: out = self.conv_merge(out) return self.skip_add.add(out, x) # return out + x class FeatureFusionBlock_custom(nn.Module): """Feature fusion block. """ def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True): """Init. Args: features (int): number of features """ super(FeatureFusionBlock_custom, self).__init__() self.deconv = deconv self.align_corners = align_corners self.groups=1 self.expand = expand out_features = features if self.expand==True: out_features = features//2 self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) self.skip_add = nn.quantized.FloatFunctional() def forward(self, *xs): """Forward pass. Returns: tensor: output """ output = xs[0] if len(xs) == 2: res = self.resConfUnit1(xs[1]) output = self.skip_add.add(output, res) # output += res output = self.resConfUnit2(output) output = nn.functional.interpolate( output, scale_factor=2, mode="bilinear", align_corners=self.align_corners ) output = self.out_conv(output) return output ================================================ FILE: annotator/midas/midas/dpt_depth.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from .base_model import BaseModel from .blocks import ( FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder, forward_vit, ) def _make_fusion_block(features, use_bn): return FeatureFusionBlock_custom( features, nn.ReLU(False), deconv=False, bn=use_bn, expand=False, align_corners=True, ) class DPT(BaseModel): def __init__( self, head, features=256, backbone="vitb_rn50_384", readout="project", channels_last=False, use_bn=False, ): super(DPT, self).__init__() self.channels_last = channels_last hooks = { "vitb_rn50_384": [0, 1, 8, 11], "vitb16_384": [2, 5, 8, 11], "vitl16_384": [5, 11, 17, 23], } # Instantiate backbone and reassemble blocks self.pretrained, self.scratch = _make_encoder( backbone, features, False, # Set to true of you want to train from scratch, uses ImageNet weights groups=1, expand=False, exportable=False, hooks=hooks[backbone], use_readout=readout, ) self.scratch.refinenet1 = _make_fusion_block(features, use_bn) self.scratch.refinenet2 = _make_fusion_block(features, use_bn) self.scratch.refinenet3 = _make_fusion_block(features, use_bn) self.scratch.refinenet4 = _make_fusion_block(features, use_bn) self.scratch.output_conv = head def forward(self, x): if self.channels_last == True: x.contiguous(memory_format=torch.channels_last) layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) layer_1_rn = self.scratch.layer1_rn(layer_1) layer_2_rn = self.scratch.layer2_rn(layer_2) layer_3_rn = self.scratch.layer3_rn(layer_3) layer_4_rn = self.scratch.layer4_rn(layer_4) path_4 = self.scratch.refinenet4(layer_4_rn) path_3 = self.scratch.refinenet3(path_4, layer_3_rn) path_2 = self.scratch.refinenet2(path_3, layer_2_rn) path_1 = self.scratch.refinenet1(path_2, layer_1_rn) out = self.scratch.output_conv(path_1) return out class DPTDepthModel(DPT): def __init__(self, path=None, non_negative=True, **kwargs): features = kwargs["features"] if "features" in kwargs else 256 head = nn.Sequential( nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), Interpolate(scale_factor=2, mode="bilinear", align_corners=True), nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(True) if non_negative else nn.Identity(), nn.Identity(), ) super().__init__(head, **kwargs) if path is not None: self.load(path) def forward(self, x): return super().forward(x).squeeze(dim=1) ================================================ FILE: annotator/midas/midas/midas_net.py ================================================ """MidashNet: Network for monocular depth estimation trained by mixing several datasets. This file contains code that is adapted from https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py """ import torch import torch.nn as nn from .base_model import BaseModel from .blocks import FeatureFusionBlock, Interpolate, _make_encoder class MidasNet(BaseModel): """Network for monocular depth estimation. """ def __init__(self, path=None, features=256, non_negative=True): """Init. Args: path (str, optional): Path to saved model. Defaults to None. features (int, optional): Number of features. Defaults to 256. backbone (str, optional): Backbone network for encoder. Defaults to resnet50 """ print("Loading weights: ", path) super(MidasNet, self).__init__() use_pretrained = False if path is None else True self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) self.scratch.refinenet4 = FeatureFusionBlock(features) self.scratch.refinenet3 = FeatureFusionBlock(features) self.scratch.refinenet2 = FeatureFusionBlock(features) self.scratch.refinenet1 = FeatureFusionBlock(features) self.scratch.output_conv = nn.Sequential( nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), Interpolate(scale_factor=2, mode="bilinear"), nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(True) if non_negative else nn.Identity(), ) if path: self.load(path) def forward(self, x): """Forward pass. Args: x (tensor): input data (image) Returns: tensor: depth """ layer_1 = self.pretrained.layer1(x) layer_2 = self.pretrained.layer2(layer_1) layer_3 = self.pretrained.layer3(layer_2) layer_4 = self.pretrained.layer4(layer_3) layer_1_rn = self.scratch.layer1_rn(layer_1) layer_2_rn = self.scratch.layer2_rn(layer_2) layer_3_rn = self.scratch.layer3_rn(layer_3) layer_4_rn = self.scratch.layer4_rn(layer_4) path_4 = self.scratch.refinenet4(layer_4_rn) path_3 = self.scratch.refinenet3(path_4, layer_3_rn) path_2 = self.scratch.refinenet2(path_3, layer_2_rn) path_1 = self.scratch.refinenet1(path_2, layer_1_rn) out = self.scratch.output_conv(path_1) return torch.squeeze(out, dim=1) ================================================ FILE: annotator/midas/midas/midas_net_custom.py ================================================ """MidashNet: Network for monocular depth estimation trained by mixing several datasets. This file contains code that is adapted from https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py """ import torch import torch.nn as nn from .base_model import BaseModel from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder class MidasNet_small(BaseModel): """Network for monocular depth estimation. """ def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True, blocks={'expand': True}): """Init. Args: path (str, optional): Path to saved model. Defaults to None. features (int, optional): Number of features. Defaults to 256. backbone (str, optional): Backbone network for encoder. Defaults to resnet50 """ print("Loading weights: ", path) super(MidasNet_small, self).__init__() use_pretrained = False if path else True self.channels_last = channels_last self.blocks = blocks self.backbone = backbone self.groups = 1 features1=features features2=features features3=features features4=features self.expand = False if "expand" in self.blocks and self.blocks['expand'] == True: self.expand = True features1=features features2=features*2 features3=features*4 features4=features*8 self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable) self.scratch.activation = nn.ReLU(False) self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners) self.scratch.output_conv = nn.Sequential( nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups), Interpolate(scale_factor=2, mode="bilinear"), nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), self.scratch.activation, nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(True) if non_negative else nn.Identity(), nn.Identity(), ) if path: self.load(path) def forward(self, x): """Forward pass. Args: x (tensor): input data (image) Returns: tensor: depth """ if self.channels_last==True: print("self.channels_last = ", self.channels_last) x.contiguous(memory_format=torch.channels_last) layer_1 = self.pretrained.layer1(x) layer_2 = self.pretrained.layer2(layer_1) layer_3 = self.pretrained.layer3(layer_2) layer_4 = self.pretrained.layer4(layer_3) layer_1_rn = self.scratch.layer1_rn(layer_1) layer_2_rn = self.scratch.layer2_rn(layer_2) layer_3_rn = self.scratch.layer3_rn(layer_3) layer_4_rn = self.scratch.layer4_rn(layer_4) path_4 = self.scratch.refinenet4(layer_4_rn) path_3 = self.scratch.refinenet3(path_4, layer_3_rn) path_2 = self.scratch.refinenet2(path_3, layer_2_rn) path_1 = self.scratch.refinenet1(path_2, layer_1_rn) out = self.scratch.output_conv(path_1) return torch.squeeze(out, dim=1) def fuse_model(m): prev_previous_type = nn.Identity() prev_previous_name = '' previous_type = nn.Identity() previous_name = '' for name, module in m.named_modules(): if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU: # print("FUSED ", prev_previous_name, previous_name, name) torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True) elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d: # print("FUSED ", prev_previous_name, previous_name) torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True) # elif previous_type == nn.Conv2d and type(module) == nn.ReLU: # print("FUSED ", previous_name, name) # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True) prev_previous_type = previous_type prev_previous_name = previous_name previous_type = type(module) previous_name = name ================================================ FILE: annotator/midas/midas/transforms.py ================================================ import numpy as np import cv2 import math def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): """Rezise the sample to ensure the given size. Keeps aspect ratio. Args: sample (dict): sample size (tuple): image size Returns: tuple: new size """ shape = list(sample["disparity"].shape) if shape[0] >= size[0] and shape[1] >= size[1]: return sample scale = [0, 0] scale[0] = size[0] / shape[0] scale[1] = size[1] / shape[1] scale = max(scale) shape[0] = math.ceil(scale * shape[0]) shape[1] = math.ceil(scale * shape[1]) # resize sample["image"] = cv2.resize( sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method ) sample["disparity"] = cv2.resize( sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST ) sample["mask"] = cv2.resize( sample["mask"].astype(np.float32), tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST, ) sample["mask"] = sample["mask"].astype(bool) return tuple(shape) class Resize(object): """Resize sample to given size (width, height). """ def __init__( self, width, height, resize_target=True, keep_aspect_ratio=False, ensure_multiple_of=1, resize_method="lower_bound", image_interpolation_method=cv2.INTER_AREA, ): """Init. Args: width (int): desired output width height (int): desired output height resize_target (bool, optional): True: Resize the full sample (image, mask, target). False: Resize image only. Defaults to True. keep_aspect_ratio (bool, optional): True: Keep the aspect ratio of the input sample. Output sample might not have the given width and height, and resize behaviour depends on the parameter 'resize_method'. Defaults to False. ensure_multiple_of (int, optional): Output width and height is constrained to be multiple of this parameter. Defaults to 1. resize_method (str, optional): "lower_bound": Output will be at least as large as the given size. "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) "minimal": Scale as least as possible. (Output size might be smaller than given size.) Defaults to "lower_bound". """ self.__width = width self.__height = height self.__resize_target = resize_target self.__keep_aspect_ratio = keep_aspect_ratio self.__multiple_of = ensure_multiple_of self.__resize_method = resize_method self.__image_interpolation_method = image_interpolation_method def constrain_to_multiple_of(self, x, min_val=0, max_val=None): y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) if max_val is not None and y > max_val: y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) if y < min_val: y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) return y def get_size(self, width, height): # determine new height and width scale_height = self.__height / height scale_width = self.__width / width if self.__keep_aspect_ratio: if self.__resize_method == "lower_bound": # scale such that output size is lower bound if scale_width > scale_height: # fit width scale_height = scale_width else: # fit height scale_width = scale_height elif self.__resize_method == "upper_bound": # scale such that output size is upper bound if scale_width < scale_height: # fit width scale_height = scale_width else: # fit height scale_width = scale_height elif self.__resize_method == "minimal": # scale as least as possbile if abs(1 - scale_width) < abs(1 - scale_height): # fit width scale_height = scale_width else: # fit height scale_width = scale_height else: raise ValueError( f"resize_method {self.__resize_method} not implemented" ) if self.__resize_method == "lower_bound": new_height = self.constrain_to_multiple_of( scale_height * height, min_val=self.__height ) new_width = self.constrain_to_multiple_of( scale_width * width, min_val=self.__width ) elif self.__resize_method == "upper_bound": new_height = self.constrain_to_multiple_of( scale_height * height, max_val=self.__height ) new_width = self.constrain_to_multiple_of( scale_width * width, max_val=self.__width ) elif self.__resize_method == "minimal": new_height = self.constrain_to_multiple_of(scale_height * height) new_width = self.constrain_to_multiple_of(scale_width * width) else: raise ValueError(f"resize_method {self.__resize_method} not implemented") return (new_width, new_height) def __call__(self, sample): width, height = self.get_size( sample["image"].shape[1], sample["image"].shape[0] ) # resize sample sample["image"] = cv2.resize( sample["image"], (width, height), interpolation=self.__image_interpolation_method, ) if self.__resize_target: if "disparity" in sample: sample["disparity"] = cv2.resize( sample["disparity"], (width, height), interpolation=cv2.INTER_NEAREST, ) if "depth" in sample: sample["depth"] = cv2.resize( sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST ) sample["mask"] = cv2.resize( sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST, ) sample["mask"] = sample["mask"].astype(bool) return sample class NormalizeImage(object): """Normlize image by given mean and std. """ def __init__(self, mean, std): self.__mean = mean self.__std = std def __call__(self, sample): sample["image"] = (sample["image"] - self.__mean) / self.__std return sample class PrepareForNet(object): """Prepare sample for usage as network input. """ def __init__(self): pass def __call__(self, sample): image = np.transpose(sample["image"], (2, 0, 1)) sample["image"] = np.ascontiguousarray(image).astype(np.float32) if "mask" in sample: sample["mask"] = sample["mask"].astype(np.float32) sample["mask"] = np.ascontiguousarray(sample["mask"]) if "disparity" in sample: disparity = sample["disparity"].astype(np.float32) sample["disparity"] = np.ascontiguousarray(disparity) if "depth" in sample: depth = sample["depth"].astype(np.float32) sample["depth"] = np.ascontiguousarray(depth) return sample ================================================ FILE: annotator/midas/midas/vit.py ================================================ import torch import torch.nn as nn import timm import types import math import torch.nn.functional as F class Slice(nn.Module): def __init__(self, start_index=1): super(Slice, self).__init__() self.start_index = start_index def forward(self, x): return x[:, self.start_index :] class AddReadout(nn.Module): def __init__(self, start_index=1): super(AddReadout, self).__init__() self.start_index = start_index def forward(self, x): if self.start_index == 2: readout = (x[:, 0] + x[:, 1]) / 2 else: readout = x[:, 0] return x[:, self.start_index :] + readout.unsqueeze(1) class ProjectReadout(nn.Module): def __init__(self, in_features, start_index=1): super(ProjectReadout, self).__init__() self.start_index = start_index self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU()) def forward(self, x): readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :]) features = torch.cat((x[:, self.start_index :], readout), -1) return self.project(features) class Transpose(nn.Module): def __init__(self, dim0, dim1): super(Transpose, self).__init__() self.dim0 = dim0 self.dim1 = dim1 def forward(self, x): x = x.transpose(self.dim0, self.dim1) return x def forward_vit(pretrained, x): b, c, h, w = x.shape glob = pretrained.model.forward_flex(x) layer_1 = pretrained.activations["1"] layer_2 = pretrained.activations["2"] layer_3 = pretrained.activations["3"] layer_4 = pretrained.activations["4"] layer_1 = pretrained.act_postprocess1[0:2](layer_1) layer_2 = pretrained.act_postprocess2[0:2](layer_2) layer_3 = pretrained.act_postprocess3[0:2](layer_3) layer_4 = pretrained.act_postprocess4[0:2](layer_4) unflatten = nn.Sequential( nn.Unflatten( 2, torch.Size( [ h // pretrained.model.patch_size[1], w // pretrained.model.patch_size[0], ] ), ) ) if layer_1.ndim == 3: layer_1 = unflatten(layer_1) if layer_2.ndim == 3: layer_2 = unflatten(layer_2) if layer_3.ndim == 3: layer_3 = unflatten(layer_3) if layer_4.ndim == 3: layer_4 = unflatten(layer_4) layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1) layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2) layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3) layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4) return layer_1, layer_2, layer_3, layer_4 def _resize_pos_embed(self, posemb, gs_h, gs_w): posemb_tok, posemb_grid = ( posemb[:, : self.start_index], posemb[0, self.start_index :], ) gs_old = int(math.sqrt(len(posemb_grid))) posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) posemb = torch.cat([posemb_tok, posemb_grid], dim=1) return posemb def forward_flex(self, x): b, c, h, w = x.shape pos_embed = self._resize_pos_embed( self.pos_embed, h // self.patch_size[1], w // self.patch_size[0] ) B = x.shape[0] if hasattr(self.patch_embed, "backbone"): x = self.patch_embed.backbone(x) if isinstance(x, (list, tuple)): x = x[-1] # last feature if backbone outputs list/tuple of features x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) if getattr(self, "dist_token", None) is not None: cls_tokens = self.cls_token.expand( B, -1, -1 ) # stole cls_tokens impl from Phil Wang, thanks dist_token = self.dist_token.expand(B, -1, -1) x = torch.cat((cls_tokens, dist_token, x), dim=1) else: cls_tokens = self.cls_token.expand( B, -1, -1 ) # stole cls_tokens impl from Phil Wang, thanks x = torch.cat((cls_tokens, x), dim=1) x = x + pos_embed x = self.pos_drop(x) for blk in self.blocks: x = blk(x) x = self.norm(x) return x activations = {} def get_activation(name): def hook(model, input, output): activations[name] = output return hook def get_readout_oper(vit_features, features, use_readout, start_index=1): if use_readout == "ignore": readout_oper = [Slice(start_index)] * len(features) elif use_readout == "add": readout_oper = [AddReadout(start_index)] * len(features) elif use_readout == "project": readout_oper = [ ProjectReadout(vit_features, start_index) for out_feat in features ] else: assert ( False ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" return readout_oper def _make_vit_b16_backbone( model, features=[96, 192, 384, 768], size=[384, 384], hooks=[2, 5, 8, 11], vit_features=768, use_readout="ignore", start_index=1, ): pretrained = nn.Module() pretrained.model = model pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) pretrained.activations = activations readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) # 32, 48, 136, 384 pretrained.act_postprocess1 = nn.Sequential( readout_oper[0], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[0], kernel_size=1, stride=1, padding=0, ), nn.ConvTranspose2d( in_channels=features[0], out_channels=features[0], kernel_size=4, stride=4, padding=0, bias=True, dilation=1, groups=1, ), ) pretrained.act_postprocess2 = nn.Sequential( readout_oper[1], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[1], kernel_size=1, stride=1, padding=0, ), nn.ConvTranspose2d( in_channels=features[1], out_channels=features[1], kernel_size=2, stride=2, padding=0, bias=True, dilation=1, groups=1, ), ) pretrained.act_postprocess3 = nn.Sequential( readout_oper[2], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[2], kernel_size=1, stride=1, padding=0, ), ) pretrained.act_postprocess4 = nn.Sequential( readout_oper[3], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[3], kernel_size=1, stride=1, padding=0, ), nn.Conv2d( in_channels=features[3], out_channels=features[3], kernel_size=3, stride=2, padding=1, ), ) pretrained.model.start_index = start_index pretrained.model.patch_size = [16, 16] # We inject this function into the VisionTransformer instances so that # we can use it with interpolated position embeddings without modifying the library source. pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) pretrained.model._resize_pos_embed = types.MethodType( _resize_pos_embed, pretrained.model ) return pretrained def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None): model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) hooks = [5, 11, 17, 23] if hooks == None else hooks return _make_vit_b16_backbone( model, features=[256, 512, 1024, 1024], hooks=hooks, vit_features=1024, use_readout=use_readout, ) def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None): model = timm.create_model("vit_base_patch16_384", pretrained=pretrained) hooks = [2, 5, 8, 11] if hooks == None else hooks return _make_vit_b16_backbone( model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout ) def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None): model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained) hooks = [2, 5, 8, 11] if hooks == None else hooks return _make_vit_b16_backbone( model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout ) def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None): model = timm.create_model( "vit_deit_base_distilled_patch16_384", pretrained=pretrained ) hooks = [2, 5, 8, 11] if hooks == None else hooks return _make_vit_b16_backbone( model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout, start_index=2, ) def _make_vit_b_rn50_backbone( model, features=[256, 512, 768, 768], size=[384, 384], hooks=[0, 1, 8, 11], vit_features=768, use_vit_only=False, use_readout="ignore", start_index=1, ): pretrained = nn.Module() pretrained.model = model if use_vit_only == True: pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) else: pretrained.model.patch_embed.backbone.stages[0].register_forward_hook( get_activation("1") ) pretrained.model.patch_embed.backbone.stages[1].register_forward_hook( get_activation("2") ) pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) pretrained.activations = activations readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) if use_vit_only == True: pretrained.act_postprocess1 = nn.Sequential( readout_oper[0], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[0], kernel_size=1, stride=1, padding=0, ), nn.ConvTranspose2d( in_channels=features[0], out_channels=features[0], kernel_size=4, stride=4, padding=0, bias=True, dilation=1, groups=1, ), ) pretrained.act_postprocess2 = nn.Sequential( readout_oper[1], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[1], kernel_size=1, stride=1, padding=0, ), nn.ConvTranspose2d( in_channels=features[1], out_channels=features[1], kernel_size=2, stride=2, padding=0, bias=True, dilation=1, groups=1, ), ) else: pretrained.act_postprocess1 = nn.Sequential( nn.Identity(), nn.Identity(), nn.Identity() ) pretrained.act_postprocess2 = nn.Sequential( nn.Identity(), nn.Identity(), nn.Identity() ) pretrained.act_postprocess3 = nn.Sequential( readout_oper[2], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[2], kernel_size=1, stride=1, padding=0, ), ) pretrained.act_postprocess4 = nn.Sequential( readout_oper[3], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[3], kernel_size=1, stride=1, padding=0, ), nn.Conv2d( in_channels=features[3], out_channels=features[3], kernel_size=3, stride=2, padding=1, ), ) pretrained.model.start_index = start_index pretrained.model.patch_size = [16, 16] # We inject this function into the VisionTransformer instances so that # we can use it with interpolated position embeddings without modifying the library source. pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) # We inject this function into the VisionTransformer instances so that # we can use it with interpolated position embeddings without modifying the library source. pretrained.model._resize_pos_embed = types.MethodType( _resize_pos_embed, pretrained.model ) return pretrained def _make_pretrained_vitb_rn50_384( pretrained, use_readout="ignore", hooks=None, use_vit_only=False ): model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained) hooks = [0, 1, 8, 11] if hooks == None else hooks return _make_vit_b_rn50_backbone( model, features=[256, 512, 768, 768], size=[384, 384], hooks=hooks, use_vit_only=use_vit_only, use_readout=use_readout, ) ================================================ FILE: annotator/midas/utils.py ================================================ """Utils for monoDepth.""" import sys import re import numpy as np import cv2 import torch def read_pfm(path): """Read pfm file. Args: path (str): path to file Returns: tuple: (data, scale) """ with open(path, "rb") as file: color = None width = None height = None scale = None endian = None header = file.readline().rstrip() if header.decode("ascii") == "PF": color = True elif header.decode("ascii") == "Pf": color = False else: raise Exception("Not a PFM file: " + path) dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) if dim_match: width, height = list(map(int, dim_match.groups())) else: raise Exception("Malformed PFM header.") scale = float(file.readline().decode("ascii").rstrip()) if scale < 0: # little-endian endian = "<" scale = -scale else: # big-endian endian = ">" data = np.fromfile(file, endian + "f") shape = (height, width, 3) if color else (height, width) data = np.reshape(data, shape) data = np.flipud(data) return data, scale def write_pfm(path, image, scale=1): """Write pfm file. Args: path (str): pathto file image (array): data scale (int, optional): Scale. Defaults to 1. """ with open(path, "wb") as file: color = None if image.dtype.name != "float32": raise Exception("Image dtype must be float32.") image = np.flipud(image) if len(image.shape) == 3 and image.shape[2] == 3: # color image color = True elif ( len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 ): # greyscale color = False else: raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") file.write("PF\n" if color else "Pf\n".encode()) file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) endian = image.dtype.byteorder if endian == "<" or endian == "=" and sys.byteorder == "little": scale = -scale file.write("%f\n".encode() % scale) image.tofile(file) def read_image(path): """Read image and output RGB image (0-1). Args: path (str): path to file Returns: array: RGB image (0-1) """ img = cv2.imread(path) if img.ndim == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 return img def resize_image(img): """Resize image and make it fit for network. Args: img (array): image Returns: tensor: data ready for network """ height_orig = img.shape[0] width_orig = img.shape[1] if width_orig > height_orig: scale = width_orig / 384 else: scale = height_orig / 384 height = (np.ceil(height_orig / scale / 32) * 32).astype(int) width = (np.ceil(width_orig / scale / 32) * 32).astype(int) img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) img_resized = ( torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float() ) img_resized = img_resized.unsqueeze(0) return img_resized def resize_depth(depth, width, height): """Resize depth map and bring to CPU (numpy). Args: depth (tensor): depth width (int): image width height (int): image height Returns: array: processed depth """ depth = torch.squeeze(depth[0, :, :, :]).to("cpu") depth_resized = cv2.resize( depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC ) return depth_resized def write_depth(path, depth, bits=1): """Write depth map to pfm and png file. Args: path (str): filepath without extension depth (array): depth """ write_pfm(path + ".pfm", depth.astype(np.float32)) depth_min = depth.min() depth_max = depth.max() max_val = (2**(8*bits))-1 if depth_max - depth_min > np.finfo("float").eps: out = max_val * (depth - depth_min) / (depth_max - depth_min) else: out = np.zeros(depth.shape, dtype=depth.type) if bits == 1: cv2.imwrite(path + ".png", out.astype("uint8")) elif bits == 2: cv2.imwrite(path + ".png", out.astype("uint16")) return ================================================ FILE: annotator/mlsd/LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2021-present NAVER Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: annotator/mlsd/__init__.py ================================================ import cv2 import numpy as np import torch import os from einops import rearrange from .models.mbv2_mlsd_tiny import MobileV2_MLSD_Tiny from .models.mbv2_mlsd_large import MobileV2_MLSD_Large from .utils import pred_lines from modules import devices from annotator.annotator_path import models_path mlsdmodel = None remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/mlsd_large_512_fp32.pth" old_modeldir = os.path.dirname(os.path.realpath(__file__)) modeldir = os.path.join(models_path, "mlsd") def unload_mlsd_model(): global mlsdmodel if mlsdmodel is not None: mlsdmodel = mlsdmodel.cpu() def apply_mlsd(input_image, thr_v, thr_d): global modelpath, mlsdmodel if mlsdmodel is None: modelpath = os.path.join(modeldir, "mlsd_large_512_fp32.pth") old_modelpath = os.path.join(old_modeldir, "mlsd_large_512_fp32.pth") if os.path.exists(old_modelpath): modelpath = old_modelpath elif not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=modeldir) mlsdmodel = MobileV2_MLSD_Large() mlsdmodel.load_state_dict(torch.load(modelpath), strict=True) mlsdmodel = mlsdmodel.to(devices.get_device_for("controlnet")).eval() model = mlsdmodel assert input_image.ndim == 3 img = input_image img_output = np.zeros_like(img) try: with torch.no_grad(): lines = pred_lines(img, model, [img.shape[0], img.shape[1]], thr_v, thr_d) for line in lines: x_start, y_start, x_end, y_end = [int(val) for val in line] cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1) except Exception as e: pass return img_output[:, :, 0] ================================================ FILE: annotator/mlsd/models/mbv2_mlsd_large.py ================================================ import os import sys import torch import torch.nn as nn import torch.utils.model_zoo as model_zoo from torch.nn import functional as F class BlockTypeA(nn.Module): def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True): super(BlockTypeA, self).__init__() self.conv1 = nn.Sequential( nn.Conv2d(in_c2, out_c2, kernel_size=1), nn.BatchNorm2d(out_c2), nn.ReLU(inplace=True) ) self.conv2 = nn.Sequential( nn.Conv2d(in_c1, out_c1, kernel_size=1), nn.BatchNorm2d(out_c1), nn.ReLU(inplace=True) ) self.upscale = upscale def forward(self, a, b): b = self.conv1(b) a = self.conv2(a) if self.upscale: b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True) return torch.cat((a, b), dim=1) class BlockTypeB(nn.Module): def __init__(self, in_c, out_c): super(BlockTypeB, self).__init__() self.conv1 = nn.Sequential( nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), nn.BatchNorm2d(in_c), nn.ReLU() ) self.conv2 = nn.Sequential( nn.Conv2d(in_c, out_c, kernel_size=3, padding=1), nn.BatchNorm2d(out_c), nn.ReLU() ) def forward(self, x): x = self.conv1(x) + x x = self.conv2(x) return x class BlockTypeC(nn.Module): def __init__(self, in_c, out_c): super(BlockTypeC, self).__init__() self.conv1 = nn.Sequential( nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5), nn.BatchNorm2d(in_c), nn.ReLU() ) self.conv2 = nn.Sequential( nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), nn.BatchNorm2d(in_c), nn.ReLU() ) self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1) def forward(self, x): x = self.conv1(x) x = self.conv2(x) x = self.conv3(x) return x def _make_divisible(v, divisor, min_value=None): """ This function is taken from the original tf repo. It ensures that all layers have a channel number that is divisible by 8 It can be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py :param v: :param divisor: :param min_value: :return: """ if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v class ConvBNReLU(nn.Sequential): def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): self.channel_pad = out_planes - in_planes self.stride = stride #padding = (kernel_size - 1) // 2 # TFLite uses slightly different padding than PyTorch if stride == 2: padding = 0 else: padding = (kernel_size - 1) // 2 super(ConvBNReLU, self).__init__( nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), nn.BatchNorm2d(out_planes), nn.ReLU6(inplace=True) ) self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) def forward(self, x): # TFLite uses different padding if self.stride == 2: x = F.pad(x, (0, 1, 0, 1), "constant", 0) #print(x.shape) for module in self: if not isinstance(module, nn.MaxPool2d): x = module(x) return x class InvertedResidual(nn.Module): def __init__(self, inp, oup, stride, expand_ratio): super(InvertedResidual, self).__init__() self.stride = stride assert stride in [1, 2] hidden_dim = int(round(inp * expand_ratio)) self.use_res_connect = self.stride == 1 and inp == oup layers = [] if expand_ratio != 1: # pw layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) layers.extend([ # dw ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), # pw-linear nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup), ]) self.conv = nn.Sequential(*layers) def forward(self, x): if self.use_res_connect: return x + self.conv(x) else: return self.conv(x) class MobileNetV2(nn.Module): def __init__(self, pretrained=True): """ MobileNet V2 main class Args: num_classes (int): Number of classes width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount inverted_residual_setting: Network structure round_nearest (int): Round the number of channels in each layer to be a multiple of this number Set to 1 to turn off rounding block: Module specifying inverted residual building block for mobilenet """ super(MobileNetV2, self).__init__() block = InvertedResidual input_channel = 32 last_channel = 1280 width_mult = 1.0 round_nearest = 8 inverted_residual_setting = [ # t, c, n, s [1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], [6, 64, 4, 2], [6, 96, 3, 1], #[6, 160, 3, 2], #[6, 320, 1, 1], ] # only check the first element, assuming user knows t,c,n,s are required if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: raise ValueError("inverted_residual_setting should be non-empty " "or a 4-element list, got {}".format(inverted_residual_setting)) # building first layer input_channel = _make_divisible(input_channel * width_mult, round_nearest) self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) features = [ConvBNReLU(4, input_channel, stride=2)] # building inverted residual blocks for t, c, n, s in inverted_residual_setting: output_channel = _make_divisible(c * width_mult, round_nearest) for i in range(n): stride = s if i == 0 else 1 features.append(block(input_channel, output_channel, stride, expand_ratio=t)) input_channel = output_channel self.features = nn.Sequential(*features) self.fpn_selected = [1, 3, 6, 10, 13] # weight initialization for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out') if m.bias is not None: nn.init.zeros_(m.bias) elif isinstance(m, nn.BatchNorm2d): nn.init.ones_(m.weight) nn.init.zeros_(m.bias) elif isinstance(m, nn.Linear): nn.init.normal_(m.weight, 0, 0.01) nn.init.zeros_(m.bias) if pretrained: self._load_pretrained_model() def _forward_impl(self, x): # This exists since TorchScript doesn't support inheritance, so the superclass method # (this one) needs to have a name other than `forward` that can be accessed in a subclass fpn_features = [] for i, f in enumerate(self.features): if i > self.fpn_selected[-1]: break x = f(x) if i in self.fpn_selected: fpn_features.append(x) c1, c2, c3, c4, c5 = fpn_features return c1, c2, c3, c4, c5 def forward(self, x): return self._forward_impl(x) def _load_pretrained_model(self): pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth') model_dict = {} state_dict = self.state_dict() for k, v in pretrain_dict.items(): if k in state_dict: model_dict[k] = v state_dict.update(model_dict) self.load_state_dict(state_dict) class MobileV2_MLSD_Large(nn.Module): def __init__(self): super(MobileV2_MLSD_Large, self).__init__() self.backbone = MobileNetV2(pretrained=False) ## A, B self.block15 = BlockTypeA(in_c1= 64, in_c2= 96, out_c1= 64, out_c2=64, upscale=False) self.block16 = BlockTypeB(128, 64) ## A, B self.block17 = BlockTypeA(in_c1 = 32, in_c2 = 64, out_c1= 64, out_c2= 64) self.block18 = BlockTypeB(128, 64) ## A, B self.block19 = BlockTypeA(in_c1=24, in_c2=64, out_c1=64, out_c2=64) self.block20 = BlockTypeB(128, 64) ## A, B, C self.block21 = BlockTypeA(in_c1=16, in_c2=64, out_c1=64, out_c2=64) self.block22 = BlockTypeB(128, 64) self.block23 = BlockTypeC(64, 16) def forward(self, x): c1, c2, c3, c4, c5 = self.backbone(x) x = self.block15(c4, c5) x = self.block16(x) x = self.block17(c3, x) x = self.block18(x) x = self.block19(c2, x) x = self.block20(x) x = self.block21(c1, x) x = self.block22(x) x = self.block23(x) x = x[:, 7:, :, :] return x ================================================ FILE: annotator/mlsd/models/mbv2_mlsd_tiny.py ================================================ import os import sys import torch import torch.nn as nn import torch.utils.model_zoo as model_zoo from torch.nn import functional as F class BlockTypeA(nn.Module): def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale = True): super(BlockTypeA, self).__init__() self.conv1 = nn.Sequential( nn.Conv2d(in_c2, out_c2, kernel_size=1), nn.BatchNorm2d(out_c2), nn.ReLU(inplace=True) ) self.conv2 = nn.Sequential( nn.Conv2d(in_c1, out_c1, kernel_size=1), nn.BatchNorm2d(out_c1), nn.ReLU(inplace=True) ) self.upscale = upscale def forward(self, a, b): b = self.conv1(b) a = self.conv2(a) b = F.interpolate(b, scale_factor=2.0, mode='bilinear', align_corners=True) return torch.cat((a, b), dim=1) class BlockTypeB(nn.Module): def __init__(self, in_c, out_c): super(BlockTypeB, self).__init__() self.conv1 = nn.Sequential( nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), nn.BatchNorm2d(in_c), nn.ReLU() ) self.conv2 = nn.Sequential( nn.Conv2d(in_c, out_c, kernel_size=3, padding=1), nn.BatchNorm2d(out_c), nn.ReLU() ) def forward(self, x): x = self.conv1(x) + x x = self.conv2(x) return x class BlockTypeC(nn.Module): def __init__(self, in_c, out_c): super(BlockTypeC, self).__init__() self.conv1 = nn.Sequential( nn.Conv2d(in_c, in_c, kernel_size=3, padding=5, dilation=5), nn.BatchNorm2d(in_c), nn.ReLU() ) self.conv2 = nn.Sequential( nn.Conv2d(in_c, in_c, kernel_size=3, padding=1), nn.BatchNorm2d(in_c), nn.ReLU() ) self.conv3 = nn.Conv2d(in_c, out_c, kernel_size=1) def forward(self, x): x = self.conv1(x) x = self.conv2(x) x = self.conv3(x) return x def _make_divisible(v, divisor, min_value=None): """ This function is taken from the original tf repo. It ensures that all layers have a channel number that is divisible by 8 It can be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py :param v: :param divisor: :param min_value: :return: """ if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v class ConvBNReLU(nn.Sequential): def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): self.channel_pad = out_planes - in_planes self.stride = stride #padding = (kernel_size - 1) // 2 # TFLite uses slightly different padding than PyTorch if stride == 2: padding = 0 else: padding = (kernel_size - 1) // 2 super(ConvBNReLU, self).__init__( nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), nn.BatchNorm2d(out_planes), nn.ReLU6(inplace=True) ) self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride) def forward(self, x): # TFLite uses different padding if self.stride == 2: x = F.pad(x, (0, 1, 0, 1), "constant", 0) #print(x.shape) for module in self: if not isinstance(module, nn.MaxPool2d): x = module(x) return x class InvertedResidual(nn.Module): def __init__(self, inp, oup, stride, expand_ratio): super(InvertedResidual, self).__init__() self.stride = stride assert stride in [1, 2] hidden_dim = int(round(inp * expand_ratio)) self.use_res_connect = self.stride == 1 and inp == oup layers = [] if expand_ratio != 1: # pw layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) layers.extend([ # dw ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), # pw-linear nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), nn.BatchNorm2d(oup), ]) self.conv = nn.Sequential(*layers) def forward(self, x): if self.use_res_connect: return x + self.conv(x) else: return self.conv(x) class MobileNetV2(nn.Module): def __init__(self, pretrained=True): """ MobileNet V2 main class Args: num_classes (int): Number of classes width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount inverted_residual_setting: Network structure round_nearest (int): Round the number of channels in each layer to be a multiple of this number Set to 1 to turn off rounding block: Module specifying inverted residual building block for mobilenet """ super(MobileNetV2, self).__init__() block = InvertedResidual input_channel = 32 last_channel = 1280 width_mult = 1.0 round_nearest = 8 inverted_residual_setting = [ # t, c, n, s [1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], [6, 64, 4, 2], #[6, 96, 3, 1], #[6, 160, 3, 2], #[6, 320, 1, 1], ] # only check the first element, assuming user knows t,c,n,s are required if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: raise ValueError("inverted_residual_setting should be non-empty " "or a 4-element list, got {}".format(inverted_residual_setting)) # building first layer input_channel = _make_divisible(input_channel * width_mult, round_nearest) self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) features = [ConvBNReLU(4, input_channel, stride=2)] # building inverted residual blocks for t, c, n, s in inverted_residual_setting: output_channel = _make_divisible(c * width_mult, round_nearest) for i in range(n): stride = s if i == 0 else 1 features.append(block(input_channel, output_channel, stride, expand_ratio=t)) input_channel = output_channel self.features = nn.Sequential(*features) self.fpn_selected = [3, 6, 10] # weight initialization for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out') if m.bias is not None: nn.init.zeros_(m.bias) elif isinstance(m, nn.BatchNorm2d): nn.init.ones_(m.weight) nn.init.zeros_(m.bias) elif isinstance(m, nn.Linear): nn.init.normal_(m.weight, 0, 0.01) nn.init.zeros_(m.bias) #if pretrained: # self._load_pretrained_model() def _forward_impl(self, x): # This exists since TorchScript doesn't support inheritance, so the superclass method # (this one) needs to have a name other than `forward` that can be accessed in a subclass fpn_features = [] for i, f in enumerate(self.features): if i > self.fpn_selected[-1]: break x = f(x) if i in self.fpn_selected: fpn_features.append(x) c2, c3, c4 = fpn_features return c2, c3, c4 def forward(self, x): return self._forward_impl(x) def _load_pretrained_model(self): pretrain_dict = model_zoo.load_url('https://download.pytorch.org/models/mobilenet_v2-b0353104.pth') model_dict = {} state_dict = self.state_dict() for k, v in pretrain_dict.items(): if k in state_dict: model_dict[k] = v state_dict.update(model_dict) self.load_state_dict(state_dict) class MobileV2_MLSD_Tiny(nn.Module): def __init__(self): super(MobileV2_MLSD_Tiny, self).__init__() self.backbone = MobileNetV2(pretrained=True) self.block12 = BlockTypeA(in_c1= 32, in_c2= 64, out_c1= 64, out_c2=64) self.block13 = BlockTypeB(128, 64) self.block14 = BlockTypeA(in_c1 = 24, in_c2 = 64, out_c1= 32, out_c2= 32) self.block15 = BlockTypeB(64, 64) self.block16 = BlockTypeC(64, 16) def forward(self, x): c2, c3, c4 = self.backbone(x) x = self.block12(c3, c4) x = self.block13(x) x = self.block14(c2, x) x = self.block15(x) x = self.block16(x) x = x[:, 7:, :, :] #print(x.shape) x = F.interpolate(x, scale_factor=2.0, mode='bilinear', align_corners=True) return x ================================================ FILE: annotator/mlsd/utils.py ================================================ ''' modified by lihaoweicv pytorch version ''' ''' M-LSD Copyright 2021-present NAVER Corp. Apache License v2.0 ''' import os import numpy as np import cv2 import torch from torch.nn import functional as F from modules import devices def deccode_output_score_and_ptss(tpMap, topk_n = 200, ksize = 5): ''' tpMap: center: tpMap[1, 0, :, :] displacement: tpMap[1, 1:5, :, :] ''' b, c, h, w = tpMap.shape assert b==1, 'only support bsize==1' displacement = tpMap[:, 1:5, :, :][0] center = tpMap[:, 0, :, :] heat = torch.sigmoid(center) hmax = F.max_pool2d( heat, (ksize, ksize), stride=1, padding=(ksize-1)//2) keep = (hmax == heat).float() heat = heat * keep heat = heat.reshape(-1, ) scores, indices = torch.topk(heat, topk_n, dim=-1, largest=True) yy = torch.floor_divide(indices, w).unsqueeze(-1) xx = torch.fmod(indices, w).unsqueeze(-1) ptss = torch.cat((yy, xx),dim=-1) ptss = ptss.detach().cpu().numpy() scores = scores.detach().cpu().numpy() displacement = displacement.detach().cpu().numpy() displacement = displacement.transpose((1,2,0)) return ptss, scores, displacement def pred_lines(image, model, input_shape=[512, 512], score_thr=0.10, dist_thr=20.0): h, w, _ = image.shape h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]] resized_image = np.concatenate([cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA), np.ones([input_shape[0], input_shape[1], 1])], axis=-1) resized_image = resized_image.transpose((2,0,1)) batch_image = np.expand_dims(resized_image, axis=0).astype('float32') batch_image = (batch_image / 127.5) - 1.0 batch_image = torch.from_numpy(batch_image).float().to(devices.get_device_for("controlnet")) outputs = model(batch_image) pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) start = vmap[:, :, :2] end = vmap[:, :, 2:] dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1)) segments_list = [] for center, score in zip(pts, pts_score): y, x = center distance = dist_map[y, x] if score > score_thr and distance > dist_thr: disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :] x_start = x + disp_x_start y_start = y + disp_y_start x_end = x + disp_x_end y_end = y + disp_y_end segments_list.append([x_start, y_start, x_end, y_end]) lines = 2 * np.array(segments_list) # 256 > 512 lines[:, 0] = lines[:, 0] * w_ratio lines[:, 1] = lines[:, 1] * h_ratio lines[:, 2] = lines[:, 2] * w_ratio lines[:, 3] = lines[:, 3] * h_ratio return lines def pred_squares(image, model, input_shape=[512, 512], params={'score': 0.06, 'outside_ratio': 0.28, 'inside_ratio': 0.45, 'w_overlap': 0.0, 'w_degree': 1.95, 'w_length': 0.0, 'w_area': 1.86, 'w_center': 0.14}): ''' shape = [height, width] ''' h, w, _ = image.shape original_shape = [h, w] resized_image = np.concatenate([cv2.resize(image, (input_shape[0], input_shape[1]), interpolation=cv2.INTER_AREA), np.ones([input_shape[0], input_shape[1], 1])], axis=-1) resized_image = resized_image.transpose((2, 0, 1)) batch_image = np.expand_dims(resized_image, axis=0).astype('float32') batch_image = (batch_image / 127.5) - 1.0 batch_image = torch.from_numpy(batch_image).float().to(devices.get_device_for("controlnet")) outputs = model(batch_image) pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) start = vmap[:, :, :2] # (x, y) end = vmap[:, :, 2:] # (x, y) dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1)) junc_list = [] segments_list = [] for junc, score in zip(pts, pts_score): y, x = junc distance = dist_map[y, x] if score > params['score'] and distance > 20.0: junc_list.append([x, y]) disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[y, x, :] d_arrow = 1.0 x_start = x + d_arrow * disp_x_start y_start = y + d_arrow * disp_y_start x_end = x + d_arrow * disp_x_end y_end = y + d_arrow * disp_y_end segments_list.append([x_start, y_start, x_end, y_end]) segments = np.array(segments_list) ####### post processing for squares # 1. get unique lines point = np.array([[0, 0]]) point = point[0] start = segments[:, :2] end = segments[:, 2:] diff = start - end a = diff[:, 1] b = -diff[:, 0] c = a * start[:, 0] + b * start[:, 1] d = np.abs(a * point[0] + b * point[1] - c) / np.sqrt(a ** 2 + b ** 2 + 1e-10) theta = np.arctan2(diff[:, 0], diff[:, 1]) * 180 / np.pi theta[theta < 0.0] += 180 hough = np.concatenate([d[:, None], theta[:, None]], axis=-1) d_quant = 1 theta_quant = 2 hough[:, 0] //= d_quant hough[:, 1] //= theta_quant _, indices, counts = np.unique(hough, axis=0, return_index=True, return_counts=True) acc_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='float32') idx_map = np.zeros([512 // d_quant + 1, 360 // theta_quant + 1], dtype='int32') - 1 yx_indices = hough[indices, :].astype('int32') acc_map[yx_indices[:, 0], yx_indices[:, 1]] = counts idx_map[yx_indices[:, 0], yx_indices[:, 1]] = indices acc_map_np = acc_map # acc_map = acc_map[None, :, :, None] # # ### fast suppression using tensorflow op # acc_map = tf.constant(acc_map, dtype=tf.float32) # max_acc_map = tf.keras.layers.MaxPool2D(pool_size=(5, 5), strides=1, padding='same')(acc_map) # acc_map = acc_map * tf.cast(tf.math.equal(acc_map, max_acc_map), tf.float32) # flatten_acc_map = tf.reshape(acc_map, [1, -1]) # topk_values, topk_indices = tf.math.top_k(flatten_acc_map, k=len(pts)) # _, h, w, _ = acc_map.shape # y = tf.expand_dims(topk_indices // w, axis=-1) # x = tf.expand_dims(topk_indices % w, axis=-1) # yx = tf.concat([y, x], axis=-1) ### fast suppression using pytorch op acc_map = torch.from_numpy(acc_map_np).unsqueeze(0).unsqueeze(0) _,_, h, w = acc_map.shape max_acc_map = F.max_pool2d(acc_map,kernel_size=5, stride=1, padding=2) acc_map = acc_map * ( (acc_map == max_acc_map).float() ) flatten_acc_map = acc_map.reshape([-1, ]) scores, indices = torch.topk(flatten_acc_map, len(pts), dim=-1, largest=True) yy = torch.div(indices, w, rounding_mode='floor').unsqueeze(-1) xx = torch.fmod(indices, w).unsqueeze(-1) yx = torch.cat((yy, xx), dim=-1) yx = yx.detach().cpu().numpy() topk_values = scores.detach().cpu().numpy() indices = idx_map[yx[:, 0], yx[:, 1]] basis = 5 // 2 merged_segments = [] for yx_pt, max_indice, value in zip(yx, indices, topk_values): y, x = yx_pt if max_indice == -1 or value == 0: continue segment_list = [] for y_offset in range(-basis, basis + 1): for x_offset in range(-basis, basis + 1): indice = idx_map[y + y_offset, x + x_offset] cnt = int(acc_map_np[y + y_offset, x + x_offset]) if indice != -1: segment_list.append(segments[indice]) if cnt > 1: check_cnt = 1 current_hough = hough[indice] for new_indice, new_hough in enumerate(hough): if (current_hough == new_hough).all() and indice != new_indice: segment_list.append(segments[new_indice]) check_cnt += 1 if check_cnt == cnt: break group_segments = np.array(segment_list).reshape([-1, 2]) sorted_group_segments = np.sort(group_segments, axis=0) x_min, y_min = sorted_group_segments[0, :] x_max, y_max = sorted_group_segments[-1, :] deg = theta[max_indice] if deg >= 90: merged_segments.append([x_min, y_max, x_max, y_min]) else: merged_segments.append([x_min, y_min, x_max, y_max]) # 2. get intersections new_segments = np.array(merged_segments) # (x1, y1, x2, y2) start = new_segments[:, :2] # (x1, y1) end = new_segments[:, 2:] # (x2, y2) new_centers = (start + end) / 2.0 diff = start - end dist_segments = np.sqrt(np.sum(diff ** 2, axis=-1)) # ax + by = c a = diff[:, 1] b = -diff[:, 0] c = a * start[:, 0] + b * start[:, 1] pre_det = a[:, None] * b[None, :] det = pre_det - np.transpose(pre_det) pre_inter_y = a[:, None] * c[None, :] inter_y = (pre_inter_y - np.transpose(pre_inter_y)) / (det + 1e-10) pre_inter_x = c[:, None] * b[None, :] inter_x = (pre_inter_x - np.transpose(pre_inter_x)) / (det + 1e-10) inter_pts = np.concatenate([inter_x[:, :, None], inter_y[:, :, None]], axis=-1).astype('int32') # 3. get corner information # 3.1 get distance ''' dist_segments: | dist(0), dist(1), dist(2), ...| dist_inter_to_segment1: | dist(inter,0), dist(inter,0), dist(inter,0), ... | | dist(inter,1), dist(inter,1), dist(inter,1), ... | ... dist_inter_to_semgnet2: | dist(inter,0), dist(inter,1), dist(inter,2), ... | | dist(inter,0), dist(inter,1), dist(inter,2), ... | ... ''' dist_inter_to_segment1_start = np.sqrt( np.sum(((inter_pts - start[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] dist_inter_to_segment1_end = np.sqrt( np.sum(((inter_pts - end[:, None, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] dist_inter_to_segment2_start = np.sqrt( np.sum(((inter_pts - start[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] dist_inter_to_segment2_end = np.sqrt( np.sum(((inter_pts - end[None, :, :]) ** 2), axis=-1, keepdims=True)) # [n_batch, n_batch, 1] # sort ascending dist_inter_to_segment1 = np.sort( np.concatenate([dist_inter_to_segment1_start, dist_inter_to_segment1_end], axis=-1), axis=-1) # [n_batch, n_batch, 2] dist_inter_to_segment2 = np.sort( np.concatenate([dist_inter_to_segment2_start, dist_inter_to_segment2_end], axis=-1), axis=-1) # [n_batch, n_batch, 2] # 3.2 get degree inter_to_start = new_centers[:, None, :] - inter_pts deg_inter_to_start = np.arctan2(inter_to_start[:, :, 1], inter_to_start[:, :, 0]) * 180 / np.pi deg_inter_to_start[deg_inter_to_start < 0.0] += 360 inter_to_end = new_centers[None, :, :] - inter_pts deg_inter_to_end = np.arctan2(inter_to_end[:, :, 1], inter_to_end[:, :, 0]) * 180 / np.pi deg_inter_to_end[deg_inter_to_end < 0.0] += 360 ''' B -- G | | C -- R B : blue / G: green / C: cyan / R: red 0 -- 1 | | 3 -- 2 ''' # rename variables deg1_map, deg2_map = deg_inter_to_start, deg_inter_to_end # sort deg ascending deg_sort = np.sort(np.concatenate([deg1_map[:, :, None], deg2_map[:, :, None]], axis=-1), axis=-1) deg_diff_map = np.abs(deg1_map - deg2_map) # we only consider the smallest degree of intersect deg_diff_map[deg_diff_map > 180] = 360 - deg_diff_map[deg_diff_map > 180] # define available degree range deg_range = [60, 120] corner_dict = {corner_info: [] for corner_info in range(4)} inter_points = [] for i in range(inter_pts.shape[0]): for j in range(i + 1, inter_pts.shape[1]): # i, j > line index, always i < j x, y = inter_pts[i, j, :] deg1, deg2 = deg_sort[i, j, :] deg_diff = deg_diff_map[i, j] check_degree = deg_diff > deg_range[0] and deg_diff < deg_range[1] outside_ratio = params['outside_ratio'] # over ratio >>> drop it! inside_ratio = params['inside_ratio'] # over ratio >>> drop it! check_distance = ((dist_inter_to_segment1[i, j, 1] >= dist_segments[i] and \ dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * outside_ratio) or \ (dist_inter_to_segment1[i, j, 1] <= dist_segments[i] and \ dist_inter_to_segment1[i, j, 0] <= dist_segments[i] * inside_ratio)) and \ ((dist_inter_to_segment2[i, j, 1] >= dist_segments[j] and \ dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * outside_ratio) or \ (dist_inter_to_segment2[i, j, 1] <= dist_segments[j] and \ dist_inter_to_segment2[i, j, 0] <= dist_segments[j] * inside_ratio)) if check_degree and check_distance: corner_info = None if (deg1 >= 0 and deg1 <= 45 and deg2 >= 45 and deg2 <= 120) or \ (deg2 >= 315 and deg1 >= 45 and deg1 <= 120): corner_info, color_info = 0, 'blue' elif (deg1 >= 45 and deg1 <= 125 and deg2 >= 125 and deg2 <= 225): corner_info, color_info = 1, 'green' elif (deg1 >= 125 and deg1 <= 225 and deg2 >= 225 and deg2 <= 315): corner_info, color_info = 2, 'black' elif (deg1 >= 0 and deg1 <= 45 and deg2 >= 225 and deg2 <= 315) or \ (deg2 >= 315 and deg1 >= 225 and deg1 <= 315): corner_info, color_info = 3, 'cyan' else: corner_info, color_info = 4, 'red' # we don't use it continue corner_dict[corner_info].append([x, y, i, j]) inter_points.append([x, y]) square_list = [] connect_list = [] segments_list = [] for corner0 in corner_dict[0]: for corner1 in corner_dict[1]: connect01 = False for corner0_line in corner0[2:]: if corner0_line in corner1[2:]: connect01 = True break if connect01: for corner2 in corner_dict[2]: connect12 = False for corner1_line in corner1[2:]: if corner1_line in corner2[2:]: connect12 = True break if connect12: for corner3 in corner_dict[3]: connect23 = False for corner2_line in corner2[2:]: if corner2_line in corner3[2:]: connect23 = True break if connect23: for corner3_line in corner3[2:]: if corner3_line in corner0[2:]: # SQUARE!!! ''' 0 -- 1 | | 3 -- 2 square_list: order: 0 > 1 > 2 > 3 | x0, y0, x1, y1, x2, y2, x3, y3 | | x0, y0, x1, y1, x2, y2, x3, y3 | ... connect_list: order: 01 > 12 > 23 > 30 | line_idx01, line_idx12, line_idx23, line_idx30 | | line_idx01, line_idx12, line_idx23, line_idx30 | ... segments_list: order: 0 > 1 > 2 > 3 | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j | | line_idx0_i, line_idx0_j, line_idx1_i, line_idx1_j, line_idx2_i, line_idx2_j, line_idx3_i, line_idx3_j | ... ''' square_list.append(corner0[:2] + corner1[:2] + corner2[:2] + corner3[:2]) connect_list.append([corner0_line, corner1_line, corner2_line, corner3_line]) segments_list.append(corner0[2:] + corner1[2:] + corner2[2:] + corner3[2:]) def check_outside_inside(segments_info, connect_idx): # return 'outside or inside', min distance, cover_param, peri_param if connect_idx == segments_info[0]: check_dist_mat = dist_inter_to_segment1 else: check_dist_mat = dist_inter_to_segment2 i, j = segments_info min_dist, max_dist = check_dist_mat[i, j, :] connect_dist = dist_segments[connect_idx] if max_dist > connect_dist: return 'outside', min_dist, 0, 1 else: return 'inside', min_dist, -1, -1 top_square = None try: map_size = input_shape[0] / 2 squares = np.array(square_list).reshape([-1, 4, 2]) score_array = [] connect_array = np.array(connect_list) segments_array = np.array(segments_list).reshape([-1, 4, 2]) # get degree of corners: squares_rollup = np.roll(squares, 1, axis=1) squares_rolldown = np.roll(squares, -1, axis=1) vec1 = squares_rollup - squares normalized_vec1 = vec1 / (np.linalg.norm(vec1, axis=-1, keepdims=True) + 1e-10) vec2 = squares_rolldown - squares normalized_vec2 = vec2 / (np.linalg.norm(vec2, axis=-1, keepdims=True) + 1e-10) inner_products = np.sum(normalized_vec1 * normalized_vec2, axis=-1) # [n_squares, 4] squares_degree = np.arccos(inner_products) * 180 / np.pi # [n_squares, 4] # get square score overlap_scores = [] degree_scores = [] length_scores = [] for connects, segments, square, degree in zip(connect_array, segments_array, squares, squares_degree): ''' 0 -- 1 | | 3 -- 2 # segments: [4, 2] # connects: [4] ''' ###################################### OVERLAP SCORES cover = 0 perimeter = 0 # check 0 > 1 > 2 > 3 square_length = [] for start_idx in range(4): end_idx = (start_idx + 1) % 4 connect_idx = connects[start_idx] # segment idx of segment01 start_segments = segments[start_idx] end_segments = segments[end_idx] start_point = square[start_idx] end_point = square[end_idx] # check whether outside or inside start_position, start_min, start_cover_param, start_peri_param = check_outside_inside(start_segments, connect_idx) end_position, end_min, end_cover_param, end_peri_param = check_outside_inside(end_segments, connect_idx) cover += dist_segments[connect_idx] + start_cover_param * start_min + end_cover_param * end_min perimeter += dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min square_length.append( dist_segments[connect_idx] + start_peri_param * start_min + end_peri_param * end_min) overlap_scores.append(cover / perimeter) ###################################### ###################################### DEGREE SCORES ''' deg0 vs deg2 deg1 vs deg3 ''' deg0, deg1, deg2, deg3 = degree deg_ratio1 = deg0 / deg2 if deg_ratio1 > 1.0: deg_ratio1 = 1 / deg_ratio1 deg_ratio2 = deg1 / deg3 if deg_ratio2 > 1.0: deg_ratio2 = 1 / deg_ratio2 degree_scores.append((deg_ratio1 + deg_ratio2) / 2) ###################################### ###################################### LENGTH SCORES ''' len0 vs len2 len1 vs len3 ''' len0, len1, len2, len3 = square_length len_ratio1 = len0 / len2 if len2 > len0 else len2 / len0 len_ratio2 = len1 / len3 if len3 > len1 else len3 / len1 length_scores.append((len_ratio1 + len_ratio2) / 2) ###################################### overlap_scores = np.array(overlap_scores) overlap_scores /= np.max(overlap_scores) degree_scores = np.array(degree_scores) # degree_scores /= np.max(degree_scores) length_scores = np.array(length_scores) ###################################### AREA SCORES area_scores = np.reshape(squares, [-1, 4, 2]) area_x = area_scores[:, :, 0] area_y = area_scores[:, :, 1] correction = area_x[:, -1] * area_y[:, 0] - area_y[:, -1] * area_x[:, 0] area_scores = np.sum(area_x[:, :-1] * area_y[:, 1:], axis=-1) - np.sum(area_y[:, :-1] * area_x[:, 1:], axis=-1) area_scores = 0.5 * np.abs(area_scores + correction) area_scores /= (map_size * map_size) # np.max(area_scores) ###################################### ###################################### CENTER SCORES centers = np.array([[256 // 2, 256 // 2]], dtype='float32') # [1, 2] # squares: [n, 4, 2] square_centers = np.mean(squares, axis=1) # [n, 2] center2center = np.sqrt(np.sum((centers - square_centers) ** 2)) center_scores = center2center / (map_size / np.sqrt(2.0)) ''' score_w = [overlap, degree, area, center, length] ''' score_w = [0.0, 1.0, 10.0, 0.5, 1.0] score_array = params['w_overlap'] * overlap_scores \ + params['w_degree'] * degree_scores \ + params['w_area'] * area_scores \ - params['w_center'] * center_scores \ + params['w_length'] * length_scores best_square = [] sorted_idx = np.argsort(score_array)[::-1] score_array = score_array[sorted_idx] squares = squares[sorted_idx] except Exception as e: pass '''return list merged_lines, squares, scores ''' try: new_segments[:, 0] = new_segments[:, 0] * 2 / input_shape[1] * original_shape[1] new_segments[:, 1] = new_segments[:, 1] * 2 / input_shape[0] * original_shape[0] new_segments[:, 2] = new_segments[:, 2] * 2 / input_shape[1] * original_shape[1] new_segments[:, 3] = new_segments[:, 3] * 2 / input_shape[0] * original_shape[0] except: new_segments = [] try: squares[:, :, 0] = squares[:, :, 0] * 2 / input_shape[1] * original_shape[1] squares[:, :, 1] = squares[:, :, 1] * 2 / input_shape[0] * original_shape[0] except: squares = [] score_array = [] try: inter_points = np.array(inter_points) inter_points[:, 0] = inter_points[:, 0] * 2 / input_shape[1] * original_shape[1] inter_points[:, 1] = inter_points[:, 1] * 2 / input_shape[0] * original_shape[0] except: inter_points = [] return new_segments, squares, score_array, inter_points ================================================ FILE: annotator/mmpkg/mmcv/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # flake8: noqa from .arraymisc import * from .fileio import * from .image import * from .utils import * from .version import * from .video import * from .visualization import * # The following modules are not imported to this level, so mmcv may be used # without PyTorch. # - runner # - parallel # - op ================================================ FILE: annotator/mmpkg/mmcv/arraymisc/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .quantization import dequantize, quantize __all__ = ['quantize', 'dequantize'] ================================================ FILE: annotator/mmpkg/mmcv/arraymisc/quantization.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np def quantize(arr, min_val, max_val, levels, dtype=np.int64): """Quantize an array of (-inf, inf) to [0, levels-1]. Args: arr (ndarray): Input array. min_val (scalar): Minimum value to be clipped. max_val (scalar): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the quantized array. Returns: tuple: Quantized array. """ if not (isinstance(levels, int) and levels > 1): raise ValueError( f'levels must be a positive integer, but got {levels}') if min_val >= max_val: raise ValueError( f'min_val ({min_val}) must be smaller than max_val ({max_val})') arr = np.clip(arr, min_val, max_val) - min_val quantized_arr = np.minimum( np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1) return quantized_arr def dequantize(arr, min_val, max_val, levels, dtype=np.float64): """Dequantize an array. Args: arr (ndarray): Input array. min_val (scalar): Minimum value to be clipped. max_val (scalar): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the dequantized array. Returns: tuple: Dequantized array. """ if not (isinstance(levels, int) and levels > 1): raise ValueError( f'levels must be a positive integer, but got {levels}') if min_val >= max_val: raise ValueError( f'min_val ({min_val}) must be smaller than max_val ({max_val})') dequantized_arr = (arr + 0.5).astype(dtype) * (max_val - min_val) / levels + min_val return dequantized_arr ================================================ FILE: annotator/mmpkg/mmcv/cnn/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .alexnet import AlexNet # yapf: disable from .bricks import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS, ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, ConvTranspose2d, ConvTranspose3d, ConvWS2d, DepthwiseSeparableConvModule, GeneralizedAttention, HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d, NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish, build_activation_layer, build_conv_layer, build_norm_layer, build_padding_layer, build_plugin_layer, build_upsample_layer, conv_ws_2d, is_norm) from .builder import MODELS, build_model_from_cfg # yapf: enable from .resnet import ResNet, make_res_layer from .utils import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit, NormalInit, PretrainedInit, TruncNormalInit, UniformInit, XavierInit, bias_init_with_prob, caffe2_xavier_init, constant_init, fuse_conv_bn, get_model_complexity_info, initialize, kaiming_init, normal_init, trunc_normal_init, uniform_init, xavier_init) from .vgg import VGG, make_vgg_layer __all__ = [ 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer', 'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init', 'uniform_init', 'kaiming_init', 'caffe2_xavier_init', 'bias_init_with_prob', 'ConvModule', 'build_activation_layer', 'build_conv_layer', 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', 'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'get_model_complexity_info', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d', 'fuse_conv_bn', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', 'Caffe2XavierInit', 'MODELS', 'build_model_from_cfg' ] ================================================ FILE: annotator/mmpkg/mmcv/cnn/alexnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import logging import torch.nn as nn class AlexNet(nn.Module): """AlexNet backbone. Args: num_classes (int): number of classes for classification. """ def __init__(self, num_classes=-1): super(AlexNet, self).__init__() self.num_classes = num_classes self.features = nn.Sequential( nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=3, stride=2), nn.Conv2d(64, 192, kernel_size=5, padding=2), nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=3, stride=2), nn.Conv2d(192, 384, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.Conv2d(256, 256, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=3, stride=2), ) if self.num_classes > 0: self.classifier = nn.Sequential( nn.Dropout(), nn.Linear(256 * 6 * 6, 4096), nn.ReLU(inplace=True), nn.Dropout(), nn.Linear(4096, 4096), nn.ReLU(inplace=True), nn.Linear(4096, num_classes), ) def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: # use default initializer pass else: raise TypeError('pretrained must be a str or None') def forward(self, x): x = self.features(x) if self.num_classes > 0: x = x.view(x.size(0), 256 * 6 * 6) x = self.classifier(x) return x ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .activation import build_activation_layer from .context_block import ContextBlock from .conv import build_conv_layer from .conv2d_adaptive_padding import Conv2dAdaptivePadding from .conv_module import ConvModule from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d from .depthwise_separable_conv_module import DepthwiseSeparableConvModule from .drop import Dropout, DropPath from .generalized_attention import GeneralizedAttention from .hsigmoid import HSigmoid from .hswish import HSwish from .non_local import NonLocal1d, NonLocal2d, NonLocal3d from .norm import build_norm_layer, is_norm from .padding import build_padding_layer from .plugin import build_plugin_layer from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS) from .scale import Scale from .swish import Swish from .upsample import build_upsample_layer from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d, Linear, MaxPool2d, MaxPool3d) __all__ = [ 'ConvModule', 'build_activation_layer', 'build_conv_layer', 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d', 'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear', 'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath' ] ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/activation.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.utils import TORCH_VERSION, build_from_cfg, digit_version from .registry import ACTIVATION_LAYERS for module in [ nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU, nn.Sigmoid, nn.Tanh ]: ACTIVATION_LAYERS.register_module(module=module) @ACTIVATION_LAYERS.register_module(name='Clip') @ACTIVATION_LAYERS.register_module() class Clamp(nn.Module): """Clamp activation layer. This activation function is to clamp the feature map value within :math:`[min, max]`. More details can be found in ``torch.clamp()``. Args: min (Number | optional): Lower-bound of the range to be clamped to. Default to -1. max (Number | optional): Upper-bound of the range to be clamped to. Default to 1. """ def __init__(self, min=-1., max=1.): super(Clamp, self).__init__() self.min = min self.max = max def forward(self, x): """Forward function. Args: x (torch.Tensor): The input tensor. Returns: torch.Tensor: Clamped tensor. """ return torch.clamp(x, min=self.min, max=self.max) class GELU(nn.Module): r"""Applies the Gaussian Error Linear Units function: .. math:: \text{GELU}(x) = x * \Phi(x) where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. Shape: - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - Output: :math:`(N, *)`, same shape as the input .. image:: scripts/activation_images/GELU.png Examples:: >>> m = nn.GELU() >>> input = torch.randn(2) >>> output = m(input) """ def forward(self, input): return F.gelu(input) if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.4')): ACTIVATION_LAYERS.register_module(module=GELU) else: ACTIVATION_LAYERS.register_module(module=nn.GELU) def build_activation_layer(cfg): """Build activation layer. Args: cfg (dict): The activation layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate an activation layer. Returns: nn.Module: Created activation layer. """ return build_from_cfg(cfg, ACTIVATION_LAYERS) ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/context_block.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn from ..utils import constant_init, kaiming_init from .registry import PLUGIN_LAYERS def last_zero_init(m): if isinstance(m, nn.Sequential): constant_init(m[-1], val=0) else: constant_init(m, val=0) @PLUGIN_LAYERS.register_module() class ContextBlock(nn.Module): """ContextBlock module in GCNet. See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' (https://arxiv.org/abs/1904.11492) for details. Args: in_channels (int): Channels of the input feature map. ratio (float): Ratio of channels of transform bottleneck pooling_type (str): Pooling method for context modeling. Options are 'att' and 'avg', stand for attention pooling and average pooling respectively. Default: 'att'. fusion_types (Sequence[str]): Fusion method for feature fusion, Options are 'channels_add', 'channel_mul', stand for channelwise addition and multiplication respectively. Default: ('channel_add',) """ _abbr_ = 'context_block' def __init__(self, in_channels, ratio, pooling_type='att', fusion_types=('channel_add', )): super(ContextBlock, self).__init__() assert pooling_type in ['avg', 'att'] assert isinstance(fusion_types, (list, tuple)) valid_fusion_types = ['channel_add', 'channel_mul'] assert all([f in valid_fusion_types for f in fusion_types]) assert len(fusion_types) > 0, 'at least one fusion should be used' self.in_channels = in_channels self.ratio = ratio self.planes = int(in_channels * ratio) self.pooling_type = pooling_type self.fusion_types = fusion_types if pooling_type == 'att': self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1) self.softmax = nn.Softmax(dim=2) else: self.avg_pool = nn.AdaptiveAvgPool2d(1) if 'channel_add' in fusion_types: self.channel_add_conv = nn.Sequential( nn.Conv2d(self.in_channels, self.planes, kernel_size=1), nn.LayerNorm([self.planes, 1, 1]), nn.ReLU(inplace=True), # yapf: disable nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) else: self.channel_add_conv = None if 'channel_mul' in fusion_types: self.channel_mul_conv = nn.Sequential( nn.Conv2d(self.in_channels, self.planes, kernel_size=1), nn.LayerNorm([self.planes, 1, 1]), nn.ReLU(inplace=True), # yapf: disable nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) else: self.channel_mul_conv = None self.reset_parameters() def reset_parameters(self): if self.pooling_type == 'att': kaiming_init(self.conv_mask, mode='fan_in') self.conv_mask.inited = True if self.channel_add_conv is not None: last_zero_init(self.channel_add_conv) if self.channel_mul_conv is not None: last_zero_init(self.channel_mul_conv) def spatial_pool(self, x): batch, channel, height, width = x.size() if self.pooling_type == 'att': input_x = x # [N, C, H * W] input_x = input_x.view(batch, channel, height * width) # [N, 1, C, H * W] input_x = input_x.unsqueeze(1) # [N, 1, H, W] context_mask = self.conv_mask(x) # [N, 1, H * W] context_mask = context_mask.view(batch, 1, height * width) # [N, 1, H * W] context_mask = self.softmax(context_mask) # [N, 1, H * W, 1] context_mask = context_mask.unsqueeze(-1) # [N, 1, C, 1] context = torch.matmul(input_x, context_mask) # [N, C, 1, 1] context = context.view(batch, channel, 1, 1) else: # [N, C, 1, 1] context = self.avg_pool(x) return context def forward(self, x): # [N, C, 1, 1] context = self.spatial_pool(x) out = x if self.channel_mul_conv is not None: # [N, C, 1, 1] channel_mul_term = torch.sigmoid(self.channel_mul_conv(context)) out = out * channel_mul_term if self.channel_add_conv is not None: # [N, C, 1, 1] channel_add_term = self.channel_add_conv(context) out = out + channel_add_term return out ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/conv.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from torch import nn from .registry import CONV_LAYERS CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d) CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d) CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d) CONV_LAYERS.register_module('Conv', module=nn.Conv2d) def build_conv_layer(cfg, *args, **kwargs): """Build convolution layer. Args: cfg (None or dict): The conv layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate an conv layer. args (argument list): Arguments passed to the `__init__` method of the corresponding conv layer. kwargs (keyword arguments): Keyword arguments passed to the `__init__` method of the corresponding conv layer. Returns: nn.Module: Created conv layer. """ if cfg is None: cfg_ = dict(type='Conv2d') else: if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') if 'type' not in cfg: raise KeyError('the cfg dict must contain the key "type"') cfg_ = cfg.copy() layer_type = cfg_.pop('type') if layer_type not in CONV_LAYERS: raise KeyError(f'Unrecognized norm type {layer_type}') else: conv_layer = CONV_LAYERS.get(layer_type) layer = conv_layer(*args, **kwargs, **cfg_) return layer ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/conv2d_adaptive_padding.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math from torch import nn from torch.nn import functional as F from .registry import CONV_LAYERS @CONV_LAYERS.register_module() class Conv2dAdaptivePadding(nn.Conv2d): """Implementation of 2D convolution in tensorflow with `padding` as "same", which applies padding to input (if needed) so that input image gets fully covered by filter and stride you specified. For stride 1, this will ensure that output image size is same as input. For stride of 2, output dimensions will be half, for example. Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution kernel_size (int or tuple): Size of the convolving kernel stride (int or tuple, optional): Stride of the convolution. Default: 1 padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True): super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) def forward(self, x): img_h, img_w = x.size()[-2:] kernel_h, kernel_w = self.weight.size()[-2:] stride_h, stride_w = self.stride output_h = math.ceil(img_h / stride_h) output_w = math.ceil(img_w / stride_w) pad_h = ( max((output_h - 1) * self.stride[0] + (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0)) pad_w = ( max((output_w - 1) * self.stride[1] + (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0)) if pad_h > 0 or pad_w > 0: x = F.pad(x, [ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 ]) return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/conv_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import torch.nn as nn from annotator.mmpkg.mmcv.utils import _BatchNorm, _InstanceNorm from ..utils import constant_init, kaiming_init from .activation import build_activation_layer from .conv import build_conv_layer from .norm import build_norm_layer from .padding import build_padding_layer from .registry import PLUGIN_LAYERS @PLUGIN_LAYERS.register_module() class ConvModule(nn.Module): """A conv block that bundles conv/norm/activation layers. This block simplifies the usage of convolution layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). It is based upon three build methods: `build_conv_layer()`, `build_norm_layer()` and `build_activation_layer()`. Besides, we add some additional features in this module. 1. Automatically set `bias` of the conv layer. 2. Spectral norm is supported. 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only supports zero and circular padding, and we add "reflect" padding mode. Args: in_channels (int): Number of channels in the input feature map. Same as that in ``nn._ConvNd``. out_channels (int): Number of channels produced by the convolution. Same as that in ``nn._ConvNd``. kernel_size (int | tuple[int]): Size of the convolving kernel. Same as that in ``nn._ConvNd``. stride (int | tuple[int]): Stride of the convolution. Same as that in ``nn._ConvNd``. padding (int | tuple[int]): Zero-padding added to both sides of the input. Same as that in ``nn._ConvNd``. dilation (int | tuple[int]): Spacing between kernel elements. Same as that in ``nn._ConvNd``. groups (int): Number of blocked connections from input channels to output channels. Same as that in ``nn._ConvNd``. bias (bool | str): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise False. Default: "auto". conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU'). inplace (bool): Whether to use inplace mode for activation. Default: True. with_spectral_norm (bool): Whether use spectral norm in conv module. Default: False. padding_mode (str): If the `padding_mode` has not been supported by current `Conv2d` in PyTorch, we will use our own padding layer instead. Currently, we support ['zeros', 'circular'] with official implementation and ['reflect'] with our own implementation. Default: 'zeros'. order (tuple[str]): The order of conv/norm/activation layers. It is a sequence of "conv", "norm" and "act". Common examples are ("conv", "norm", "act") and ("act", "conv", "norm"). Default: ('conv', 'norm', 'act'). """ _abbr_ = 'conv_block' def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias='auto', conv_cfg=None, norm_cfg=None, act_cfg=dict(type='ReLU'), inplace=True, with_spectral_norm=False, padding_mode='zeros', order=('conv', 'norm', 'act')): super(ConvModule, self).__init__() assert conv_cfg is None or isinstance(conv_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict) assert act_cfg is None or isinstance(act_cfg, dict) official_padding_mode = ['zeros', 'circular'] self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.inplace = inplace self.with_spectral_norm = with_spectral_norm self.with_explicit_padding = padding_mode not in official_padding_mode self.order = order assert isinstance(self.order, tuple) and len(self.order) == 3 assert set(order) == set(['conv', 'norm', 'act']) self.with_norm = norm_cfg is not None self.with_activation = act_cfg is not None # if the conv layer is before a norm layer, bias is unnecessary. if bias == 'auto': bias = not self.with_norm self.with_bias = bias if self.with_explicit_padding: pad_cfg = dict(type=padding_mode) self.padding_layer = build_padding_layer(pad_cfg, padding) # reset padding to 0 for conv module conv_padding = 0 if self.with_explicit_padding else padding # build convolution layer self.conv = build_conv_layer( conv_cfg, in_channels, out_channels, kernel_size, stride=stride, padding=conv_padding, dilation=dilation, groups=groups, bias=bias) # export the attributes of self.conv to a higher level for convenience self.in_channels = self.conv.in_channels self.out_channels = self.conv.out_channels self.kernel_size = self.conv.kernel_size self.stride = self.conv.stride self.padding = padding self.dilation = self.conv.dilation self.transposed = self.conv.transposed self.output_padding = self.conv.output_padding self.groups = self.conv.groups if self.with_spectral_norm: self.conv = nn.utils.spectral_norm(self.conv) # build normalization layers if self.with_norm: # norm layer is after conv layer if order.index('norm') > order.index('conv'): norm_channels = out_channels else: norm_channels = in_channels self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) self.add_module(self.norm_name, norm) if self.with_bias: if isinstance(norm, (_BatchNorm, _InstanceNorm)): warnings.warn( 'Unnecessary conv bias before batch/instance norm') else: self.norm_name = None # build activation layer if self.with_activation: act_cfg_ = act_cfg.copy() # nn.Tanh has no 'inplace' argument if act_cfg_['type'] not in [ 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' ]: act_cfg_.setdefault('inplace', inplace) self.activate = build_activation_layer(act_cfg_) # Use msra init by default self.init_weights() @property def norm(self): if self.norm_name: return getattr(self, self.norm_name) else: return None def init_weights(self): # 1. It is mainly for customized conv layers with their own # initialization manners by calling their own ``init_weights()``, # and we do not want ConvModule to override the initialization. # 2. For customized conv layers without their own initialization # manners (that is, they don't have their own ``init_weights()``) # and PyTorch's conv layers, they will be initialized by # this method with default ``kaiming_init``. # Note: For PyTorch's conv layers, they will be overwritten by our # initialization implementation using default ``kaiming_init``. if not hasattr(self.conv, 'init_weights'): if self.with_activation and self.act_cfg['type'] == 'LeakyReLU': nonlinearity = 'leaky_relu' a = self.act_cfg.get('negative_slope', 0.01) else: nonlinearity = 'relu' a = 0 kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) if self.with_norm: constant_init(self.norm, 1, bias=0) def forward(self, x, activate=True, norm=True): for layer in self.order: if layer == 'conv': if self.with_explicit_padding: x = self.padding_layer(x) x = self.conv(x) elif layer == 'norm' and norm and self.with_norm: x = self.norm(x) elif layer == 'act' and activate and self.with_activation: x = self.activate(x) return x ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/conv_ws.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from .registry import CONV_LAYERS def conv_ws_2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, eps=1e-5): c_in = weight.size(0) weight_flat = weight.view(c_in, -1) mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) weight = (weight - mean) / (std + eps) return F.conv2d(input, weight, bias, stride, padding, dilation, groups) @CONV_LAYERS.register_module('ConvWS') class ConvWS2d(nn.Conv2d): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, eps=1e-5): super(ConvWS2d, self).__init__( in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) self.eps = eps def forward(self, x): return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.eps) @CONV_LAYERS.register_module(name='ConvAWS') class ConvAWS2d(nn.Conv2d): """AWS (Adaptive Weight Standardization) This is a variant of Weight Standardization (https://arxiv.org/pdf/1903.10520.pdf) It is used in DetectoRS to avoid NaN (https://arxiv.org/pdf/2006.02334.pdf) Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution kernel_size (int or tuple): Size of the conv kernel stride (int or tuple, optional): Stride of the convolution. Default: 1 padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 bias (bool, optional): If set True, adds a learnable bias to the output. Default: True """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True): super().__init__( in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) self.register_buffer('weight_gamma', torch.ones(self.out_channels, 1, 1, 1)) self.register_buffer('weight_beta', torch.zeros(self.out_channels, 1, 1, 1)) def _get_weight(self, weight): weight_flat = weight.view(weight.size(0), -1) mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) weight = (weight - mean) / std weight = self.weight_gamma * weight + self.weight_beta return weight def forward(self, x): weight = self._get_weight(self.weight) return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): """Override default load function. AWS overrides the function _load_from_state_dict to recover weight_gamma and weight_beta if they are missing. If weight_gamma and weight_beta are found in the checkpoint, this function will return after super()._load_from_state_dict. Otherwise, it will compute the mean and std of the pretrained weights and store them in weight_beta and weight_gamma. """ self.weight_gamma.data.fill_(-1) local_missing_keys = [] super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, local_missing_keys, unexpected_keys, error_msgs) if self.weight_gamma.data.mean() > 0: for k in local_missing_keys: missing_keys.append(k) return weight = self.weight.data weight_flat = weight.view(weight.size(0), -1) mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) self.weight_beta.data.copy_(mean) self.weight_gamma.data.copy_(std) missing_gamma_beta = [ k for k in local_missing_keys if k.endswith('weight_gamma') or k.endswith('weight_beta') ] for k in missing_gamma_beta: local_missing_keys.remove(k) for k in local_missing_keys: missing_keys.append(k) ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/depthwise_separable_conv_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn from .conv_module import ConvModule class DepthwiseSeparableConvModule(nn.Module): """Depthwise separable convolution module. See https://arxiv.org/pdf/1704.04861.pdf for details. This module can replace a ConvModule with the conv block replaced by two conv block: depthwise conv block and pointwise conv block. The depthwise conv block contains depthwise-conv/norm/activation layers. The pointwise conv block contains pointwise-conv/norm/activation layers. It should be noted that there will be norm/activation layer in the depthwise conv block if `norm_cfg` and `act_cfg` are specified. Args: in_channels (int): Number of channels in the input feature map. Same as that in ``nn._ConvNd``. out_channels (int): Number of channels produced by the convolution. Same as that in ``nn._ConvNd``. kernel_size (int | tuple[int]): Size of the convolving kernel. Same as that in ``nn._ConvNd``. stride (int | tuple[int]): Stride of the convolution. Same as that in ``nn._ConvNd``. Default: 1. padding (int | tuple[int]): Zero-padding added to both sides of the input. Same as that in ``nn._ConvNd``. Default: 0. dilation (int | tuple[int]): Spacing between kernel elements. Same as that in ``nn._ConvNd``. Default: 1. norm_cfg (dict): Default norm config for both depthwise ConvModule and pointwise ConvModule. Default: None. act_cfg (dict): Default activation config for both depthwise ConvModule and pointwise ConvModule. Default: dict(type='ReLU'). dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is 'default', it will be the same as `norm_cfg`. Default: 'default'. dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is 'default', it will be the same as `act_cfg`. Default: 'default'. pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is 'default', it will be the same as `norm_cfg`. Default: 'default'. pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is 'default', it will be the same as `act_cfg`. Default: 'default'. kwargs (optional): Other shared arguments for depthwise and pointwise ConvModule. See ConvModule for ref. """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, norm_cfg=None, act_cfg=dict(type='ReLU'), dw_norm_cfg='default', dw_act_cfg='default', pw_norm_cfg='default', pw_act_cfg='default', **kwargs): super(DepthwiseSeparableConvModule, self).__init__() assert 'groups' not in kwargs, 'groups should not be specified' # if norm/activation config of depthwise/pointwise ConvModule is not # specified, use default config. dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg # depthwise convolution self.depthwise_conv = ConvModule( in_channels, in_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=in_channels, norm_cfg=dw_norm_cfg, act_cfg=dw_act_cfg, **kwargs) self.pointwise_conv = ConvModule( in_channels, out_channels, 1, norm_cfg=pw_norm_cfg, act_cfg=pw_act_cfg, **kwargs) def forward(self, x): x = self.depthwise_conv(x) x = self.pointwise_conv(x) return x ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/drop.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn from annotator.mmpkg.mmcv import build_from_cfg from .registry import DROPOUT_LAYERS def drop_path(x, drop_prob=0., training=False): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). We follow the implementation https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 """ if drop_prob == 0. or not training: return x keep_prob = 1 - drop_prob # handle tensors with different dimensions, not just 4D tensors. shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) random_tensor = keep_prob + torch.rand( shape, dtype=x.dtype, device=x.device) output = x.div(keep_prob) * random_tensor.floor() return output @DROPOUT_LAYERS.register_module() class DropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). We follow the implementation https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 Args: drop_prob (float): Probability of the path to be zeroed. Default: 0.1 """ def __init__(self, drop_prob=0.1): super(DropPath, self).__init__() self.drop_prob = drop_prob def forward(self, x): return drop_path(x, self.drop_prob, self.training) @DROPOUT_LAYERS.register_module() class Dropout(nn.Dropout): """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with ``DropPath`` Args: drop_prob (float): Probability of the elements to be zeroed. Default: 0.5. inplace (bool): Do the operation inplace or not. Default: False. """ def __init__(self, drop_prob=0.5, inplace=False): super().__init__(p=drop_prob, inplace=inplace) def build_dropout(cfg, default_args=None): """Builder for drop out layers.""" return build_from_cfg(cfg, DROPOUT_LAYERS, default_args) ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/generalized_attention.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from ..utils import kaiming_init from .registry import PLUGIN_LAYERS @PLUGIN_LAYERS.register_module() class GeneralizedAttention(nn.Module): """GeneralizedAttention module. See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' (https://arxiv.org/abs/1711.07971) for details. Args: in_channels (int): Channels of the input feature map. spatial_range (int): The spatial range. -1 indicates no spatial range constraint. Default: -1. num_heads (int): The head number of empirical_attention module. Default: 9. position_embedding_dim (int): The position embedding dimension. Default: -1. position_magnitude (int): A multiplier acting on coord difference. Default: 1. kv_stride (int): The feature stride acting on key/value feature map. Default: 2. q_stride (int): The feature stride acting on query feature map. Default: 1. attention_type (str): A binary indicator string for indicating which items in generalized empirical_attention module are used. Default: '1111'. - '1000' indicates 'query and key content' (appr - appr) item, - '0100' indicates 'query content and relative position' (appr - position) item, - '0010' indicates 'key content only' (bias - appr) item, - '0001' indicates 'relative position only' (bias - position) item. """ _abbr_ = 'gen_attention_block' def __init__(self, in_channels, spatial_range=-1, num_heads=9, position_embedding_dim=-1, position_magnitude=1, kv_stride=2, q_stride=1, attention_type='1111'): super(GeneralizedAttention, self).__init__() # hard range means local range for non-local operation self.position_embedding_dim = ( position_embedding_dim if position_embedding_dim > 0 else in_channels) self.position_magnitude = position_magnitude self.num_heads = num_heads self.in_channels = in_channels self.spatial_range = spatial_range self.kv_stride = kv_stride self.q_stride = q_stride self.attention_type = [bool(int(_)) for _ in attention_type] self.qk_embed_dim = in_channels // num_heads out_c = self.qk_embed_dim * num_heads if self.attention_type[0] or self.attention_type[1]: self.query_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_c, kernel_size=1, bias=False) self.query_conv.kaiming_init = True if self.attention_type[0] or self.attention_type[2]: self.key_conv = nn.Conv2d( in_channels=in_channels, out_channels=out_c, kernel_size=1, bias=False) self.key_conv.kaiming_init = True self.v_dim = in_channels // num_heads self.value_conv = nn.Conv2d( in_channels=in_channels, out_channels=self.v_dim * num_heads, kernel_size=1, bias=False) self.value_conv.kaiming_init = True if self.attention_type[1] or self.attention_type[3]: self.appr_geom_fc_x = nn.Linear( self.position_embedding_dim // 2, out_c, bias=False) self.appr_geom_fc_x.kaiming_init = True self.appr_geom_fc_y = nn.Linear( self.position_embedding_dim // 2, out_c, bias=False) self.appr_geom_fc_y.kaiming_init = True if self.attention_type[2]: stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv self.appr_bias = nn.Parameter(appr_bias_value) if self.attention_type[3]: stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv self.geom_bias = nn.Parameter(geom_bias_value) self.proj_conv = nn.Conv2d( in_channels=self.v_dim * num_heads, out_channels=in_channels, kernel_size=1, bias=True) self.proj_conv.kaiming_init = True self.gamma = nn.Parameter(torch.zeros(1)) if self.spatial_range >= 0: # only works when non local is after 3*3 conv if in_channels == 256: max_len = 84 elif in_channels == 512: max_len = 42 max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) local_constraint_map = np.ones( (max_len, max_len, max_len_kv, max_len_kv), dtype=np.int) for iy in range(max_len): for ix in range(max_len): local_constraint_map[ iy, ix, max((iy - self.spatial_range) // self.kv_stride, 0):min((iy + self.spatial_range + 1) // self.kv_stride + 1, max_len), max((ix - self.spatial_range) // self.kv_stride, 0):min((ix + self.spatial_range + 1) // self.kv_stride + 1, max_len)] = 0 self.local_constraint_map = nn.Parameter( torch.from_numpy(local_constraint_map).byte(), requires_grad=False) if self.q_stride > 1: self.q_downsample = nn.AvgPool2d( kernel_size=1, stride=self.q_stride) else: self.q_downsample = None if self.kv_stride > 1: self.kv_downsample = nn.AvgPool2d( kernel_size=1, stride=self.kv_stride) else: self.kv_downsample = None self.init_weights() def get_position_embedding(self, h, w, h_kv, w_kv, q_stride, kv_stride, device, dtype, feat_dim, wave_length=1000): # the default type of Tensor is float32, leading to type mismatch # in fp16 mode. Cast it to support fp16 mode. h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype) h_idxs = h_idxs.view((h, 1)) * q_stride w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype) w_idxs = w_idxs.view((w, 1)) * q_stride h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to( device=device, dtype=dtype) h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to( device=device, dtype=dtype) w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride # (h, h_kv, 1) h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0) h_diff *= self.position_magnitude # (w, w_kv, 1) w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0) w_diff *= self.position_magnitude feat_range = torch.arange(0, feat_dim / 4).to( device=device, dtype=dtype) dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype) dim_mat = dim_mat**((4. / feat_dim) * feat_range) dim_mat = dim_mat.view((1, 1, -1)) embedding_x = torch.cat( ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2) embedding_y = torch.cat( ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2) return embedding_x, embedding_y def forward(self, x_input): num_heads = self.num_heads # use empirical_attention if self.q_downsample is not None: x_q = self.q_downsample(x_input) else: x_q = x_input n, _, h, w = x_q.shape if self.kv_downsample is not None: x_kv = self.kv_downsample(x_input) else: x_kv = x_input _, _, h_kv, w_kv = x_kv.shape if self.attention_type[0] or self.attention_type[1]: proj_query = self.query_conv(x_q).view( (n, num_heads, self.qk_embed_dim, h * w)) proj_query = proj_query.permute(0, 1, 3, 2) if self.attention_type[0] or self.attention_type[2]: proj_key = self.key_conv(x_kv).view( (n, num_heads, self.qk_embed_dim, h_kv * w_kv)) if self.attention_type[1] or self.attention_type[3]: position_embed_x, position_embed_y = self.get_position_embedding( h, w, h_kv, w_kv, self.q_stride, self.kv_stride, x_input.device, x_input.dtype, self.position_embedding_dim) # (n, num_heads, w, w_kv, dim) position_feat_x = self.appr_geom_fc_x(position_embed_x).\ view(1, w, w_kv, num_heads, self.qk_embed_dim).\ permute(0, 3, 1, 2, 4).\ repeat(n, 1, 1, 1, 1) # (n, num_heads, h, h_kv, dim) position_feat_y = self.appr_geom_fc_y(position_embed_y).\ view(1, h, h_kv, num_heads, self.qk_embed_dim).\ permute(0, 3, 1, 2, 4).\ repeat(n, 1, 1, 1, 1) position_feat_x /= math.sqrt(2) position_feat_y /= math.sqrt(2) # accelerate for saliency only if (np.sum(self.attention_type) == 1) and self.attention_type[2]: appr_bias = self.appr_bias.\ view(1, num_heads, 1, self.qk_embed_dim).\ repeat(n, 1, 1, 1) energy = torch.matmul(appr_bias, proj_key).\ view(n, num_heads, 1, h_kv * w_kv) h = 1 w = 1 else: # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for if not self.attention_type[0]: energy = torch.zeros( n, num_heads, h, w, h_kv, w_kv, dtype=x_input.dtype, device=x_input.device) # attention_type[0]: appr - appr # attention_type[1]: appr - position # attention_type[2]: bias - appr # attention_type[3]: bias - position if self.attention_type[0] or self.attention_type[2]: if self.attention_type[0] and self.attention_type[2]: appr_bias = self.appr_bias.\ view(1, num_heads, 1, self.qk_embed_dim) energy = torch.matmul(proj_query + appr_bias, proj_key).\ view(n, num_heads, h, w, h_kv, w_kv) elif self.attention_type[0]: energy = torch.matmul(proj_query, proj_key).\ view(n, num_heads, h, w, h_kv, w_kv) elif self.attention_type[2]: appr_bias = self.appr_bias.\ view(1, num_heads, 1, self.qk_embed_dim).\ repeat(n, 1, 1, 1) energy += torch.matmul(appr_bias, proj_key).\ view(n, num_heads, 1, 1, h_kv, w_kv) if self.attention_type[1] or self.attention_type[3]: if self.attention_type[1] and self.attention_type[3]: geom_bias = self.geom_bias.\ view(1, num_heads, 1, self.qk_embed_dim) proj_query_reshape = (proj_query + geom_bias).\ view(n, num_heads, h, w, self.qk_embed_dim) energy_x = torch.matmul( proj_query_reshape.permute(0, 1, 3, 2, 4), position_feat_x.permute(0, 1, 2, 4, 3)) energy_x = energy_x.\ permute(0, 1, 3, 2, 4).unsqueeze(4) energy_y = torch.matmul( proj_query_reshape, position_feat_y.permute(0, 1, 2, 4, 3)) energy_y = energy_y.unsqueeze(5) energy += energy_x + energy_y elif self.attention_type[1]: proj_query_reshape = proj_query.\ view(n, num_heads, h, w, self.qk_embed_dim) proj_query_reshape = proj_query_reshape.\ permute(0, 1, 3, 2, 4) position_feat_x_reshape = position_feat_x.\ permute(0, 1, 2, 4, 3) position_feat_y_reshape = position_feat_y.\ permute(0, 1, 2, 4, 3) energy_x = torch.matmul(proj_query_reshape, position_feat_x_reshape) energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4) energy_y = torch.matmul(proj_query_reshape, position_feat_y_reshape) energy_y = energy_y.unsqueeze(5) energy += energy_x + energy_y elif self.attention_type[3]: geom_bias = self.geom_bias.\ view(1, num_heads, self.qk_embed_dim, 1).\ repeat(n, 1, 1, 1) position_feat_x_reshape = position_feat_x.\ view(n, num_heads, w*w_kv, self.qk_embed_dim) position_feat_y_reshape = position_feat_y.\ view(n, num_heads, h * h_kv, self.qk_embed_dim) energy_x = torch.matmul(position_feat_x_reshape, geom_bias) energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv) energy_y = torch.matmul(position_feat_y_reshape, geom_bias) energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1) energy += energy_x + energy_y energy = energy.view(n, num_heads, h * w, h_kv * w_kv) if self.spatial_range >= 0: cur_local_constraint_map = \ self.local_constraint_map[:h, :w, :h_kv, :w_kv].\ contiguous().\ view(1, 1, h*w, h_kv*w_kv) energy = energy.masked_fill_(cur_local_constraint_map, float('-inf')) attention = F.softmax(energy, 3) proj_value = self.value_conv(x_kv) proj_value_reshape = proj_value.\ view((n, num_heads, self.v_dim, h_kv * w_kv)).\ permute(0, 1, 3, 2) out = torch.matmul(attention, proj_value_reshape).\ permute(0, 1, 3, 2).\ contiguous().\ view(n, self.v_dim * self.num_heads, h, w) out = self.proj_conv(out) # output is downsampled, upsample back to input size if self.q_downsample is not None: out = F.interpolate( out, size=x_input.shape[2:], mode='bilinear', align_corners=False) out = self.gamma * out + x_input return out def init_weights(self): for m in self.modules(): if hasattr(m, 'kaiming_init') and m.kaiming_init: kaiming_init( m, mode='fan_in', nonlinearity='leaky_relu', bias=0, distribution='uniform', a=1) ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/hsigmoid.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn from .registry import ACTIVATION_LAYERS @ACTIVATION_LAYERS.register_module() class HSigmoid(nn.Module): """Hard Sigmoid Module. Apply the hard sigmoid function: Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1) Args: bias (float): Bias of the input feature map. Default: 1.0. divisor (float): Divisor of the input feature map. Default: 2.0. min_value (float): Lower bound value. Default: 0.0. max_value (float): Upper bound value. Default: 1.0. Returns: Tensor: The output tensor. """ def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0): super(HSigmoid, self).__init__() self.bias = bias self.divisor = divisor assert self.divisor != 0 self.min_value = min_value self.max_value = max_value def forward(self, x): x = (x + self.bias) / self.divisor return x.clamp_(self.min_value, self.max_value) ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/hswish.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn from .registry import ACTIVATION_LAYERS @ACTIVATION_LAYERS.register_module() class HSwish(nn.Module): """Hard Swish Module. This module applies the hard swish function: .. math:: Hswish(x) = x * ReLU6(x + 3) / 6 Args: inplace (bool): can optionally do the operation in-place. Default: False. Returns: Tensor: The output tensor. """ def __init__(self, inplace=False): super(HSwish, self).__init__() self.act = nn.ReLU6(inplace) def forward(self, x): return x * self.act(x + 3) / 6 ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/non_local.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta import torch import torch.nn as nn from ..utils import constant_init, normal_init from .conv_module import ConvModule from .registry import PLUGIN_LAYERS class _NonLocalNd(nn.Module, metaclass=ABCMeta): """Basic Non-local module. This module is proposed in "Non-local Neural Networks" Paper reference: https://arxiv.org/abs/1711.07971 Code reference: https://github.com/AlexHex7/Non-local_pytorch Args: in_channels (int): Channels of the input feature map. reduction (int): Channel reduction ratio. Default: 2. use_scale (bool): Whether to scale pairwise_weight by `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. Default: True. conv_cfg (None | dict): The config dict for convolution layers. If not specified, it will use `nn.Conv2d` for convolution layers. Default: None. norm_cfg (None | dict): The config dict for normalization layers. Default: None. (This parameter is only applicable to conv_out.) mode (str): Options are `gaussian`, `concatenation`, `embedded_gaussian` and `dot_product`. Default: embedded_gaussian. """ def __init__(self, in_channels, reduction=2, use_scale=True, conv_cfg=None, norm_cfg=None, mode='embedded_gaussian', **kwargs): super(_NonLocalNd, self).__init__() self.in_channels = in_channels self.reduction = reduction self.use_scale = use_scale self.inter_channels = max(in_channels // reduction, 1) self.mode = mode if mode not in [ 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation' ]: raise ValueError("Mode should be in 'gaussian', 'concatenation', " f"'embedded_gaussian' or 'dot_product', but got " f'{mode} instead.') # g, theta, phi are defaulted as `nn.ConvNd`. # Here we use ConvModule for potential usage. self.g = ConvModule( self.in_channels, self.inter_channels, kernel_size=1, conv_cfg=conv_cfg, act_cfg=None) self.conv_out = ConvModule( self.inter_channels, self.in_channels, kernel_size=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None) if self.mode != 'gaussian': self.theta = ConvModule( self.in_channels, self.inter_channels, kernel_size=1, conv_cfg=conv_cfg, act_cfg=None) self.phi = ConvModule( self.in_channels, self.inter_channels, kernel_size=1, conv_cfg=conv_cfg, act_cfg=None) if self.mode == 'concatenation': self.concat_project = ConvModule( self.inter_channels * 2, 1, kernel_size=1, stride=1, padding=0, bias=False, act_cfg=dict(type='ReLU')) self.init_weights(**kwargs) def init_weights(self, std=0.01, zeros_init=True): if self.mode != 'gaussian': for m in [self.g, self.theta, self.phi]: normal_init(m.conv, std=std) else: normal_init(self.g.conv, std=std) if zeros_init: if self.conv_out.norm_cfg is None: constant_init(self.conv_out.conv, 0) else: constant_init(self.conv_out.norm, 0) else: if self.conv_out.norm_cfg is None: normal_init(self.conv_out.conv, std=std) else: normal_init(self.conv_out.norm, std=std) def gaussian(self, theta_x, phi_x): # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] pairwise_weight = torch.matmul(theta_x, phi_x) pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight def embedded_gaussian(self, theta_x, phi_x): # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] pairwise_weight = torch.matmul(theta_x, phi_x) if self.use_scale: # theta_x.shape[-1] is `self.inter_channels` pairwise_weight /= theta_x.shape[-1]**0.5 pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight def dot_product(self, theta_x, phi_x): # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] pairwise_weight = torch.matmul(theta_x, phi_x) pairwise_weight /= pairwise_weight.shape[-1] return pairwise_weight def concatenation(self, theta_x, phi_x): # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] h = theta_x.size(2) w = phi_x.size(3) theta_x = theta_x.repeat(1, 1, 1, w) phi_x = phi_x.repeat(1, 1, h, 1) concat_feature = torch.cat([theta_x, phi_x], dim=1) pairwise_weight = self.concat_project(concat_feature) n, _, h, w = pairwise_weight.size() pairwise_weight = pairwise_weight.view(n, h, w) pairwise_weight /= pairwise_weight.shape[-1] return pairwise_weight def forward(self, x): # Assume `reduction = 1`, then `inter_channels = C` # or `inter_channels = C` when `mode="gaussian"` # NonLocal1d x: [N, C, H] # NonLocal2d x: [N, C, H, W] # NonLocal3d x: [N, C, T, H, W] n = x.size(0) # NonLocal1d g_x: [N, H, C] # NonLocal2d g_x: [N, HxW, C] # NonLocal3d g_x: [N, TxHxW, C] g_x = self.g(x).view(n, self.inter_channels, -1) g_x = g_x.permute(0, 2, 1) # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H] # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW] # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW] if self.mode == 'gaussian': theta_x = x.view(n, self.in_channels, -1) theta_x = theta_x.permute(0, 2, 1) if self.sub_sample: phi_x = self.phi(x).view(n, self.in_channels, -1) else: phi_x = x.view(n, self.in_channels, -1) elif self.mode == 'concatenation': theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) else: theta_x = self.theta(x).view(n, self.inter_channels, -1) theta_x = theta_x.permute(0, 2, 1) phi_x = self.phi(x).view(n, self.inter_channels, -1) pairwise_func = getattr(self, self.mode) # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] pairwise_weight = pairwise_func(theta_x, phi_x) # NonLocal1d y: [N, H, C] # NonLocal2d y: [N, HxW, C] # NonLocal3d y: [N, TxHxW, C] y = torch.matmul(pairwise_weight, g_x) # NonLocal1d y: [N, C, H] # NonLocal2d y: [N, C, H, W] # NonLocal3d y: [N, C, T, H, W] y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels, *x.size()[2:]) output = x + self.conv_out(y) return output class NonLocal1d(_NonLocalNd): """1D Non-local module. Args: in_channels (int): Same as `NonLocalND`. sub_sample (bool): Whether to apply max pooling after pairwise function (Note that the `sub_sample` is applied on spatial only). Default: False. conv_cfg (None | dict): Same as `NonLocalND`. Default: dict(type='Conv1d'). """ def __init__(self, in_channels, sub_sample=False, conv_cfg=dict(type='Conv1d'), **kwargs): super(NonLocal1d, self).__init__( in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample if sub_sample: max_pool_layer = nn.MaxPool1d(kernel_size=2) self.g = nn.Sequential(self.g, max_pool_layer) if self.mode != 'gaussian': self.phi = nn.Sequential(self.phi, max_pool_layer) else: self.phi = max_pool_layer @PLUGIN_LAYERS.register_module() class NonLocal2d(_NonLocalNd): """2D Non-local module. Args: in_channels (int): Same as `NonLocalND`. sub_sample (bool): Whether to apply max pooling after pairwise function (Note that the `sub_sample` is applied on spatial only). Default: False. conv_cfg (None | dict): Same as `NonLocalND`. Default: dict(type='Conv2d'). """ _abbr_ = 'nonlocal_block' def __init__(self, in_channels, sub_sample=False, conv_cfg=dict(type='Conv2d'), **kwargs): super(NonLocal2d, self).__init__( in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample if sub_sample: max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) self.g = nn.Sequential(self.g, max_pool_layer) if self.mode != 'gaussian': self.phi = nn.Sequential(self.phi, max_pool_layer) else: self.phi = max_pool_layer class NonLocal3d(_NonLocalNd): """3D Non-local module. Args: in_channels (int): Same as `NonLocalND`. sub_sample (bool): Whether to apply max pooling after pairwise function (Note that the `sub_sample` is applied on spatial only). Default: False. conv_cfg (None | dict): Same as `NonLocalND`. Default: dict(type='Conv3d'). """ def __init__(self, in_channels, sub_sample=False, conv_cfg=dict(type='Conv3d'), **kwargs): super(NonLocal3d, self).__init__( in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample if sub_sample: max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) self.g = nn.Sequential(self.g, max_pool_layer) if self.mode != 'gaussian': self.phi = nn.Sequential(self.phi, max_pool_layer) else: self.phi = max_pool_layer ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/norm.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import inspect import torch.nn as nn from annotator.mmpkg.mmcv.utils import is_tuple_of from annotator.mmpkg.mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm from .registry import NORM_LAYERS NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d) NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d) NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d) NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d) NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm) NORM_LAYERS.register_module('GN', module=nn.GroupNorm) NORM_LAYERS.register_module('LN', module=nn.LayerNorm) NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d) NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d) NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d) NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d) def infer_abbr(class_type): """Infer abbreviation from the class name. When we build a norm layer with `build_norm_layer()`, we want to preserve the norm type in variable names, e.g, self.bn1, self.gn. This method will infer the abbreviation to map class types to abbreviations. Rule 1: If the class has the property "_abbr_", return the property. Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and "in" respectively. Rule 3: If the class name contains "batch", "group", "layer" or "instance", the abbreviation of this layer will be "bn", "gn", "ln" and "in" respectively. Rule 4: Otherwise, the abbreviation falls back to "norm". Args: class_type (type): The norm layer type. Returns: str: The inferred abbreviation. """ if not inspect.isclass(class_type): raise TypeError( f'class_type must be a type, but got {type(class_type)}') if hasattr(class_type, '_abbr_'): return class_type._abbr_ if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN return 'in' elif issubclass(class_type, _BatchNorm): return 'bn' elif issubclass(class_type, nn.GroupNorm): return 'gn' elif issubclass(class_type, nn.LayerNorm): return 'ln' else: class_name = class_type.__name__.lower() if 'batch' in class_name: return 'bn' elif 'group' in class_name: return 'gn' elif 'layer' in class_name: return 'ln' elif 'instance' in class_name: return 'in' else: return 'norm_layer' def build_norm_layer(cfg, num_features, postfix=''): """Build normalization layer. Args: cfg (dict): The norm layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate a norm layer. - requires_grad (bool, optional): Whether stop gradient updates. num_features (int): Number of input channels. postfix (int | str): The postfix to be appended into norm abbreviation to create named layer. Returns: (str, nn.Module): The first element is the layer name consisting of abbreviation and postfix, e.g., bn1, gn. The second element is the created norm layer. """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') if 'type' not in cfg: raise KeyError('the cfg dict must contain the key "type"') cfg_ = cfg.copy() layer_type = cfg_.pop('type') if layer_type not in NORM_LAYERS: raise KeyError(f'Unrecognized norm type {layer_type}') norm_layer = NORM_LAYERS.get(layer_type) abbr = infer_abbr(norm_layer) assert isinstance(postfix, (int, str)) name = abbr + str(postfix) requires_grad = cfg_.pop('requires_grad', True) cfg_.setdefault('eps', 1e-5) if layer_type != 'GN': layer = norm_layer(num_features, **cfg_) if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'): layer._specify_ddp_gpu_num(1) else: assert 'num_groups' in cfg_ layer = norm_layer(num_channels=num_features, **cfg_) for param in layer.parameters(): param.requires_grad = requires_grad return name, layer def is_norm(layer, exclude=None): """Check if a layer is a normalization layer. Args: layer (nn.Module): The layer to be checked. exclude (type | tuple[type]): Types to be excluded. Returns: bool: Whether the layer is a norm layer. """ if exclude is not None: if not isinstance(exclude, tuple): exclude = (exclude, ) if not is_tuple_of(exclude, type): raise TypeError( f'"exclude" must be either None or type or a tuple of types, ' f'but got {type(exclude)}: {exclude}') if exclude and isinstance(layer, exclude): return False all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm) return isinstance(layer, all_norm_bases) ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/padding.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn from .registry import PADDING_LAYERS PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d) PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) def build_padding_layer(cfg, *args, **kwargs): """Build padding layer. Args: cfg (None or dict): The padding layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate a padding layer. Returns: nn.Module: Created padding layer. """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') if 'type' not in cfg: raise KeyError('the cfg dict must contain the key "type"') cfg_ = cfg.copy() padding_type = cfg_.pop('type') if padding_type not in PADDING_LAYERS: raise KeyError(f'Unrecognized padding type {padding_type}.') else: padding_layer = PADDING_LAYERS.get(padding_type) layer = padding_layer(*args, **kwargs, **cfg_) return layer ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/plugin.py ================================================ import inspect import platform from .registry import PLUGIN_LAYERS if platform.system() == 'Windows': import regex as re else: import re def infer_abbr(class_type): """Infer abbreviation from the class name. This method will infer the abbreviation to map class types to abbreviations. Rule 1: If the class has the property "abbr", return the property. Rule 2: Otherwise, the abbreviation falls back to snake case of class name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``. Args: class_type (type): The norm layer type. Returns: str: The inferred abbreviation. """ def camel2snack(word): """Convert camel case word into snack case. Modified from `inflection lib `_. Example:: >>> camel2snack("FancyBlock") 'fancy_block' """ word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word) word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word) word = word.replace('-', '_') return word.lower() if not inspect.isclass(class_type): raise TypeError( f'class_type must be a type, but got {type(class_type)}') if hasattr(class_type, '_abbr_'): return class_type._abbr_ else: return camel2snack(class_type.__name__) def build_plugin_layer(cfg, postfix='', **kwargs): """Build plugin layer. Args: cfg (None or dict): cfg should contain: type (str): identify plugin layer type. layer args: args needed to instantiate a plugin layer. postfix (int, str): appended into norm abbreviation to create named layer. Default: ''. Returns: tuple[str, nn.Module]: name (str): abbreviation + postfix layer (nn.Module): created plugin layer """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') if 'type' not in cfg: raise KeyError('the cfg dict must contain the key "type"') cfg_ = cfg.copy() layer_type = cfg_.pop('type') if layer_type not in PLUGIN_LAYERS: raise KeyError(f'Unrecognized plugin type {layer_type}') plugin_layer = PLUGIN_LAYERS.get(layer_type) abbr = infer_abbr(plugin_layer) assert isinstance(postfix, (int, str)) name = abbr + str(postfix) layer = plugin_layer(**kwargs, **cfg_) return name, layer ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/registry.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from annotator.mmpkg.mmcv.utils import Registry CONV_LAYERS = Registry('conv layer') NORM_LAYERS = Registry('norm layer') ACTIVATION_LAYERS = Registry('activation layer') PADDING_LAYERS = Registry('padding layer') UPSAMPLE_LAYERS = Registry('upsample layer') PLUGIN_LAYERS = Registry('plugin layer') DROPOUT_LAYERS = Registry('drop out layers') POSITIONAL_ENCODING = Registry('position encoding') ATTENTION = Registry('attention') FEEDFORWARD_NETWORK = Registry('feed-forward Network') TRANSFORMER_LAYER = Registry('transformerLayer') TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/scale.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn class Scale(nn.Module): """A learnable scale parameter. This layer scales the input by a learnable factor. It multiplies a learnable scale parameter of shape (1,) with input of any shape. Args: scale (float): Initial value of scale factor. Default: 1.0 """ def __init__(self, scale=1.0): super(Scale, self).__init__() self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) def forward(self, x): return x * self.scale ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/swish.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn from .registry import ACTIVATION_LAYERS @ACTIVATION_LAYERS.register_module() class Swish(nn.Module): """Swish Module. This module applies the swish function: .. math:: Swish(x) = x * Sigmoid(x) Returns: Tensor: The output tensor. """ def __init__(self): super(Swish, self).__init__() def forward(self, x): return x * torch.sigmoid(x) ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import warnings import torch import torch.nn as nn from annotator.mmpkg.mmcv import ConfigDict, deprecated_api_warning from annotator.mmpkg.mmcv.cnn import Linear, build_activation_layer, build_norm_layer from annotator.mmpkg.mmcv.runner.base_module import BaseModule, ModuleList, Sequential from annotator.mmpkg.mmcv.utils import build_from_cfg from .drop import build_dropout from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) # Avoid BC-breaking of importing MultiScaleDeformableAttention from this file try: from annotator.mmpkg.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 warnings.warn( ImportWarning( '``MultiScaleDeformableAttention`` has been moved to ' '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 '``from annotator.mmpkg.mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 'to ``from annotator.mmpkg.mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 )) except ImportError: warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' '``mmcv.ops.multi_scale_deform_attn``, ' 'You should install ``mmcv-full`` if you need this module. ') def build_positional_encoding(cfg, default_args=None): """Builder for Position Encoding.""" return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args) def build_attention(cfg, default_args=None): """Builder for attention.""" return build_from_cfg(cfg, ATTENTION, default_args) def build_feedforward_network(cfg, default_args=None): """Builder for feed-forward network (FFN).""" return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args) def build_transformer_layer(cfg, default_args=None): """Builder for transformer layer.""" return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args) def build_transformer_layer_sequence(cfg, default_args=None): """Builder for transformer encoder and transformer decoder.""" return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args) @ATTENTION.register_module() class MultiheadAttention(BaseModule): """A wrapper for ``torch.nn.MultiheadAttention``. This module implements MultiheadAttention with identity connection, and positional encoding is also passed as input. Args: embed_dims (int): The embedding dimension. num_heads (int): Parallel attention heads. attn_drop (float): A Dropout layer on attn_output_weights. Default: 0.0. proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. Default: 0.0. dropout_layer (obj:`ConfigDict`): The dropout_layer used when adding the shortcut. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): When it is True, Key, Query and Value are shape of (batch, n, embed_dim), otherwise (n, batch, embed_dim). Default to False. """ def __init__(self, embed_dims, num_heads, attn_drop=0., proj_drop=0., dropout_layer=dict(type='Dropout', drop_prob=0.), init_cfg=None, batch_first=False, **kwargs): super(MultiheadAttention, self).__init__(init_cfg) if 'dropout' in kwargs: warnings.warn('The arguments `dropout` in MultiheadAttention ' 'has been deprecated, now you can separately ' 'set `attn_drop`(float), proj_drop(float), ' 'and `dropout_layer`(dict) ') attn_drop = kwargs['dropout'] dropout_layer['drop_prob'] = kwargs.pop('dropout') self.embed_dims = embed_dims self.num_heads = num_heads self.batch_first = batch_first self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, **kwargs) self.proj_drop = nn.Dropout(proj_drop) self.dropout_layer = build_dropout( dropout_layer) if dropout_layer else nn.Identity() @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiheadAttention') def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_pos=None, attn_mask=None, key_padding_mask=None, **kwargs): """Forward function for `MultiheadAttention`. **kwargs allow passing a more general data flow when combining with other operations in `transformerlayer`. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . If None, the ``query`` will be used. Defaults to None. value (Tensor): The value tensor with same shape as `key`. Same in `nn.MultiheadAttention.forward`. Defaults to None. If None, the `key` will be used. identity (Tensor): This tensor, with the same shape as x, will be used for the identity link. If None, `x` will be used. Defaults to None. query_pos (Tensor): The positional encoding for query, with the same shape as `x`. If not None, it will be added to `x` before forward function. Defaults to None. key_pos (Tensor): The positional encoding for `key`, with the same shape as `key`. Defaults to None. If not None, it will be added to `key` before forward function. If None, and `query_pos` has the same shape as `key`, then `query_pos` will be used for `key_pos`. Defaults to None. attn_mask (Tensor): ByteTensor mask with shape [num_queries, num_keys]. Same in `nn.MultiheadAttention.forward`. Defaults to None. key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. Defaults to None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. """ if key is None: key = query if value is None: value = key if identity is None: identity = query if key_pos is None: if query_pos is not None: # use query_pos if key_pos is not available if query_pos.shape == key.shape: key_pos = query_pos else: warnings.warn(f'position encoding of key is' f'missing in {self.__class__.__name__}.') if query_pos is not None: query = query + query_pos if key_pos is not None: key = key + key_pos # Because the dataflow('key', 'query', 'value') of # ``torch.nn.MultiheadAttention`` is (num_query, batch, # embed_dims), We should adjust the shape of dataflow from # batch_first (batch, num_query, embed_dims) to num_query_first # (num_query ,batch, embed_dims), and recover ``attn_output`` # from num_query_first to batch_first. if self.batch_first: query = query.transpose(0, 1) key = key.transpose(0, 1) value = value.transpose(0, 1) out = self.attn( query=query, key=key, value=value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)[0] if self.batch_first: out = out.transpose(0, 1) return identity + self.dropout_layer(self.proj_drop(out)) @FEEDFORWARD_NETWORK.register_module() class FFN(BaseModule): """Implements feed-forward networks (FFNs) with identity connection. Args: embed_dims (int): The feature dimension. Same as `MultiheadAttention`. Defaults: 256. feedforward_channels (int): The hidden dimension of FFNs. Defaults: 1024. num_fcs (int, optional): The number of fully-connected layers in FFNs. Default: 2. act_cfg (dict, optional): The activation config for FFNs. Default: dict(type='ReLU') ffn_drop (float, optional): Probability of an element to be zeroed in FFN. Default 0.0. add_identity (bool, optional): Whether to add the identity connection. Default: `True`. dropout_layer (obj:`ConfigDict`): The dropout_layer used when adding the shortcut. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ @deprecated_api_warning( { 'dropout': 'ffn_drop', 'add_residual': 'add_identity' }, cls_name='FFN') def __init__(self, embed_dims=256, feedforward_channels=1024, num_fcs=2, act_cfg=dict(type='ReLU', inplace=True), ffn_drop=0., dropout_layer=None, add_identity=True, init_cfg=None, **kwargs): super(FFN, self).__init__(init_cfg) assert num_fcs >= 2, 'num_fcs should be no less ' \ f'than 2. got {num_fcs}.' self.embed_dims = embed_dims self.feedforward_channels = feedforward_channels self.num_fcs = num_fcs self.act_cfg = act_cfg self.activate = build_activation_layer(act_cfg) layers = [] in_channels = embed_dims for _ in range(num_fcs - 1): layers.append( Sequential( Linear(in_channels, feedforward_channels), self.activate, nn.Dropout(ffn_drop))) in_channels = feedforward_channels layers.append(Linear(feedforward_channels, embed_dims)) layers.append(nn.Dropout(ffn_drop)) self.layers = Sequential(*layers) self.dropout_layer = build_dropout( dropout_layer) if dropout_layer else torch.nn.Identity() self.add_identity = add_identity @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN') def forward(self, x, identity=None): """Forward function for `FFN`. The function would add x to the output tensor if residue is None. """ out = self.layers(x) if not self.add_identity: return self.dropout_layer(out) if identity is None: identity = x return identity + self.dropout_layer(out) @TRANSFORMER_LAYER.register_module() class BaseTransformerLayer(BaseModule): """Base `TransformerLayer` for vision transformer. It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. """ def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=False, **kwargs): deprecated_args = dict( feedforward_channels='feedforward_channels', ffn_dropout='ffn_drop', ffn_num_fcs='num_fcs') for ori_name, new_name in deprecated_args.items(): if ori_name in kwargs: warnings.warn( f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' f'to a dict named `ffn_cfgs`. ') ffn_cfgs[new_name] = kwargs[ori_name] super(BaseTransformerLayer, self).__init__(init_cfg) self.batch_first = batch_first assert set(operation_order) & set( ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all four operation type ' \ f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" num_attn = operation_order.count('self_attn') + operation_order.count( 'cross_attn') if isinstance(attn_cfgs, dict): attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] else: assert num_attn == len(attn_cfgs), f'The length ' \ f'of attn_cfg {num_attn} is ' \ f'not consistent with the number of attention' \ f'in operation_order {operation_order}.' self.num_attn = num_attn self.operation_order = operation_order self.norm_cfg = norm_cfg self.pre_norm = operation_order[0] == 'norm' self.attentions = ModuleList() index = 0 for operation_name in operation_order: if operation_name in ['self_attn', 'cross_attn']: if 'batch_first' in attn_cfgs[index]: assert self.batch_first == attn_cfgs[index]['batch_first'] else: attn_cfgs[index]['batch_first'] = self.batch_first attention = build_attention(attn_cfgs[index]) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. attention.operation_name = operation_name self.attentions.append(attention) index += 1 self.embed_dims = self.attentions[0].embed_dims self.ffns = ModuleList() num_ffns = operation_order.count('ffn') if isinstance(ffn_cfgs, dict): ffn_cfgs = ConfigDict(ffn_cfgs) if isinstance(ffn_cfgs, dict): ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: ffn_cfgs['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( build_feedforward_network(ffn_cfgs[ffn_index], dict(type='FFN'))) self.norms = ModuleList() num_norms = operation_order.count('norm') for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) def forward(self, query, key=None, value=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. """ norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': temp_key = temp_value = query query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query @TRANSFORMER_LAYER_SEQUENCE.register_module() class TransformerLayerSequence(BaseModule): """Base class for TransformerEncoder and TransformerDecoder in vision transformer. As base-class of Encoder and Decoder in vision transformer. Support customization such as specifying different kind of `transformer_layer` in `transformer_coder`. Args: transformerlayer (list[obj:`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict`): Config of transformerlayer in TransformerCoder. If it is obj:`mmcv.ConfigDict`, it would be repeated `num_layer` times to a list[`mmcv.ConfigDict`]. Default: None. num_layers (int): The number of `TransformerLayer`. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): super(TransformerLayerSequence, self).__init__(init_cfg) if isinstance(transformerlayers, dict): transformerlayers = [ copy.deepcopy(transformerlayers) for _ in range(num_layers) ] else: assert isinstance(transformerlayers, list) and \ len(transformerlayers) == num_layers self.num_layers = num_layers self.layers = ModuleList() for i in range(num_layers): self.layers.append(build_transformer_layer(transformerlayers[i])) self.embed_dims = self.layers[0].embed_dims self.pre_norm = self.layers[0].pre_norm def forward(self, query, key, value, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, **kwargs): """Forward function for `TransformerCoder`. Args: query (Tensor): Input query with shape `(num_queries, bs, embed_dims)`. key (Tensor): The key tensor with shape `(num_keys, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_keys, bs, embed_dims)`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor], optional): Each element is 2D Tensor which is used in calculation of corresponding attention in operation_order. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in self-attention Default: None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: results with shape [num_queries, bs, embed_dims]. """ for layer in self.layers: query = layer( query, key, value, query_pos=query_pos, key_pos=key_pos, attn_masks=attn_masks, query_key_padding_mask=query_key_padding_mask, key_padding_mask=key_padding_mask, **kwargs) return query ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/upsample.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn import torch.nn.functional as F from ..utils import xavier_init from .registry import UPSAMPLE_LAYERS UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample) UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample) @UPSAMPLE_LAYERS.register_module(name='pixel_shuffle') class PixelShufflePack(nn.Module): """Pixel Shuffle upsample layer. This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to achieve a simple upsampling with pixel shuffle. Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. scale_factor (int): Upsample ratio. upsample_kernel (int): Kernel size of the conv layer to expand the channels. """ def __init__(self, in_channels, out_channels, scale_factor, upsample_kernel): super(PixelShufflePack, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.scale_factor = scale_factor self.upsample_kernel = upsample_kernel self.upsample_conv = nn.Conv2d( self.in_channels, self.out_channels * scale_factor * scale_factor, self.upsample_kernel, padding=(self.upsample_kernel - 1) // 2) self.init_weights() def init_weights(self): xavier_init(self.upsample_conv, distribution='uniform') def forward(self, x): x = self.upsample_conv(x) x = F.pixel_shuffle(x, self.scale_factor) return x def build_upsample_layer(cfg, *args, **kwargs): """Build upsample layer. Args: cfg (dict): The upsample layer config, which should contain: - type (str): Layer type. - scale_factor (int): Upsample ratio, which is not applicable to deconv. - layer args: Args needed to instantiate a upsample layer. args (argument list): Arguments passed to the ``__init__`` method of the corresponding conv layer. kwargs (keyword arguments): Keyword arguments passed to the ``__init__`` method of the corresponding conv layer. Returns: nn.Module: Created upsample layer. """ if not isinstance(cfg, dict): raise TypeError(f'cfg must be a dict, but got {type(cfg)}') if 'type' not in cfg: raise KeyError( f'the cfg dict must contain the key "type", but got {cfg}') cfg_ = cfg.copy() layer_type = cfg_.pop('type') if layer_type not in UPSAMPLE_LAYERS: raise KeyError(f'Unrecognized upsample type {layer_type}') else: upsample = UPSAMPLE_LAYERS.get(layer_type) if upsample is nn.Upsample: cfg_['mode'] = layer_type layer = upsample(*args, **kwargs, **cfg_) return layer ================================================ FILE: annotator/mmpkg/mmcv/cnn/bricks/wrappers.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501 Wrap some nn modules to support empty tensor input. Currently, these wrappers are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask heads are trained on only positive RoIs. """ import math import torch import torch.nn as nn from torch.nn.modules.utils import _pair, _triple from .registry import CONV_LAYERS, UPSAMPLE_LAYERS if torch.__version__ == 'parrots': TORCH_VERSION = torch.__version__ else: # torch.__version__ could be 1.3.1+cu92, we only need the first two # for comparison TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) def obsolete_torch_version(torch_version, version_threshold): return torch_version == 'parrots' or torch_version <= version_threshold class NewEmptyTensorOp(torch.autograd.Function): @staticmethod def forward(ctx, x, new_shape): ctx.shape = x.shape return x.new_empty(new_shape) @staticmethod def backward(ctx, grad): shape = ctx.shape return NewEmptyTensorOp.apply(grad, shape), None @CONV_LAYERS.register_module('Conv', force=True) class Conv2d(nn.Conv2d): def forward(self, x): if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, self.padding, self.stride, self.dilation): o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 out_shape.append(o) empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) @CONV_LAYERS.register_module('Conv3d', force=True) class Conv3d(nn.Conv3d): def forward(self, x): if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, self.padding, self.stride, self.dilation): o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 out_shape.append(o) empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) @CONV_LAYERS.register_module() @CONV_LAYERS.register_module('deconv') @UPSAMPLE_LAYERS.register_module('deconv', force=True) class ConvTranspose2d(nn.ConvTranspose2d): def forward(self, x): if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, self.padding, self.stride, self.dilation, self.output_padding): out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) @CONV_LAYERS.register_module() @CONV_LAYERS.register_module('deconv3d') @UPSAMPLE_LAYERS.register_module('deconv3d', force=True) class ConvTranspose3d(nn.ConvTranspose3d): def forward(self, x): if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, self.padding, self.stride, self.dilation, self.output_padding): out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) class MaxPool2d(nn.MaxPool2d): def forward(self, x): # PyTorch 1.9 does not support empty tensor inference yet if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): out_shape = list(x.shape[:2]) for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size), _pair(self.padding), _pair(self.stride), _pair(self.dilation)): o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 o = math.ceil(o) if self.ceil_mode else math.floor(o) out_shape.append(o) empty = NewEmptyTensorOp.apply(x, out_shape) return empty return super().forward(x) class MaxPool3d(nn.MaxPool3d): def forward(self, x): # PyTorch 1.9 does not support empty tensor inference yet if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): out_shape = list(x.shape[:2]) for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size), _triple(self.padding), _triple(self.stride), _triple(self.dilation)): o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 o = math.ceil(o) if self.ceil_mode else math.floor(o) out_shape.append(o) empty = NewEmptyTensorOp.apply(x, out_shape) return empty return super().forward(x) class Linear(torch.nn.Linear): def forward(self, x): # empty tensor forward of Linear layer is supported in Pytorch 1.6 if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)): out_shape = [x.shape[0], self.out_features] empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) ================================================ FILE: annotator/mmpkg/mmcv/cnn/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ..runner import Sequential from ..utils import Registry, build_from_cfg def build_model_from_cfg(cfg, registry, default_args=None): """Build a PyTorch model from config dict(s). Different from ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built. Args: cfg (dict, list[dict]): The config of modules, is is either a config dict or a list of config dicts. If cfg is a list, a the built modules will be wrapped with ``nn.Sequential``. registry (:obj:`Registry`): A registry the module belongs to. default_args (dict, optional): Default arguments to build the module. Defaults to None. Returns: nn.Module: A built nn module. """ if isinstance(cfg, list): modules = [ build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg ] return Sequential(*modules) else: return build_from_cfg(cfg, registry, default_args) MODELS = Registry('model', build_func=build_model_from_cfg) ================================================ FILE: annotator/mmpkg/mmcv/cnn/resnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import logging import torch.nn as nn import torch.utils.checkpoint as cp from .utils import constant_init, kaiming_init def conv3x3(in_planes, out_planes, stride=1, dilation=1): """3x3 convolution with padding.""" return nn.Conv2d( in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, style='pytorch', with_cp=False): super(BasicBlock, self).__init__() assert style in ['pytorch', 'caffe'] self.conv1 = conv3x3(inplanes, planes, stride, dilation) self.bn1 = nn.BatchNorm2d(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.BatchNorm2d(planes) self.downsample = downsample self.stride = stride self.dilation = dilation assert not with_cp def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, style='pytorch', with_cp=False): """Bottleneck block. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. """ super(Bottleneck, self).__init__() assert style in ['pytorch', 'caffe'] if style == 'pytorch': conv1_stride = 1 conv2_stride = stride else: conv1_stride = stride conv2_stride = 1 self.conv1 = nn.Conv2d( inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False) self.conv2 = nn.Conv2d( planes, planes, kernel_size=3, stride=conv2_stride, padding=dilation, dilation=dilation, bias=False) self.bn1 = nn.BatchNorm2d(planes) self.bn2 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d( planes, planes * self.expansion, kernel_size=1, bias=False) self.bn3 = nn.BatchNorm2d(planes * self.expansion) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride self.dilation = dilation self.with_cp = with_cp def forward(self, x): def _inner_forward(x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: residual = self.downsample(x) out += residual return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) out = self.relu(out) return out def make_res_layer(block, inplanes, planes, blocks, stride=1, dilation=1, style='pytorch', with_cp=False): downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( nn.Conv2d( inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes * block.expansion), ) layers = [] layers.append( block( inplanes, planes, stride, dilation, downsample, style=style, with_cp=with_cp)) inplanes = planes * block.expansion for _ in range(1, blocks): layers.append( block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp)) return nn.Sequential(*layers) class ResNet(nn.Module): """ResNet backbone. Args: depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. num_stages (int): Resnet stages, normally 4. strides (Sequence[int]): Strides of the first block of each stage. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two layer is the 3x3 conv layer, otherwise the stride-two layer is the first 1x1 conv layer. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze running stats (mean and var). bn_frozen (bool): Whether to freeze weight and bias of BN layers. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. """ arch_settings = { 18: (BasicBlock, (2, 2, 2, 2)), 34: (BasicBlock, (3, 4, 6, 3)), 50: (Bottleneck, (3, 4, 6, 3)), 101: (Bottleneck, (3, 4, 23, 3)), 152: (Bottleneck, (3, 8, 36, 3)) } def __init__(self, depth, num_stages=4, strides=(1, 2, 2, 2), dilations=(1, 1, 1, 1), out_indices=(0, 1, 2, 3), style='pytorch', frozen_stages=-1, bn_eval=True, bn_frozen=False, with_cp=False): super(ResNet, self).__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') assert num_stages >= 1 and num_stages <= 4 block, stage_blocks = self.arch_settings[depth] stage_blocks = stage_blocks[:num_stages] assert len(strides) == len(dilations) == num_stages assert max(out_indices) < num_stages self.out_indices = out_indices self.style = style self.frozen_stages = frozen_stages self.bn_eval = bn_eval self.bn_frozen = bn_frozen self.with_cp = with_cp self.inplanes = 64 self.conv1 = nn.Conv2d( 3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.res_layers = [] for i, num_blocks in enumerate(stage_blocks): stride = strides[i] dilation = dilations[i] planes = 64 * 2**i res_layer = make_res_layer( block, self.inplanes, planes, num_blocks, stride=stride, dilation=dilation, style=self.style, with_cp=with_cp) self.inplanes = planes * block.expansion layer_name = f'layer{i + 1}' self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1) def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, nn.BatchNorm2d): constant_init(m, 1) else: raise TypeError('pretrained must be a str or None') def forward(self, x): x = self.conv1(x) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) outs = [] for i, layer_name in enumerate(self.res_layers): res_layer = getattr(self, layer_name) x = res_layer(x) if i in self.out_indices: outs.append(x) if len(outs) == 1: return outs[0] else: return tuple(outs) def train(self, mode=True): super(ResNet, self).train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): m.eval() if self.bn_frozen: for params in m.parameters(): params.requires_grad = False if mode and self.frozen_stages >= 0: for param in self.conv1.parameters(): param.requires_grad = False for param in self.bn1.parameters(): param.requires_grad = False self.bn1.eval() self.bn1.weight.requires_grad = False self.bn1.bias.requires_grad = False for i in range(1, self.frozen_stages + 1): mod = getattr(self, f'layer{i}') mod.eval() for param in mod.parameters(): param.requires_grad = False ================================================ FILE: annotator/mmpkg/mmcv/cnn/utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .flops_counter import get_model_complexity_info from .fuse_conv_bn import fuse_conv_bn from .sync_bn import revert_sync_batchnorm from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit, NormalInit, PretrainedInit, TruncNormalInit, UniformInit, XavierInit, bias_init_with_prob, caffe2_xavier_init, constant_init, initialize, kaiming_init, normal_init, trunc_normal_init, uniform_init, xavier_init) __all__ = [ 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', 'Caffe2XavierInit', 'revert_sync_batchnorm' ] ================================================ FILE: annotator/mmpkg/mmcv/cnn/utils/flops_counter.py ================================================ # Modified from flops-counter.pytorch by Vladislav Sovrasov # original repo: https://github.com/sovrasov/flops-counter.pytorch # MIT License # Copyright (c) 2018 Vladislav Sovrasov # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import sys from functools import partial import numpy as np import torch import torch.nn as nn import annotator.mmpkg.mmcv as mmcv def get_model_complexity_info(model, input_shape, print_per_layer_stat=True, as_strings=True, input_constructor=None, flush=False, ost=sys.stdout): """Get complexity information of a model. This method can calculate FLOPs and parameter counts of a model with corresponding input shape. It can also print complexity information for each layer in a model. Supported layers are listed as below: - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, ``nn.ReLU6``. - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. - Linear: ``nn.Linear``. - Deconvolution: ``nn.ConvTranspose2d``. - Upsample: ``nn.Upsample``. Args: model (nn.Module): The model for complexity calculation. input_shape (tuple): Input shape used for calculation. print_per_layer_stat (bool): Whether to print complexity information for each layer in a model. Default: True. as_strings (bool): Output FLOPs and params counts in a string form. Default: True. input_constructor (None | callable): If specified, it takes a callable method that generates input. otherwise, it will generate a random tensor with input shape to calculate FLOPs. Default: None. flush (bool): same as that in :func:`print`. Default: False. ost (stream): same as ``file`` param in :func:`print`. Default: sys.stdout. Returns: tuple[float | str]: If ``as_strings`` is set to True, it will return FLOPs and parameter counts in a string format. otherwise, it will return those in a float number format. """ assert type(input_shape) is tuple assert len(input_shape) >= 1 assert isinstance(model, nn.Module) flops_model = add_flops_counting_methods(model) flops_model.eval() flops_model.start_flops_count() if input_constructor: input = input_constructor(input_shape) _ = flops_model(**input) else: try: batch = torch.ones(()).new_empty( (1, *input_shape), dtype=next(flops_model.parameters()).dtype, device=next(flops_model.parameters()).device) except StopIteration: # Avoid StopIteration for models which have no parameters, # like `nn.Relu()`, `nn.AvgPool2d`, etc. batch = torch.ones(()).new_empty((1, *input_shape)) _ = flops_model(batch) flops_count, params_count = flops_model.compute_average_flops_cost() if print_per_layer_stat: print_model_with_flops( flops_model, flops_count, params_count, ost=ost, flush=flush) flops_model.stop_flops_count() if as_strings: return flops_to_string(flops_count), params_to_string(params_count) return flops_count, params_count def flops_to_string(flops, units='GFLOPs', precision=2): """Convert FLOPs number into a string. Note that Here we take a multiply-add counts as one FLOP. Args: flops (float): FLOPs number to be converted. units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically choose the most suitable unit for FLOPs. Default: 'GFLOPs'. precision (int): Digit number after the decimal point. Default: 2. Returns: str: The converted FLOPs number with units. Examples: >>> flops_to_string(1e9) '1.0 GFLOPs' >>> flops_to_string(2e5, 'MFLOPs') '0.2 MFLOPs' >>> flops_to_string(3e-9, None) '3e-09 FLOPs' """ if units is None: if flops // 10**9 > 0: return str(round(flops / 10.**9, precision)) + ' GFLOPs' elif flops // 10**6 > 0: return str(round(flops / 10.**6, precision)) + ' MFLOPs' elif flops // 10**3 > 0: return str(round(flops / 10.**3, precision)) + ' KFLOPs' else: return str(flops) + ' FLOPs' else: if units == 'GFLOPs': return str(round(flops / 10.**9, precision)) + ' ' + units elif units == 'MFLOPs': return str(round(flops / 10.**6, precision)) + ' ' + units elif units == 'KFLOPs': return str(round(flops / 10.**3, precision)) + ' ' + units else: return str(flops) + ' FLOPs' def params_to_string(num_params, units=None, precision=2): """Convert parameter number into a string. Args: num_params (float): Parameter number to be converted. units (str | None): Converted FLOPs units. Options are None, 'M', 'K' and ''. If set to None, it will automatically choose the most suitable unit for Parameter number. Default: None. precision (int): Digit number after the decimal point. Default: 2. Returns: str: The converted parameter number with units. Examples: >>> params_to_string(1e9) '1000.0 M' >>> params_to_string(2e5) '200.0 k' >>> params_to_string(3e-9) '3e-09' """ if units is None: if num_params // 10**6 > 0: return str(round(num_params / 10**6, precision)) + ' M' elif num_params // 10**3: return str(round(num_params / 10**3, precision)) + ' k' else: return str(num_params) else: if units == 'M': return str(round(num_params / 10.**6, precision)) + ' ' + units elif units == 'K': return str(round(num_params / 10.**3, precision)) + ' ' + units else: return str(num_params) def print_model_with_flops(model, total_flops, total_params, units='GFLOPs', precision=3, ost=sys.stdout, flush=False): """Print a model with FLOPs for each layer. Args: model (nn.Module): The model to be printed. total_flops (float): Total FLOPs of the model. total_params (float): Total parameter counts of the model. units (str | None): Converted FLOPs units. Default: 'GFLOPs'. precision (int): Digit number after the decimal point. Default: 3. ost (stream): same as `file` param in :func:`print`. Default: sys.stdout. flush (bool): same as that in :func:`print`. Default: False. Example: >>> class ExampleModel(nn.Module): >>> def __init__(self): >>> super().__init__() >>> self.conv1 = nn.Conv2d(3, 8, 3) >>> self.conv2 = nn.Conv2d(8, 256, 3) >>> self.conv3 = nn.Conv2d(256, 8, 3) >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) >>> self.flatten = nn.Flatten() >>> self.fc = nn.Linear(8, 1) >>> def forward(self, x): >>> x = self.conv1(x) >>> x = self.conv2(x) >>> x = self.conv3(x) >>> x = self.avg_pool(x) >>> x = self.flatten(x) >>> x = self.fc(x) >>> return x >>> model = ExampleModel() >>> x = (3, 16, 16) to print the complexity information state for each layer, you can use >>> get_model_complexity_info(model, x) or directly use >>> print_model_with_flops(model, 4579784.0, 37361) ExampleModel( 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) ) """ def accumulate_params(self): if is_supported_instance(self): return self.__params__ else: sum = 0 for m in self.children(): sum += m.accumulate_params() return sum def accumulate_flops(self): if is_supported_instance(self): return self.__flops__ / model.__batch_counter__ else: sum = 0 for m in self.children(): sum += m.accumulate_flops() return sum def flops_repr(self): accumulated_num_params = self.accumulate_params() accumulated_flops_cost = self.accumulate_flops() return ', '.join([ params_to_string( accumulated_num_params, units='M', precision=precision), '{:.3%} Params'.format(accumulated_num_params / total_params), flops_to_string( accumulated_flops_cost, units=units, precision=precision), '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), self.original_extra_repr() ]) def add_extra_repr(m): m.accumulate_flops = accumulate_flops.__get__(m) m.accumulate_params = accumulate_params.__get__(m) flops_extra_repr = flops_repr.__get__(m) if m.extra_repr != flops_extra_repr: m.original_extra_repr = m.extra_repr m.extra_repr = flops_extra_repr assert m.extra_repr != m.original_extra_repr def del_extra_repr(m): if hasattr(m, 'original_extra_repr'): m.extra_repr = m.original_extra_repr del m.original_extra_repr if hasattr(m, 'accumulate_flops'): del m.accumulate_flops model.apply(add_extra_repr) print(model, file=ost, flush=flush) model.apply(del_extra_repr) def get_model_parameters_number(model): """Calculate parameter number of a model. Args: model (nn.module): The model for parameter number calculation. Returns: float: Parameter number of the model. """ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) return num_params def add_flops_counting_methods(net_main_module): # adding additional methods to the existing module object, # this is done this way so that each function has access to self object net_main_module.start_flops_count = start_flops_count.__get__( net_main_module) net_main_module.stop_flops_count = stop_flops_count.__get__( net_main_module) net_main_module.reset_flops_count = reset_flops_count.__get__( net_main_module) net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 net_main_module) net_main_module.reset_flops_count() return net_main_module def compute_average_flops_cost(self): """Compute average FLOPs cost. A method to compute average FLOPs cost, which will be available after `add_flops_counting_methods()` is called on a desired net object. Returns: float: Current mean flops consumption per image. """ batches_count = self.__batch_counter__ flops_sum = 0 for module in self.modules(): if is_supported_instance(module): flops_sum += module.__flops__ params_sum = get_model_parameters_number(self) return flops_sum / batches_count, params_sum def start_flops_count(self): """Activate the computation of mean flops consumption per image. A method to activate the computation of mean flops consumption per image. which will be available after ``add_flops_counting_methods()`` is called on a desired net object. It should be called before running the network. """ add_batch_counter_hook_function(self) def add_flops_counter_hook_function(module): if is_supported_instance(module): if hasattr(module, '__flops_handle__'): return else: handle = module.register_forward_hook( get_modules_mapping()[type(module)]) module.__flops_handle__ = handle self.apply(partial(add_flops_counter_hook_function)) def stop_flops_count(self): """Stop computing the mean flops consumption per image. A method to stop computing the mean flops consumption per image, which will be available after ``add_flops_counting_methods()`` is called on a desired net object. It can be called to pause the computation whenever. """ remove_batch_counter_hook_function(self) self.apply(remove_flops_counter_hook_function) def reset_flops_count(self): """Reset statistics computed so far. A method to Reset computed statistics, which will be available after `add_flops_counting_methods()` is called on a desired net object. """ add_batch_counter_variables_or_reset(self) self.apply(add_flops_counter_variable_or_reset) # ---- Internal functions def empty_flops_counter_hook(module, input, output): module.__flops__ += 0 def upsample_flops_counter_hook(module, input, output): output_size = output[0] batch_size = output_size.shape[0] output_elements_count = batch_size for val in output_size.shape[1:]: output_elements_count *= val module.__flops__ += int(output_elements_count) def relu_flops_counter_hook(module, input, output): active_elements_count = output.numel() module.__flops__ += int(active_elements_count) def linear_flops_counter_hook(module, input, output): input = input[0] output_last_dim = output.shape[ -1] # pytorch checks dimensions, so here we don't care much module.__flops__ += int(np.prod(input.shape) * output_last_dim) def pool_flops_counter_hook(module, input, output): input = input[0] module.__flops__ += int(np.prod(input.shape)) def norm_flops_counter_hook(module, input, output): input = input[0] batch_flops = np.prod(input.shape) if (getattr(module, 'affine', False) or getattr(module, 'elementwise_affine', False)): batch_flops *= 2 module.__flops__ += int(batch_flops) def deconv_flops_counter_hook(conv_module, input, output): # Can have multiple inputs, getting the first one input = input[0] batch_size = input.shape[0] input_height, input_width = input.shape[2:] kernel_height, kernel_width = conv_module.kernel_size in_channels = conv_module.in_channels out_channels = conv_module.out_channels groups = conv_module.groups filters_per_channel = out_channels // groups conv_per_position_flops = ( kernel_height * kernel_width * in_channels * filters_per_channel) active_elements_count = batch_size * input_height * input_width overall_conv_flops = conv_per_position_flops * active_elements_count bias_flops = 0 if conv_module.bias is not None: output_height, output_width = output.shape[2:] bias_flops = out_channels * batch_size * output_height * output_height overall_flops = overall_conv_flops + bias_flops conv_module.__flops__ += int(overall_flops) def conv_flops_counter_hook(conv_module, input, output): # Can have multiple inputs, getting the first one input = input[0] batch_size = input.shape[0] output_dims = list(output.shape[2:]) kernel_dims = list(conv_module.kernel_size) in_channels = conv_module.in_channels out_channels = conv_module.out_channels groups = conv_module.groups filters_per_channel = out_channels // groups conv_per_position_flops = int( np.prod(kernel_dims)) * in_channels * filters_per_channel active_elements_count = batch_size * int(np.prod(output_dims)) overall_conv_flops = conv_per_position_flops * active_elements_count bias_flops = 0 if conv_module.bias is not None: bias_flops = out_channels * active_elements_count overall_flops = overall_conv_flops + bias_flops conv_module.__flops__ += int(overall_flops) def batch_counter_hook(module, input, output): batch_size = 1 if len(input) > 0: # Can have multiple inputs, getting the first one input = input[0] batch_size = len(input) else: pass print('Warning! No positional inputs found for a module, ' 'assuming batch size is 1.') module.__batch_counter__ += batch_size def add_batch_counter_variables_or_reset(module): module.__batch_counter__ = 0 def add_batch_counter_hook_function(module): if hasattr(module, '__batch_counter_handle__'): return handle = module.register_forward_hook(batch_counter_hook) module.__batch_counter_handle__ = handle def remove_batch_counter_hook_function(module): if hasattr(module, '__batch_counter_handle__'): module.__batch_counter_handle__.remove() del module.__batch_counter_handle__ def add_flops_counter_variable_or_reset(module): if is_supported_instance(module): if hasattr(module, '__flops__') or hasattr(module, '__params__'): print('Warning: variables __flops__ or __params__ are already ' 'defined for the module' + type(module).__name__ + ' ptflops can affect your code!') module.__flops__ = 0 module.__params__ = get_model_parameters_number(module) def is_supported_instance(module): if type(module) in get_modules_mapping(): return True return False def remove_flops_counter_hook_function(module): if is_supported_instance(module): if hasattr(module, '__flops_handle__'): module.__flops_handle__.remove() del module.__flops_handle__ def get_modules_mapping(): return { # convolutions nn.Conv1d: conv_flops_counter_hook, nn.Conv2d: conv_flops_counter_hook, mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook, nn.Conv3d: conv_flops_counter_hook, mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook, # activations nn.ReLU: relu_flops_counter_hook, nn.PReLU: relu_flops_counter_hook, nn.ELU: relu_flops_counter_hook, nn.LeakyReLU: relu_flops_counter_hook, nn.ReLU6: relu_flops_counter_hook, # poolings nn.MaxPool1d: pool_flops_counter_hook, nn.AvgPool1d: pool_flops_counter_hook, nn.AvgPool2d: pool_flops_counter_hook, nn.MaxPool2d: pool_flops_counter_hook, mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook, nn.MaxPool3d: pool_flops_counter_hook, mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook, nn.AvgPool3d: pool_flops_counter_hook, nn.AdaptiveMaxPool1d: pool_flops_counter_hook, nn.AdaptiveAvgPool1d: pool_flops_counter_hook, nn.AdaptiveMaxPool2d: pool_flops_counter_hook, nn.AdaptiveAvgPool2d: pool_flops_counter_hook, nn.AdaptiveMaxPool3d: pool_flops_counter_hook, nn.AdaptiveAvgPool3d: pool_flops_counter_hook, # normalizations nn.BatchNorm1d: norm_flops_counter_hook, nn.BatchNorm2d: norm_flops_counter_hook, nn.BatchNorm3d: norm_flops_counter_hook, nn.GroupNorm: norm_flops_counter_hook, nn.InstanceNorm1d: norm_flops_counter_hook, nn.InstanceNorm2d: norm_flops_counter_hook, nn.InstanceNorm3d: norm_flops_counter_hook, nn.LayerNorm: norm_flops_counter_hook, # FC nn.Linear: linear_flops_counter_hook, mmcv.cnn.bricks.Linear: linear_flops_counter_hook, # Upscale nn.Upsample: upsample_flops_counter_hook, # Deconvolution nn.ConvTranspose2d: deconv_flops_counter_hook, mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook, } ================================================ FILE: annotator/mmpkg/mmcv/cnn/utils/fuse_conv_bn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn def _fuse_conv_bn(conv, bn): """Fuse conv and bn into one module. Args: conv (nn.Module): Conv to be fused. bn (nn.Module): BN to be fused. Returns: nn.Module: Fused module. """ conv_w = conv.weight conv_b = conv.bias if conv.bias is not None else torch.zeros_like( bn.running_mean) factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) return conv def fuse_conv_bn(module): """Recursively fuse conv and bn in a module. During inference, the functionary of batch norm layers is turned off but only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures. Args: module (nn.Module): Module to be fused. Returns: nn.Module: Fused module. """ last_conv = None last_conv_name = None for name, child in module.named_children(): if isinstance(child, (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): if last_conv is None: # only fuse BN that is after Conv continue fused_conv = _fuse_conv_bn(last_conv, child) module._modules[last_conv_name] = fused_conv # To reduce changes, set BN as Identity instead of deleting it. module._modules[name] = nn.Identity() last_conv = None elif isinstance(child, nn.Conv2d): last_conv = child last_conv_name = name else: fuse_conv_bn(child) return module ================================================ FILE: annotator/mmpkg/mmcv/cnn/utils/sync_bn.py ================================================ import torch import annotator.mmpkg.mmcv as mmcv class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): """A general BatchNorm layer without input dimension check. Reproduced from @kapily's work: (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc is `_check_input_dim` that is designed for tensor sanity checks. The check has been bypassed in this class for the convenience of converting SyncBatchNorm. """ def _check_input_dim(self, input): return def revert_sync_batchnorm(module): """Helper function to convert all `SyncBatchNorm` (SyncBN) and `mmcv.ops.sync_bn.SyncBatchNorm`(MMSyncBN) layers in the model to `BatchNormXd` layers. Adapted from @kapily's work: (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) Args: module (nn.Module): The module containing `SyncBatchNorm` layers. Returns: module_output: The converted module with `BatchNormXd` layers. """ module_output = module module_checklist = [torch.nn.modules.batchnorm.SyncBatchNorm] if hasattr(mmcv, 'ops'): module_checklist.append(mmcv.ops.SyncBatchNorm) if isinstance(module, tuple(module_checklist)): module_output = _BatchNormXd(module.num_features, module.eps, module.momentum, module.affine, module.track_running_stats) if module.affine: # no_grad() may not be needed here but # just to be consistent with `convert_sync_batchnorm()` with torch.no_grad(): module_output.weight = module.weight module_output.bias = module.bias module_output.running_mean = module.running_mean module_output.running_var = module.running_var module_output.num_batches_tracked = module.num_batches_tracked module_output.training = module.training # qconfig exists in quantized models if hasattr(module, 'qconfig'): module_output.qconfig = module.qconfig for name, child in module.named_children(): module_output.add_module(name, revert_sync_batchnorm(child)) del module return module_output ================================================ FILE: annotator/mmpkg/mmcv/cnn/utils/weight_init.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import math import warnings import numpy as np import torch import torch.nn as nn from torch import Tensor from annotator.mmpkg.mmcv.utils import Registry, build_from_cfg, get_logger, print_log INITIALIZERS = Registry('initializer') def update_init_info(module, init_info): """Update the `_params_init_info` in the module if the value of parameters are changed. Args: module (obj:`nn.Module`): The module of PyTorch with a user-defined attribute `_params_init_info` which records the initialization information. init_info (str): The string that describes the initialization. """ assert hasattr( module, '_params_init_info'), f'Can not find `_params_init_info` in {module}' for name, param in module.named_parameters(): assert param in module._params_init_info, ( f'Find a new :obj:`Parameter` ' f'named `{name}` during executing the ' f'`init_weights` of ' f'`{module.__class__.__name__}`. ' f'Please do not add or ' f'replace parameters during executing ' f'the `init_weights`. ') # The parameter has been changed during executing the # `init_weights` of module mean_value = param.data.mean() if module._params_init_info[param]['tmp_mean_value'] != mean_value: module._params_init_info[param]['init_info'] = init_info module._params_init_info[param]['tmp_mean_value'] = mean_value def constant_init(module, val, bias=0): if hasattr(module, 'weight') and module.weight is not None: nn.init.constant_(module.weight, val) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) def xavier_init(module, gain=1, bias=0, distribution='normal'): assert distribution in ['uniform', 'normal'] if hasattr(module, 'weight') and module.weight is not None: if distribution == 'uniform': nn.init.xavier_uniform_(module.weight, gain=gain) else: nn.init.xavier_normal_(module.weight, gain=gain) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) def normal_init(module, mean=0, std=1, bias=0): if hasattr(module, 'weight') and module.weight is not None: nn.init.normal_(module.weight, mean, std) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) def trunc_normal_init(module: nn.Module, mean: float = 0, std: float = 1, a: float = -2, b: float = 2, bias: float = 0) -> None: if hasattr(module, 'weight') and module.weight is not None: trunc_normal_(module.weight, mean, std, a, b) # type: ignore if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) # type: ignore def uniform_init(module, a=0, b=1, bias=0): if hasattr(module, 'weight') and module.weight is not None: nn.init.uniform_(module.weight, a, b) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) def kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal'): assert distribution in ['uniform', 'normal'] if hasattr(module, 'weight') and module.weight is not None: if distribution == 'uniform': nn.init.kaiming_uniform_( module.weight, a=a, mode=mode, nonlinearity=nonlinearity) else: nn.init.kaiming_normal_( module.weight, a=a, mode=mode, nonlinearity=nonlinearity) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) def caffe2_xavier_init(module, bias=0): # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch # Acknowledgment to FAIR's internal code kaiming_init( module, a=1, mode='fan_in', nonlinearity='leaky_relu', bias=bias, distribution='uniform') def bias_init_with_prob(prior_prob): """initialize conv/fc bias value according to a given probability value.""" bias_init = float(-np.log((1 - prior_prob) / prior_prob)) return bias_init def _get_bases_name(m): return [b.__name__ for b in m.__class__.__bases__] class BaseInit(object): def __init__(self, *, bias=0, bias_prob=None, layer=None): self.wholemodule = False if not isinstance(bias, (int, float)): raise TypeError(f'bias must be a number, but got a {type(bias)}') if bias_prob is not None: if not isinstance(bias_prob, float): raise TypeError(f'bias_prob type must be float, \ but got {type(bias_prob)}') if layer is not None: if not isinstance(layer, (str, list)): raise TypeError(f'layer must be a str or a list of str, \ but got a {type(layer)}') else: layer = [] if bias_prob is not None: self.bias = bias_init_with_prob(bias_prob) else: self.bias = bias self.layer = [layer] if isinstance(layer, str) else layer def _get_init_info(self): info = f'{self.__class__.__name__}, bias={self.bias}' return info @INITIALIZERS.register_module(name='Constant') class ConstantInit(BaseInit): """Initialize module parameters with constant values. Args: val (int | float): the value to fill the weights in the module with bias (int | float): the value to fill the bias. Defaults to 0. bias_prob (float, optional): the probability for bias initialization. Defaults to None. layer (str | list[str], optional): the layer will be initialized. Defaults to None. """ def __init__(self, val, **kwargs): super().__init__(**kwargs) self.val = val def __call__(self, module): def init(m): if self.wholemodule: constant_init(m, self.val, self.bias) else: layername = m.__class__.__name__ basesname = _get_bases_name(m) if len(set(self.layer) & set([layername] + basesname)): constant_init(m, self.val, self.bias) module.apply(init) if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) def _get_init_info(self): info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' return info @INITIALIZERS.register_module(name='Xavier') class XavierInit(BaseInit): r"""Initialize module parameters with values according to the method described in `Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010). `_ Args: gain (int | float): an optional scaling factor. Defaults to 1. bias (int | float): the value to fill the bias. Defaults to 0. bias_prob (float, optional): the probability for bias initialization. Defaults to None. distribution (str): distribution either be ``'normal'`` or ``'uniform'``. Defaults to ``'normal'``. layer (str | list[str], optional): the layer will be initialized. Defaults to None. """ def __init__(self, gain=1, distribution='normal', **kwargs): super().__init__(**kwargs) self.gain = gain self.distribution = distribution def __call__(self, module): def init(m): if self.wholemodule: xavier_init(m, self.gain, self.bias, self.distribution) else: layername = m.__class__.__name__ basesname = _get_bases_name(m) if len(set(self.layer) & set([layername] + basesname)): xavier_init(m, self.gain, self.bias, self.distribution) module.apply(init) if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) def _get_init_info(self): info = f'{self.__class__.__name__}: gain={self.gain}, ' \ f'distribution={self.distribution}, bias={self.bias}' return info @INITIALIZERS.register_module(name='Normal') class NormalInit(BaseInit): r"""Initialize module parameters with the values drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. Args: mean (int | float):the mean of the normal distribution. Defaults to 0. std (int | float): the standard deviation of the normal distribution. Defaults to 1. bias (int | float): the value to fill the bias. Defaults to 0. bias_prob (float, optional): the probability for bias initialization. Defaults to None. layer (str | list[str], optional): the layer will be initialized. Defaults to None. """ def __init__(self, mean=0, std=1, **kwargs): super().__init__(**kwargs) self.mean = mean self.std = std def __call__(self, module): def init(m): if self.wholemodule: normal_init(m, self.mean, self.std, self.bias) else: layername = m.__class__.__name__ basesname = _get_bases_name(m) if len(set(self.layer) & set([layername] + basesname)): normal_init(m, self.mean, self.std, self.bias) module.apply(init) if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) def _get_init_info(self): info = f'{self.__class__.__name__}: mean={self.mean},' \ f' std={self.std}, bias={self.bias}' return info @INITIALIZERS.register_module(name='TruncNormal') class TruncNormalInit(BaseInit): r"""Initialize module parameters with the values drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]`. Args: mean (float): the mean of the normal distribution. Defaults to 0. std (float): the standard deviation of the normal distribution. Defaults to 1. a (float): The minimum cutoff value. b ( float): The maximum cutoff value. bias (float): the value to fill the bias. Defaults to 0. bias_prob (float, optional): the probability for bias initialization. Defaults to None. layer (str | list[str], optional): the layer will be initialized. Defaults to None. """ def __init__(self, mean: float = 0, std: float = 1, a: float = -2, b: float = 2, **kwargs) -> None: super().__init__(**kwargs) self.mean = mean self.std = std self.a = a self.b = b def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: trunc_normal_init(m, self.mean, self.std, self.a, self.b, self.bias) else: layername = m.__class__.__name__ basesname = _get_bases_name(m) if len(set(self.layer) & set([layername] + basesname)): trunc_normal_init(m, self.mean, self.std, self.a, self.b, self.bias) module.apply(init) if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) def _get_init_info(self): info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \ f' mean={self.mean}, std={self.std}, bias={self.bias}' return info @INITIALIZERS.register_module(name='Uniform') class UniformInit(BaseInit): r"""Initialize module parameters with values drawn from the uniform distribution :math:`\mathcal{U}(a, b)`. Args: a (int | float): the lower bound of the uniform distribution. Defaults to 0. b (int | float): the upper bound of the uniform distribution. Defaults to 1. bias (int | float): the value to fill the bias. Defaults to 0. bias_prob (float, optional): the probability for bias initialization. Defaults to None. layer (str | list[str], optional): the layer will be initialized. Defaults to None. """ def __init__(self, a=0, b=1, **kwargs): super().__init__(**kwargs) self.a = a self.b = b def __call__(self, module): def init(m): if self.wholemodule: uniform_init(m, self.a, self.b, self.bias) else: layername = m.__class__.__name__ basesname = _get_bases_name(m) if len(set(self.layer) & set([layername] + basesname)): uniform_init(m, self.a, self.b, self.bias) module.apply(init) if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) def _get_init_info(self): info = f'{self.__class__.__name__}: a={self.a},' \ f' b={self.b}, bias={self.bias}' return info @INITIALIZERS.register_module(name='Kaiming') class KaimingInit(BaseInit): r"""Initialize module parameters with the values according to the method described in `Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification - He, K. et al. (2015). `_ Args: a (int | float): the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``). Defaults to 0. mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing ``'fan_in'`` preserves the magnitude of the variance of the weights in the forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the backwards pass. Defaults to ``'fan_out'``. nonlinearity (str): the non-linear function (`nn.functional` name), recommended to use only with ``'relu'`` or ``'leaky_relu'`` . Defaults to 'relu'. bias (int | float): the value to fill the bias. Defaults to 0. bias_prob (float, optional): the probability for bias initialization. Defaults to None. distribution (str): distribution either be ``'normal'`` or ``'uniform'``. Defaults to ``'normal'``. layer (str | list[str], optional): the layer will be initialized. Defaults to None. """ def __init__(self, a=0, mode='fan_out', nonlinearity='relu', distribution='normal', **kwargs): super().__init__(**kwargs) self.a = a self.mode = mode self.nonlinearity = nonlinearity self.distribution = distribution def __call__(self, module): def init(m): if self.wholemodule: kaiming_init(m, self.a, self.mode, self.nonlinearity, self.bias, self.distribution) else: layername = m.__class__.__name__ basesname = _get_bases_name(m) if len(set(self.layer) & set([layername] + basesname)): kaiming_init(m, self.a, self.mode, self.nonlinearity, self.bias, self.distribution) module.apply(init) if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) def _get_init_info(self): info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ f'nonlinearity={self.nonlinearity}, ' \ f'distribution ={self.distribution}, bias={self.bias}' return info @INITIALIZERS.register_module(name='Caffe2Xavier') class Caffe2XavierInit(KaimingInit): # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch # Acknowledgment to FAIR's internal code def __init__(self, **kwargs): super().__init__( a=1, mode='fan_in', nonlinearity='leaky_relu', distribution='uniform', **kwargs) def __call__(self, module): super().__call__(module) @INITIALIZERS.register_module(name='Pretrained') class PretrainedInit(object): """Initialize module by loading a pretrained model. Args: checkpoint (str): the checkpoint file of the pretrained model should be load. prefix (str, optional): the prefix of a sub-module in the pretrained model. it is for loading a part of the pretrained model to initialize. For example, if we would like to only load the backbone of a detector model, we can set ``prefix='backbone.'``. Defaults to None. map_location (str): map tensors into proper locations. """ def __init__(self, checkpoint, prefix=None, map_location=None): self.checkpoint = checkpoint self.prefix = prefix self.map_location = map_location def __call__(self, module): from annotator.mmpkg.mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint, load_state_dict) logger = get_logger('mmcv') if self.prefix is None: print_log(f'load model from: {self.checkpoint}', logger=logger) load_checkpoint( module, self.checkpoint, map_location=self.map_location, strict=False, logger=logger) else: print_log( f'load {self.prefix} in model from: {self.checkpoint}', logger=logger) state_dict = _load_checkpoint_with_prefix( self.prefix, self.checkpoint, map_location=self.map_location) load_state_dict(module, state_dict, strict=False, logger=logger) if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) def _get_init_info(self): info = f'{self.__class__.__name__}: load from {self.checkpoint}' return info def _initialize(module, cfg, wholemodule=False): func = build_from_cfg(cfg, INITIALIZERS) # wholemodule flag is for override mode, there is no layer key in override # and initializer will give init values for the whole module with the name # in override. func.wholemodule = wholemodule func(module) def _initialize_override(module, override, cfg): if not isinstance(override, (dict, list)): raise TypeError(f'override must be a dict or a list of dict, \ but got {type(override)}') override = [override] if isinstance(override, dict) else override for override_ in override: cp_override = copy.deepcopy(override_) name = cp_override.pop('name', None) if name is None: raise ValueError('`override` must contain the key "name",' f'but got {cp_override}') # if override only has name key, it means use args in init_cfg if not cp_override: cp_override.update(cfg) # if override has name key and other args except type key, it will # raise error elif 'type' not in cp_override.keys(): raise ValueError( f'`override` need "type" key, but got {cp_override}') if hasattr(module, name): _initialize(getattr(module, name), cp_override, wholemodule=True) else: raise RuntimeError(f'module did not have attribute {name}, ' f'but init_cfg is {cp_override}.') def initialize(module, init_cfg): """Initialize a module. Args: module (``torch.nn.Module``): the module will be initialized. init_cfg (dict | list[dict]): initialization configuration dict to define initializer. OpenMMLab has implemented 6 initializers including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, ``Kaiming``, and ``Pretrained``. Example: >>> module = nn.Linear(2, 3, bias=True) >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) >>> initialize(module, init_cfg) >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) >>> # define key ``'layer'`` for initializing layer with different >>> # configuration >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1), dict(type='Constant', layer='Linear', val=2)] >>> initialize(module, init_cfg) >>> # define key``'override'`` to initialize some specific part in >>> # module >>> class FooNet(nn.Module): >>> def __init__(self): >>> super().__init__() >>> self.feat = nn.Conv2d(3, 16, 3) >>> self.reg = nn.Conv2d(16, 10, 3) >>> self.cls = nn.Conv2d(16, 5, 3) >>> model = FooNet() >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d', >>> override=dict(type='Constant', name='reg', val=3, bias=4)) >>> initialize(model, init_cfg) >>> model = ResNet(depth=50) >>> # Initialize weights with the pretrained model. >>> init_cfg = dict(type='Pretrained', checkpoint='torchvision://resnet50') >>> initialize(model, init_cfg) >>> # Initialize weights of a sub-module with the specific part of >>> # a pretrained model by using "prefix". >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ >>> 'retinanet_r50_fpn_1x_coco/'\ >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' >>> init_cfg = dict(type='Pretrained', checkpoint=url, prefix='backbone.') """ if not isinstance(init_cfg, (dict, list)): raise TypeError(f'init_cfg must be a dict or a list of dict, \ but got {type(init_cfg)}') if isinstance(init_cfg, dict): init_cfg = [init_cfg] for cfg in init_cfg: # should deeply copy the original config because cfg may be used by # other modules, e.g., one init_cfg shared by multiple bottleneck # blocks, the expected cfg will be changed after pop and will change # the initialization behavior of other modules cp_cfg = copy.deepcopy(cfg) override = cp_cfg.pop('override', None) _initialize(module, cp_cfg) if override is not None: cp_cfg.pop('layer', None) _initialize_override(module, override, cp_cfg) else: # All attributes in module have same initialization. pass def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, b: float) -> Tensor: # Method based on # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf # Modified from # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py def norm_cdf(x): # Computes standard normal cumulative distribution function return (1. + math.erf(x / math.sqrt(2.))) / 2. if (mean < a - 2 * std) or (mean > b + 2 * std): warnings.warn( 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' 'The distribution of values may be incorrect.', stacklevel=2) with torch.no_grad(): # Values are generated by using a truncated uniform distribution and # then using the inverse CDF for the normal distribution. # Get upper and lower cdf values lower = norm_cdf((a - mean) / std) upper = norm_cdf((b - mean) / std) # Uniformly fill tensor with values from [lower, upper], then translate # to [2lower-1, 2upper-1]. tensor.uniform_(2 * lower - 1, 2 * upper - 1) # Use inverse cdf transform for normal distribution to get truncated # standard normal tensor.erfinv_() # Transform to proper mean, std tensor.mul_(std * math.sqrt(2.)) tensor.add_(mean) # Clamp to ensure it's in the proper range tensor.clamp_(min=a, max=b) return tensor def trunc_normal_(tensor: Tensor, mean: float = 0., std: float = 1., a: float = -2., b: float = 2.) -> Tensor: r"""Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for generating the random values works best when :math:`a \leq \text{mean} \leq b`. Modified from https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py Args: tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`. mean (float): the mean of the normal distribution. std (float): the standard deviation of the normal distribution. a (float): the minimum cutoff value. b (float): the maximum cutoff value. """ return _no_grad_trunc_normal_(tensor, mean, std, a, b) ================================================ FILE: annotator/mmpkg/mmcv/cnn/vgg.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import logging import torch.nn as nn from .utils import constant_init, kaiming_init, normal_init def conv3x3(in_planes, out_planes, dilation=1): """3x3 convolution with padding.""" return nn.Conv2d( in_planes, out_planes, kernel_size=3, padding=dilation, dilation=dilation) def make_vgg_layer(inplanes, planes, num_blocks, dilation=1, with_bn=False, ceil_mode=False): layers = [] for _ in range(num_blocks): layers.append(conv3x3(inplanes, planes, dilation)) if with_bn: layers.append(nn.BatchNorm2d(planes)) layers.append(nn.ReLU(inplace=True)) inplanes = planes layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) return layers class VGG(nn.Module): """VGG backbone. Args: depth (int): Depth of vgg, from {11, 13, 16, 19}. with_bn (bool): Use BatchNorm or not. num_classes (int): number of classes for classification. num_stages (int): VGG stages, normally 5. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze running stats (mean and var). bn_frozen (bool): Whether to freeze weight and bias of BN layers. """ arch_settings = { 11: (1, 1, 2, 2, 2), 13: (2, 2, 2, 2, 2), 16: (2, 2, 3, 3, 3), 19: (2, 2, 4, 4, 4) } def __init__(self, depth, with_bn=False, num_classes=-1, num_stages=5, dilations=(1, 1, 1, 1, 1), out_indices=(0, 1, 2, 3, 4), frozen_stages=-1, bn_eval=True, bn_frozen=False, ceil_mode=False, with_last_pool=True): super(VGG, self).__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for vgg') assert num_stages >= 1 and num_stages <= 5 stage_blocks = self.arch_settings[depth] self.stage_blocks = stage_blocks[:num_stages] assert len(dilations) == num_stages assert max(out_indices) <= num_stages self.num_classes = num_classes self.out_indices = out_indices self.frozen_stages = frozen_stages self.bn_eval = bn_eval self.bn_frozen = bn_frozen self.inplanes = 3 start_idx = 0 vgg_layers = [] self.range_sub_modules = [] for i, num_blocks in enumerate(self.stage_blocks): num_modules = num_blocks * (2 + with_bn) + 1 end_idx = start_idx + num_modules dilation = dilations[i] planes = 64 * 2**i if i < 4 else 512 vgg_layer = make_vgg_layer( self.inplanes, planes, num_blocks, dilation=dilation, with_bn=with_bn, ceil_mode=ceil_mode) vgg_layers.extend(vgg_layer) self.inplanes = planes self.range_sub_modules.append([start_idx, end_idx]) start_idx = end_idx if not with_last_pool: vgg_layers.pop(-1) self.range_sub_modules[-1][1] -= 1 self.module_name = 'features' self.add_module(self.module_name, nn.Sequential(*vgg_layers)) if self.num_classes > 0: self.classifier = nn.Sequential( nn.Linear(512 * 7 * 7, 4096), nn.ReLU(True), nn.Dropout(), nn.Linear(4096, 4096), nn.ReLU(True), nn.Dropout(), nn.Linear(4096, num_classes), ) def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, nn.BatchNorm2d): constant_init(m, 1) elif isinstance(m, nn.Linear): normal_init(m, std=0.01) else: raise TypeError('pretrained must be a str or None') def forward(self, x): outs = [] vgg_layers = getattr(self, self.module_name) for i in range(len(self.stage_blocks)): for j in range(*self.range_sub_modules[i]): vgg_layer = vgg_layers[j] x = vgg_layer(x) if i in self.out_indices: outs.append(x) if self.num_classes > 0: x = x.view(x.size(0), -1) x = self.classifier(x) outs.append(x) if len(outs) == 1: return outs[0] else: return tuple(outs) def train(self, mode=True): super(VGG, self).train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): m.eval() if self.bn_frozen: for params in m.parameters(): params.requires_grad = False vgg_layers = getattr(self, self.module_name) if mode and self.frozen_stages >= 0: for i in range(self.frozen_stages): for j in range(*self.range_sub_modules[i]): mod = vgg_layers[j] mod.eval() for param in mod.parameters(): param.requires_grad = False ================================================ FILE: annotator/mmpkg/mmcv/engine/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, single_gpu_test) __all__ = [ 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', 'single_gpu_test' ] ================================================ FILE: annotator/mmpkg/mmcv/engine/test.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp import pickle import shutil import tempfile import time import torch import torch.distributed as dist import annotator.mmpkg.mmcv as mmcv from annotator.mmpkg.mmcv.runner import get_dist_info def single_gpu_test(model, data_loader): """Test model with a single gpu. This method tests model with a single gpu and displays test progress bar. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. Returns: list: The prediction results. """ model.eval() results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) for data in data_loader: with torch.no_grad(): result = model(return_loss=False, **data) results.extend(result) # Assume result has the same length of batch_size # refer to https://github.com/open-mmlab/mmcv/issues/985 batch_size = len(result) for _ in range(batch_size): prog_bar.update() return results def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting ``gpu_collect=True``, it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to ``tmpdir`` and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: list: The prediction results. """ model.eval() results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) time.sleep(2) # This line can prevent deadlock problem in some cases. for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, **data) results.extend(result) if rank == 0: batch_size = len(result) batch_size_all = batch_size * world_size if batch_size_all + prog_bar.completed > len(dataset): batch_size_all = len(dataset) - prog_bar.completed for _ in range(batch_size_all): prog_bar.update() # collect results from all ranks if gpu_collect: results = collect_results_gpu(results, len(dataset)) else: results = collect_results_cpu(results, len(dataset), tmpdir) return results def collect_results_cpu(result_part, size, tmpdir=None): """Collect results under cpu mode. On cpu mode, this function will save the results on different gpus to ``tmpdir`` and collect them by the rank 0 worker. Args: result_part (list): Result list containing result parts to be collected. size (int): Size of the results, commonly equal to length of the results. tmpdir (str | None): temporal directory for collected results to store. If set to None, it will create a random temporal directory for it. Returns: list: The collected results. """ rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: mmcv.mkdir_or_exist('.dist_test') tmpdir = tempfile.mkdtemp(dir='.dist_test') tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, f'part_{i}.pkl') part_result = mmcv.load(part_file) # When data is severely insufficient, an empty part_result # on a certain gpu could makes the overall outputs empty. if part_result: part_list.append(part_result) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def collect_results_gpu(result_part, size): """Collect results under gpu mode. On gpu mode, this function will encode results to gpu tensors and use gpu communication for results collection. Args: result_part (list): Result list containing result parts to be collected. size (int): Size of the results, commonly equal to length of the results. Returns: list: The collected results. """ rank, world_size = get_dist_info() # dump result part to tensor with pickle part_tensor = torch.tensor( bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') # gather all result part tensor shape shape_tensor = torch.tensor(part_tensor.shape, device='cuda') shape_list = [shape_tensor.clone() for _ in range(world_size)] dist.all_gather(shape_list, shape_tensor) # padding result part tensor to max length shape_max = torch.tensor(shape_list).max() part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') part_send[:shape_tensor[0]] = part_tensor part_recv_list = [ part_tensor.new_zeros(shape_max) for _ in range(world_size) ] # gather all result part dist.all_gather(part_recv_list, part_send) if rank == 0: part_list = [] for recv, shape in zip(part_recv_list, shape_list): part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) # When data is severely insufficient, an empty part_result # on a certain gpu could makes the overall outputs empty. if part_result: part_list.append(part_result) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results ================================================ FILE: annotator/mmpkg/mmcv/fileio/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .file_client import BaseStorageBackend, FileClient from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler from .io import dump, load, register_handler from .parse import dict_from_file, list_from_file __all__ = [ 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', 'list_from_file', 'dict_from_file' ] ================================================ FILE: annotator/mmpkg/mmcv/fileio/file_client.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import inspect import os import os.path as osp import re import tempfile import warnings from abc import ABCMeta, abstractmethod from contextlib import contextmanager from pathlib import Path from typing import Iterable, Iterator, Optional, Tuple, Union from urllib.request import urlopen import annotator.mmpkg.mmcv as mmcv from annotator.mmpkg.mmcv.utils.misc import has_method from annotator.mmpkg.mmcv.utils.path import is_filepath class BaseStorageBackend(metaclass=ABCMeta): """Abstract class of storage backends. All backends need to implement two apis: ``get()`` and ``get_text()``. ``get()`` reads the file as a byte stream and ``get_text()`` reads the file as texts. """ # a flag to indicate whether the backend can create a symlink for a file _allow_symlink = False @property def name(self): return self.__class__.__name__ @property def allow_symlink(self): return self._allow_symlink @abstractmethod def get(self, filepath): pass @abstractmethod def get_text(self, filepath): pass class CephBackend(BaseStorageBackend): """Ceph storage backend (for internal use). Args: path_mapping (dict|None): path mapping dict from local path to Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath`` will be replaced by ``dst``. Default: None. .. warning:: :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. """ def __init__(self, path_mapping=None): try: import ceph except ImportError: raise ImportError('Please install ceph to enable CephBackend.') warnings.warn( 'CephBackend will be deprecated, please use PetrelBackend instead') self._client = ceph.S3Client() assert isinstance(path_mapping, dict) or path_mapping is None self.path_mapping = path_mapping def get(self, filepath): filepath = str(filepath) if self.path_mapping is not None: for k, v in self.path_mapping.items(): filepath = filepath.replace(k, v) value = self._client.Get(filepath) value_buf = memoryview(value) return value_buf def get_text(self, filepath, encoding=None): raise NotImplementedError class PetrelBackend(BaseStorageBackend): """Petrel storage backend (for internal use). PetrelBackend supports reading and writing data to multiple clusters. If the file path contains the cluster name, PetrelBackend will read data from specified cluster or write data to it. Otherwise, PetrelBackend will access the default cluster. Args: path_mapping (dict, optional): Path mapping dict from local path to Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath`` will be replaced by ``dst``. Default: None. enable_mc (bool, optional): Whether to enable memcached support. Default: True. Examples: >>> filepath1 = 's3://path/of/file' >>> filepath2 = 'cluster-name:s3://path/of/file' >>> client = PetrelBackend() >>> client.get(filepath1) # get data from default cluster >>> client.get(filepath2) # get data from 'cluster-name' cluster """ def __init__(self, path_mapping: Optional[dict] = None, enable_mc: bool = True): try: from petrel_client import client except ImportError: raise ImportError('Please install petrel_client to enable ' 'PetrelBackend.') self._client = client.Client(enable_mc=enable_mc) assert isinstance(path_mapping, dict) or path_mapping is None self.path_mapping = path_mapping def _map_path(self, filepath: Union[str, Path]) -> str: """Map ``filepath`` to a string path whose prefix will be replaced by :attr:`self.path_mapping`. Args: filepath (str): Path to be mapped. """ filepath = str(filepath) if self.path_mapping is not None: for k, v in self.path_mapping.items(): filepath = filepath.replace(k, v) return filepath def _format_path(self, filepath: str) -> str: """Convert a ``filepath`` to standard format of petrel oss. If the ``filepath`` is concatenated by ``os.path.join``, in a Windows environment, the ``filepath`` will be the format of 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the above ``filepath`` will be converted to 's3://bucket_name/image.jpg'. Args: filepath (str): Path to be formatted. """ return re.sub(r'\\+', '/', filepath) def get(self, filepath: Union[str, Path]) -> memoryview: """Read data from a given ``filepath`` with 'rb' mode. Args: filepath (str or Path): Path to read data. Returns: memoryview: A memory view of expected bytes object to avoid copying. The memoryview object can be converted to bytes by ``value_buf.tobytes()``. """ filepath = self._map_path(filepath) filepath = self._format_path(filepath) value = self._client.Get(filepath) value_buf = memoryview(value) return value_buf def get_text(self, filepath: Union[str, Path], encoding: str = 'utf-8') -> str: """Read data from a given ``filepath`` with 'r' mode. Args: filepath (str or Path): Path to read data. encoding (str): The encoding format used to open the ``filepath``. Default: 'utf-8'. Returns: str: Expected text reading from ``filepath``. """ return str(self.get(filepath), encoding=encoding) def put(self, obj: bytes, filepath: Union[str, Path]) -> None: """Save data to a given ``filepath``. Args: obj (bytes): Data to be saved. filepath (str or Path): Path to write data. """ filepath = self._map_path(filepath) filepath = self._format_path(filepath) self._client.put(filepath, obj) def put_text(self, obj: str, filepath: Union[str, Path], encoding: str = 'utf-8') -> None: """Save data to a given ``filepath``. Args: obj (str): Data to be written. filepath (str or Path): Path to write data. encoding (str): The encoding format used to encode the ``obj``. Default: 'utf-8'. """ self.put(bytes(obj, encoding=encoding), filepath) def remove(self, filepath: Union[str, Path]) -> None: """Remove a file. Args: filepath (str or Path): Path to be removed. """ if not has_method(self._client, 'delete'): raise NotImplementedError( ('Current version of Petrel Python SDK has not supported ' 'the `delete` method, please use a higher version or dev' ' branch instead.')) filepath = self._map_path(filepath) filepath = self._format_path(filepath) self._client.delete(filepath) def exists(self, filepath: Union[str, Path]) -> bool: """Check whether a file path exists. Args: filepath (str or Path): Path to be checked whether exists. Returns: bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. """ if not (has_method(self._client, 'contains') and has_method(self._client, 'isdir')): raise NotImplementedError( ('Current version of Petrel Python SDK has not supported ' 'the `contains` and `isdir` methods, please use a higher' 'version or dev branch instead.')) filepath = self._map_path(filepath) filepath = self._format_path(filepath) return self._client.contains(filepath) or self._client.isdir(filepath) def isdir(self, filepath: Union[str, Path]) -> bool: """Check whether a file path is a directory. Args: filepath (str or Path): Path to be checked whether it is a directory. Returns: bool: Return ``True`` if ``filepath`` points to a directory, ``False`` otherwise. """ if not has_method(self._client, 'isdir'): raise NotImplementedError( ('Current version of Petrel Python SDK has not supported ' 'the `isdir` method, please use a higher version or dev' ' branch instead.')) filepath = self._map_path(filepath) filepath = self._format_path(filepath) return self._client.isdir(filepath) def isfile(self, filepath: Union[str, Path]) -> bool: """Check whether a file path is a file. Args: filepath (str or Path): Path to be checked whether it is a file. Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` otherwise. """ if not has_method(self._client, 'contains'): raise NotImplementedError( ('Current version of Petrel Python SDK has not supported ' 'the `contains` method, please use a higher version or ' 'dev branch instead.')) filepath = self._map_path(filepath) filepath = self._format_path(filepath) return self._client.contains(filepath) def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> str: """Concatenate all file paths. Args: filepath (str or Path): Path to be concatenated. Returns: str: The result after concatenation. """ filepath = self._format_path(self._map_path(filepath)) if filepath.endswith('/'): filepath = filepath[:-1] formatted_paths = [filepath] for path in filepaths: formatted_paths.append(self._format_path(self._map_path(path))) return '/'.join(formatted_paths) @contextmanager def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: """Download a file from ``filepath`` and return a temporary path. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It can be called with ``with`` statement, and when exists from the ``with`` statement, the temporary path will be released. Args: filepath (str | Path): Download a file from ``filepath``. Examples: >>> client = PetrelBackend() >>> # After existing from the ``with`` clause, >>> # the path will be removed >>> with client.get_local_path('s3://path/of/your/file') as path: ... # do something here Yields: Iterable[str]: Only yield one temporary path. """ filepath = self._map_path(filepath) filepath = self._format_path(filepath) assert self.isfile(filepath) try: f = tempfile.NamedTemporaryFile(delete=False) f.write(self.get(filepath)) f.close() yield f.name finally: os.remove(f.name) def list_dir_or_file(self, dir_path: Union[str, Path], list_dir: bool = True, list_file: bool = True, suffix: Optional[Union[str, Tuple[str]]] = None, recursive: bool = False) -> Iterator[str]: """Scan a directory to find the interested directories or files in arbitrary order. Note: Petrel has no concept of directories but it simulates the directory hierarchy in the filesystem through public prefixes. In addition, if the returned path ends with '/', it means the path is a public prefix which is a logical directory. Note: :meth:`list_dir_or_file` returns the path relative to ``dir_path``. In addition, the returned path of directory will not contains the suffix '/' which is consistent with other backends. Args: dir_path (str | Path): Path of the directory. list_dir (bool): List the directories. Default: True. list_file (bool): List the path of files. Default: True. suffix (str or tuple[str], optional): File suffix that we are interested in. Default: None. recursive (bool): If set to True, recursively scan the directory. Default: False. Yields: Iterable[str]: A relative path to ``dir_path``. """ if not has_method(self._client, 'list'): raise NotImplementedError( ('Current version of Petrel Python SDK has not supported ' 'the `list` method, please use a higher version or dev' ' branch instead.')) dir_path = self._map_path(dir_path) dir_path = self._format_path(dir_path) if list_dir and suffix is not None: raise TypeError( '`list_dir` should be False when `suffix` is not None') if (suffix is not None) and not isinstance(suffix, (str, tuple)): raise TypeError('`suffix` must be a string or tuple of strings') # Petrel's simulated directory hierarchy assumes that directory paths # should end with `/` if not dir_path.endswith('/'): dir_path += '/' root = dir_path def _list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive): for path in self._client.list(dir_path): # the `self.isdir` is not used here to determine whether path # is a directory, because `self.isdir` relies on # `self._client.list` if path.endswith('/'): # a directory path next_dir_path = self.join_path(dir_path, path) if list_dir: # get the relative path and exclude the last # character '/' rel_dir = next_dir_path[len(root):-1] yield rel_dir if recursive: yield from _list_dir_or_file(next_dir_path, list_dir, list_file, suffix, recursive) else: # a file path absolute_path = self.join_path(dir_path, path) rel_path = absolute_path[len(root):] if (suffix is None or rel_path.endswith(suffix)) and list_file: yield rel_path return _list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive) class MemcachedBackend(BaseStorageBackend): """Memcached storage backend. Attributes: server_list_cfg (str): Config file for memcached server list. client_cfg (str): Config file for memcached client. sys_path (str | None): Additional path to be appended to `sys.path`. Default: None. """ def __init__(self, server_list_cfg, client_cfg, sys_path=None): if sys_path is not None: import sys sys.path.append(sys_path) try: import mc except ImportError: raise ImportError( 'Please install memcached to enable MemcachedBackend.') self.server_list_cfg = server_list_cfg self.client_cfg = client_cfg self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg, self.client_cfg) # mc.pyvector servers as a point which points to a memory cache self._mc_buffer = mc.pyvector() def get(self, filepath): filepath = str(filepath) import mc self._client.Get(filepath, self._mc_buffer) value_buf = mc.ConvertBuffer(self._mc_buffer) return value_buf def get_text(self, filepath, encoding=None): raise NotImplementedError class LmdbBackend(BaseStorageBackend): """Lmdb storage backend. Args: db_path (str): Lmdb database path. readonly (bool, optional): Lmdb environment parameter. If True, disallow any write operations. Default: True. lock (bool, optional): Lmdb environment parameter. If False, when concurrent access occurs, do not lock the database. Default: False. readahead (bool, optional): Lmdb environment parameter. If False, disable the OS filesystem readahead mechanism, which may improve random read performance when a database is larger than RAM. Default: False. Attributes: db_path (str): Lmdb database path. """ def __init__(self, db_path, readonly=True, lock=False, readahead=False, **kwargs): try: import lmdb except ImportError: raise ImportError('Please install lmdb to enable LmdbBackend.') self.db_path = str(db_path) self._client = lmdb.open( self.db_path, readonly=readonly, lock=lock, readahead=readahead, **kwargs) def get(self, filepath): """Get values according to the filepath. Args: filepath (str | obj:`Path`): Here, filepath is the lmdb key. """ filepath = str(filepath) with self._client.begin(write=False) as txn: value_buf = txn.get(filepath.encode('ascii')) return value_buf def get_text(self, filepath, encoding=None): raise NotImplementedError class HardDiskBackend(BaseStorageBackend): """Raw hard disks storage backend.""" _allow_symlink = True def get(self, filepath: Union[str, Path]) -> bytes: """Read data from a given ``filepath`` with 'rb' mode. Args: filepath (str or Path): Path to read data. Returns: bytes: Expected bytes object. """ with open(filepath, 'rb') as f: value_buf = f.read() return value_buf def get_text(self, filepath: Union[str, Path], encoding: str = 'utf-8') -> str: """Read data from a given ``filepath`` with 'r' mode. Args: filepath (str or Path): Path to read data. encoding (str): The encoding format used to open the ``filepath``. Default: 'utf-8'. Returns: str: Expected text reading from ``filepath``. """ with open(filepath, 'r', encoding=encoding) as f: value_buf = f.read() return value_buf def put(self, obj: bytes, filepath: Union[str, Path]) -> None: """Write data to a given ``filepath`` with 'wb' mode. Note: ``put`` will create a directory if the directory of ``filepath`` does not exist. Args: obj (bytes): Data to be written. filepath (str or Path): Path to write data. """ mmcv.mkdir_or_exist(osp.dirname(filepath)) with open(filepath, 'wb') as f: f.write(obj) def put_text(self, obj: str, filepath: Union[str, Path], encoding: str = 'utf-8') -> None: """Write data to a given ``filepath`` with 'w' mode. Note: ``put_text`` will create a directory if the directory of ``filepath`` does not exist. Args: obj (str): Data to be written. filepath (str or Path): Path to write data. encoding (str): The encoding format used to open the ``filepath``. Default: 'utf-8'. """ mmcv.mkdir_or_exist(osp.dirname(filepath)) with open(filepath, 'w', encoding=encoding) as f: f.write(obj) def remove(self, filepath: Union[str, Path]) -> None: """Remove a file. Args: filepath (str or Path): Path to be removed. """ os.remove(filepath) def exists(self, filepath: Union[str, Path]) -> bool: """Check whether a file path exists. Args: filepath (str or Path): Path to be checked whether exists. Returns: bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. """ return osp.exists(filepath) def isdir(self, filepath: Union[str, Path]) -> bool: """Check whether a file path is a directory. Args: filepath (str or Path): Path to be checked whether it is a directory. Returns: bool: Return ``True`` if ``filepath`` points to a directory, ``False`` otherwise. """ return osp.isdir(filepath) def isfile(self, filepath: Union[str, Path]) -> bool: """Check whether a file path is a file. Args: filepath (str or Path): Path to be checked whether it is a file. Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` otherwise. """ return osp.isfile(filepath) def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> str: """Concatenate all file paths. Join one or more filepath components intelligently. The return value is the concatenation of filepath and any members of *filepaths. Args: filepath (str or Path): Path to be concatenated. Returns: str: The result of concatenation. """ return osp.join(filepath, *filepaths) @contextmanager def get_local_path( self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]: """Only for unified API and do nothing.""" yield filepath def list_dir_or_file(self, dir_path: Union[str, Path], list_dir: bool = True, list_file: bool = True, suffix: Optional[Union[str, Tuple[str]]] = None, recursive: bool = False) -> Iterator[str]: """Scan a directory to find the interested directories or files in arbitrary order. Note: :meth:`list_dir_or_file` returns the path relative to ``dir_path``. Args: dir_path (str | Path): Path of the directory. list_dir (bool): List the directories. Default: True. list_file (bool): List the path of files. Default: True. suffix (str or tuple[str], optional): File suffix that we are interested in. Default: None. recursive (bool): If set to True, recursively scan the directory. Default: False. Yields: Iterable[str]: A relative path to ``dir_path``. """ if list_dir and suffix is not None: raise TypeError('`suffix` should be None when `list_dir` is True') if (suffix is not None) and not isinstance(suffix, (str, tuple)): raise TypeError('`suffix` must be a string or tuple of strings') root = dir_path def _list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive): for entry in os.scandir(dir_path): if not entry.name.startswith('.') and entry.is_file(): rel_path = osp.relpath(entry.path, root) if (suffix is None or rel_path.endswith(suffix)) and list_file: yield rel_path elif osp.isdir(entry.path): if list_dir: rel_dir = osp.relpath(entry.path, root) yield rel_dir if recursive: yield from _list_dir_or_file(entry.path, list_dir, list_file, suffix, recursive) return _list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive) class HTTPBackend(BaseStorageBackend): """HTTP and HTTPS storage bachend.""" def get(self, filepath): value_buf = urlopen(filepath).read() return value_buf def get_text(self, filepath, encoding='utf-8'): value_buf = urlopen(filepath).read() return value_buf.decode(encoding) @contextmanager def get_local_path(self, filepath: str) -> Iterable[str]: """Download a file from ``filepath``. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It can be called with ``with`` statement, and when exists from the ``with`` statement, the temporary path will be released. Args: filepath (str): Download a file from ``filepath``. Examples: >>> client = HTTPBackend() >>> # After existing from the ``with`` clause, >>> # the path will be removed >>> with client.get_local_path('http://path/of/your/file') as path: ... # do something here """ try: f = tempfile.NamedTemporaryFile(delete=False) f.write(self.get(filepath)) f.close() yield f.name finally: os.remove(f.name) class FileClient: """A general file client to access files in different backends. The client loads a file or text in a specified backend from its path and returns it as a binary or text file. There are two ways to choose a backend, the name of backend and the prefix of path. Although both of them can be used to choose a storage backend, ``backend`` has a higher priority that is if they are all set, the storage backend will be chosen by the backend argument. If they are all `None`, the disk backend will be chosen. Note that It can also register other backend accessor with a given name, prefixes, and backend class. In addition, We use the singleton pattern to avoid repeated object creation. If the arguments are the same, the same object will be returned. Args: backend (str, optional): The storage backend type. Options are "disk", "ceph", "memcached", "lmdb", "http" and "petrel". Default: None. prefix (str, optional): The prefix of the registered storage backend. Options are "s3", "http", "https". Default: None. Examples: >>> # only set backend >>> file_client = FileClient(backend='petrel') >>> # only set prefix >>> file_client = FileClient(prefix='s3') >>> # set both backend and prefix but use backend to choose client >>> file_client = FileClient(backend='petrel', prefix='s3') >>> # if the arguments are the same, the same object is returned >>> file_client1 = FileClient(backend='petrel') >>> file_client1 is file_client True Attributes: client (:obj:`BaseStorageBackend`): The backend object. """ _backends = { 'disk': HardDiskBackend, 'ceph': CephBackend, 'memcached': MemcachedBackend, 'lmdb': LmdbBackend, 'petrel': PetrelBackend, 'http': HTTPBackend, } # This collection is used to record the overridden backends, and when a # backend appears in the collection, the singleton pattern is disabled for # that backend, because if the singleton pattern is used, then the object # returned will be the backend before overwriting _overridden_backends = set() _prefix_to_backends = { 's3': PetrelBackend, 'http': HTTPBackend, 'https': HTTPBackend, } _overridden_prefixes = set() _instances = {} def __new__(cls, backend=None, prefix=None, **kwargs): if backend is None and prefix is None: backend = 'disk' if backend is not None and backend not in cls._backends: raise ValueError( f'Backend {backend} is not supported. Currently supported ones' f' are {list(cls._backends.keys())}') if prefix is not None and prefix not in cls._prefix_to_backends: raise ValueError( f'prefix {prefix} is not supported. Currently supported ones ' f'are {list(cls._prefix_to_backends.keys())}') # concatenate the arguments to a unique key for determining whether # objects with the same arguments were created arg_key = f'{backend}:{prefix}' for key, value in kwargs.items(): arg_key += f':{key}:{value}' # if a backend was overridden, it will create a new object if (arg_key in cls._instances and backend not in cls._overridden_backends and prefix not in cls._overridden_prefixes): _instance = cls._instances[arg_key] else: # create a new object and put it to _instance _instance = super().__new__(cls) if backend is not None: _instance.client = cls._backends[backend](**kwargs) else: _instance.client = cls._prefix_to_backends[prefix](**kwargs) cls._instances[arg_key] = _instance return _instance @property def name(self): return self.client.name @property def allow_symlink(self): return self.client.allow_symlink @staticmethod def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]: """Parse the prefix of a uri. Args: uri (str | Path): Uri to be parsed that contains the file prefix. Examples: >>> FileClient.parse_uri_prefix('s3://path/of/your/file') 's3' Returns: str | None: Return the prefix of uri if the uri contains '://' else ``None``. """ assert is_filepath(uri) uri = str(uri) if '://' not in uri: return None else: prefix, _ = uri.split('://') # In the case of PetrelBackend, the prefix may contains the cluster # name like clusterName:s3 if ':' in prefix: _, prefix = prefix.split(':') return prefix @classmethod def infer_client(cls, file_client_args: Optional[dict] = None, uri: Optional[Union[str, Path]] = None) -> 'FileClient': """Infer a suitable file client based on the URI and arguments. Args: file_client_args (dict, optional): Arguments to instantiate a FileClient. Default: None. uri (str | Path, optional): Uri to be parsed that contains the file prefix. Default: None. Examples: >>> uri = 's3://path/of/your/file' >>> file_client = FileClient.infer_client(uri=uri) >>> file_client_args = {'backend': 'petrel'} >>> file_client = FileClient.infer_client(file_client_args) Returns: FileClient: Instantiated FileClient object. """ assert file_client_args is not None or uri is not None if file_client_args is None: file_prefix = cls.parse_uri_prefix(uri) # type: ignore return cls(prefix=file_prefix) else: return cls(**file_client_args) @classmethod def _register_backend(cls, name, backend, force=False, prefixes=None): if not isinstance(name, str): raise TypeError('the backend name should be a string, ' f'but got {type(name)}') if not inspect.isclass(backend): raise TypeError( f'backend should be a class but got {type(backend)}') if not issubclass(backend, BaseStorageBackend): raise TypeError( f'backend {backend} is not a subclass of BaseStorageBackend') if not force and name in cls._backends: raise KeyError( f'{name} is already registered as a storage backend, ' 'add "force=True" if you want to override it') if name in cls._backends and force: cls._overridden_backends.add(name) cls._backends[name] = backend if prefixes is not None: if isinstance(prefixes, str): prefixes = [prefixes] else: assert isinstance(prefixes, (list, tuple)) for prefix in prefixes: if prefix not in cls._prefix_to_backends: cls._prefix_to_backends[prefix] = backend elif (prefix in cls._prefix_to_backends) and force: cls._overridden_prefixes.add(prefix) cls._prefix_to_backends[prefix] = backend else: raise KeyError( f'{prefix} is already registered as a storage backend,' ' add "force=True" if you want to override it') @classmethod def register_backend(cls, name, backend=None, force=False, prefixes=None): """Register a backend to FileClient. This method can be used as a normal class method or a decorator. .. code-block:: python class NewBackend(BaseStorageBackend): def get(self, filepath): return filepath def get_text(self, filepath): return filepath FileClient.register_backend('new', NewBackend) or .. code-block:: python @FileClient.register_backend('new') class NewBackend(BaseStorageBackend): def get(self, filepath): return filepath def get_text(self, filepath): return filepath Args: name (str): The name of the registered backend. backend (class, optional): The backend class to be registered, which must be a subclass of :class:`BaseStorageBackend`. When this method is used as a decorator, backend is None. Defaults to None. force (bool, optional): Whether to override the backend if the name has already been registered. Defaults to False. prefixes (str or list[str] or tuple[str], optional): The prefixes of the registered storage backend. Default: None. `New in version 1.3.15.` """ if backend is not None: cls._register_backend( name, backend, force=force, prefixes=prefixes) return def _register(backend_cls): cls._register_backend( name, backend_cls, force=force, prefixes=prefixes) return backend_cls return _register def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]: """Read data from a given ``filepath`` with 'rb' mode. Note: There are two types of return values for ``get``, one is ``bytes`` and the other is ``memoryview``. The advantage of using memoryview is that you can avoid copying, and if you want to convert it to ``bytes``, you can use ``.tobytes()``. Args: filepath (str or Path): Path to read data. Returns: bytes | memoryview: Expected bytes object or a memory view of the bytes object. """ return self.client.get(filepath) def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str: """Read data from a given ``filepath`` with 'r' mode. Args: filepath (str or Path): Path to read data. encoding (str): The encoding format used to open the ``filepath``. Default: 'utf-8'. Returns: str: Expected text reading from ``filepath``. """ return self.client.get_text(filepath, encoding) def put(self, obj: bytes, filepath: Union[str, Path]) -> None: """Write data to a given ``filepath`` with 'wb' mode. Note: ``put`` should create a directory if the directory of ``filepath`` does not exist. Args: obj (bytes): Data to be written. filepath (str or Path): Path to write data. """ self.client.put(obj, filepath) def put_text(self, obj: str, filepath: Union[str, Path]) -> None: """Write data to a given ``filepath`` with 'w' mode. Note: ``put_text`` should create a directory if the directory of ``filepath`` does not exist. Args: obj (str): Data to be written. filepath (str or Path): Path to write data. encoding (str, optional): The encoding format used to open the `filepath`. Default: 'utf-8'. """ self.client.put_text(obj, filepath) def remove(self, filepath: Union[str, Path]) -> None: """Remove a file. Args: filepath (str, Path): Path to be removed. """ self.client.remove(filepath) def exists(self, filepath: Union[str, Path]) -> bool: """Check whether a file path exists. Args: filepath (str or Path): Path to be checked whether exists. Returns: bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. """ return self.client.exists(filepath) def isdir(self, filepath: Union[str, Path]) -> bool: """Check whether a file path is a directory. Args: filepath (str or Path): Path to be checked whether it is a directory. Returns: bool: Return ``True`` if ``filepath`` points to a directory, ``False`` otherwise. """ return self.client.isdir(filepath) def isfile(self, filepath: Union[str, Path]) -> bool: """Check whether a file path is a file. Args: filepath (str or Path): Path to be checked whether it is a file. Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` otherwise. """ return self.client.isfile(filepath) def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> str: """Concatenate all file paths. Join one or more filepath components intelligently. The return value is the concatenation of filepath and any members of *filepaths. Args: filepath (str or Path): Path to be concatenated. Returns: str: The result of concatenation. """ return self.client.join_path(filepath, *filepaths) @contextmanager def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: """Download data from ``filepath`` and write the data to local path. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It can be called with ``with`` statement, and when exists from the ``with`` statement, the temporary path will be released. Note: If the ``filepath`` is a local path, just return itself. .. warning:: ``get_local_path`` is an experimental interface that may change in the future. Args: filepath (str or Path): Path to be read data. Examples: >>> file_client = FileClient(prefix='s3') >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path: ... # do something here Yields: Iterable[str]: Only yield one path. """ with self.client.get_local_path(str(filepath)) as local_path: yield local_path def list_dir_or_file(self, dir_path: Union[str, Path], list_dir: bool = True, list_file: bool = True, suffix: Optional[Union[str, Tuple[str]]] = None, recursive: bool = False) -> Iterator[str]: """Scan a directory to find the interested directories or files in arbitrary order. Note: :meth:`list_dir_or_file` returns the path relative to ``dir_path``. Args: dir_path (str | Path): Path of the directory. list_dir (bool): List the directories. Default: True. list_file (bool): List the path of files. Default: True. suffix (str or tuple[str], optional): File suffix that we are interested in. Default: None. recursive (bool): If set to True, recursively scan the directory. Default: False. Yields: Iterable[str]: A relative path to ``dir_path``. """ yield from self.client.list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive) ================================================ FILE: annotator/mmpkg/mmcv/fileio/handlers/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base import BaseFileHandler from .json_handler import JsonHandler from .pickle_handler import PickleHandler from .yaml_handler import YamlHandler __all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler'] ================================================ FILE: annotator/mmpkg/mmcv/fileio/handlers/base.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod class BaseFileHandler(metaclass=ABCMeta): # `str_like` is a flag to indicate whether the type of file object is # str-like object or bytes-like object. Pickle only processes bytes-like # objects but json only processes str-like object. If it is str-like # object, `StringIO` will be used to process the buffer. str_like = True @abstractmethod def load_from_fileobj(self, file, **kwargs): pass @abstractmethod def dump_to_fileobj(self, obj, file, **kwargs): pass @abstractmethod def dump_to_str(self, obj, **kwargs): pass def load_from_path(self, filepath, mode='r', **kwargs): with open(filepath, mode) as f: return self.load_from_fileobj(f, **kwargs) def dump_to_path(self, obj, filepath, mode='w', **kwargs): with open(filepath, mode) as f: self.dump_to_fileobj(obj, f, **kwargs) ================================================ FILE: annotator/mmpkg/mmcv/fileio/handlers/json_handler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import json import numpy as np from .base import BaseFileHandler def set_default(obj): """Set default json values for non-serializable values. It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. It also converts ``np.generic`` (including ``np.int32``, ``np.float32``, etc.) into plain numbers of plain python built-in types. """ if isinstance(obj, (set, range)): return list(obj) elif isinstance(obj, np.ndarray): return obj.tolist() elif isinstance(obj, np.generic): return obj.item() raise TypeError(f'{type(obj)} is unsupported for json dump') class JsonHandler(BaseFileHandler): def load_from_fileobj(self, file): return json.load(file) def dump_to_fileobj(self, obj, file, **kwargs): kwargs.setdefault('default', set_default) json.dump(obj, file, **kwargs) def dump_to_str(self, obj, **kwargs): kwargs.setdefault('default', set_default) return json.dumps(obj, **kwargs) ================================================ FILE: annotator/mmpkg/mmcv/fileio/handlers/pickle_handler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import pickle from .base import BaseFileHandler class PickleHandler(BaseFileHandler): str_like = False def load_from_fileobj(self, file, **kwargs): return pickle.load(file, **kwargs) def load_from_path(self, filepath, **kwargs): return super(PickleHandler, self).load_from_path( filepath, mode='rb', **kwargs) def dump_to_str(self, obj, **kwargs): kwargs.setdefault('protocol', 2) return pickle.dumps(obj, **kwargs) def dump_to_fileobj(self, obj, file, **kwargs): kwargs.setdefault('protocol', 2) pickle.dump(obj, file, **kwargs) def dump_to_path(self, obj, filepath, **kwargs): super(PickleHandler, self).dump_to_path( obj, filepath, mode='wb', **kwargs) ================================================ FILE: annotator/mmpkg/mmcv/fileio/handlers/yaml_handler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import yaml try: from yaml import CLoader as Loader, CDumper as Dumper except ImportError: from yaml import Loader, Dumper from .base import BaseFileHandler # isort:skip class YamlHandler(BaseFileHandler): def load_from_fileobj(self, file, **kwargs): kwargs.setdefault('Loader', Loader) return yaml.load(file, **kwargs) def dump_to_fileobj(self, obj, file, **kwargs): kwargs.setdefault('Dumper', Dumper) yaml.dump(obj, file, **kwargs) def dump_to_str(self, obj, **kwargs): kwargs.setdefault('Dumper', Dumper) return yaml.dump(obj, **kwargs) ================================================ FILE: annotator/mmpkg/mmcv/fileio/io.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from io import BytesIO, StringIO from pathlib import Path from ..utils import is_list_of, is_str from .file_client import FileClient from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler file_handlers = { 'json': JsonHandler(), 'yaml': YamlHandler(), 'yml': YamlHandler(), 'pickle': PickleHandler(), 'pkl': PickleHandler() } def load(file, file_format=None, file_client_args=None, **kwargs): """Load data from json/yaml/pickle files. This method provides a unified api for loading data from serialized files. Note: In v1.3.16 and later, ``load`` supports loading data from serialized files those can be storaged in different backends. Args: file (str or :obj:`Path` or file-like object): Filename or a file-like object. file_format (str, optional): If not specified, the file format will be inferred from the file extension, otherwise use the specified one. Currently supported formats include "json", "yaml/yml" and "pickle/pkl". file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. Examples: >>> load('/path/of/your/file') # file is storaged in disk >>> load('https://path/of/your/file') # file is storaged in Internet >>> load('s3://path/of/your/file') # file is storaged in petrel Returns: The content from the file. """ if isinstance(file, Path): file = str(file) if file_format is None and is_str(file): file_format = file.split('.')[-1] if file_format not in file_handlers: raise TypeError(f'Unsupported format: {file_format}') handler = file_handlers[file_format] if is_str(file): file_client = FileClient.infer_client(file_client_args, file) if handler.str_like: with StringIO(file_client.get_text(file)) as f: obj = handler.load_from_fileobj(f, **kwargs) else: with BytesIO(file_client.get(file)) as f: obj = handler.load_from_fileobj(f, **kwargs) elif hasattr(file, 'read'): obj = handler.load_from_fileobj(file, **kwargs) else: raise TypeError('"file" must be a filepath str or a file-object') return obj def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): """Dump data to json/yaml/pickle strings or files. This method provides a unified api for dumping data as strings or to files, and also supports custom arguments for each file format. Note: In v1.3.16 and later, ``dump`` supports dumping data as strings or to files which is saved to different backends. Args: obj (any): The python object to be dumped. file (str or :obj:`Path` or file-like object, optional): If not specified, then the object is dumped to a str, otherwise to a file specified by the filename or file-like object. file_format (str, optional): Same as :func:`load`. file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. Examples: >>> dump('hello world', '/path/of/your/file') # disk >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel Returns: bool: True for success, False otherwise. """ if isinstance(file, Path): file = str(file) if file_format is None: if is_str(file): file_format = file.split('.')[-1] elif file is None: raise ValueError( 'file_format must be specified since file is None') if file_format not in file_handlers: raise TypeError(f'Unsupported format: {file_format}') handler = file_handlers[file_format] if file is None: return handler.dump_to_str(obj, **kwargs) elif is_str(file): file_client = FileClient.infer_client(file_client_args, file) if handler.str_like: with StringIO() as f: handler.dump_to_fileobj(obj, f, **kwargs) file_client.put_text(f.getvalue(), file) else: with BytesIO() as f: handler.dump_to_fileobj(obj, f, **kwargs) file_client.put(f.getvalue(), file) elif hasattr(file, 'write'): handler.dump_to_fileobj(obj, file, **kwargs) else: raise TypeError('"file" must be a filename str or a file-object') def _register_handler(handler, file_formats): """Register a handler for some file extensions. Args: handler (:obj:`BaseFileHandler`): Handler to be registered. file_formats (str or list[str]): File formats to be handled by this handler. """ if not isinstance(handler, BaseFileHandler): raise TypeError( f'handler must be a child of BaseFileHandler, not {type(handler)}') if isinstance(file_formats, str): file_formats = [file_formats] if not is_list_of(file_formats, str): raise TypeError('file_formats must be a str or a list of str') for ext in file_formats: file_handlers[ext] = handler def register_handler(file_formats, **kwargs): def wrap(cls): _register_handler(cls(**kwargs), file_formats) return cls return wrap ================================================ FILE: annotator/mmpkg/mmcv/fileio/parse.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from io import StringIO from .file_client import FileClient def list_from_file(filename, prefix='', offset=0, max_num=0, encoding='utf-8', file_client_args=None): """Load a text file and parse the content as a list of strings. Note: In v1.3.16 and later, ``list_from_file`` supports loading a text file which can be storaged in different backends and parsing the content as a list for strings. Args: filename (str): Filename. prefix (str): The prefix to be inserted to the beginning of each item. offset (int): The offset of lines. max_num (int): The maximum number of lines to be read, zeros and negatives mean no limitation. encoding (str): Encoding used to open the file. Default utf-8. file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. Examples: >>> list_from_file('/path/of/your/file') # disk ['hello', 'world'] >>> list_from_file('s3://path/of/your/file') # ceph or petrel ['hello', 'world'] Returns: list[str]: A list of strings. """ cnt = 0 item_list = [] file_client = FileClient.infer_client(file_client_args, filename) with StringIO(file_client.get_text(filename, encoding)) as f: for _ in range(offset): f.readline() for line in f: if 0 < max_num <= cnt: break item_list.append(prefix + line.rstrip('\n\r')) cnt += 1 return item_list def dict_from_file(filename, key_type=str, encoding='utf-8', file_client_args=None): """Load a text file and parse the content as a dict. Each line of the text file will be two or more columns split by whitespaces or tabs. The first column will be parsed as dict keys, and the following columns will be parsed as dict values. Note: In v1.3.16 and later, ``dict_from_file`` supports loading a text file which can be storaged in different backends and parsing the content as a dict. Args: filename(str): Filename. key_type(type): Type of the dict keys. str is user by default and type conversion will be performed if specified. encoding (str): Encoding used to open the file. Default utf-8. file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. Examples: >>> dict_from_file('/path/of/your/file') # disk {'key1': 'value1', 'key2': 'value2'} >>> dict_from_file('s3://path/of/your/file') # ceph or petrel {'key1': 'value1', 'key2': 'value2'} Returns: dict: The parsed contents. """ mapping = {} file_client = FileClient.infer_client(file_client_args, filename) with StringIO(file_client.get_text(filename, encoding)) as f: for line in f: items = line.rstrip('\n').split() assert len(items) >= 2 key = key_type(items[0]) val = items[1:] if len(items) > 2 else items[1] mapping[key] = val return mapping ================================================ FILE: annotator/mmpkg/mmcv/image/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr, gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert, rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb) from .geometric import (cutout, imcrop, imflip, imflip_, impad, impad_to_multiple, imrescale, imresize, imresize_like, imresize_to_multiple, imrotate, imshear, imtranslate, rescale_size) from .io import imfrombytes, imread, imwrite, supported_backends, use_backend from .misc import tensor2imgs from .photometric import (adjust_brightness, adjust_color, adjust_contrast, adjust_lighting, adjust_sharpness, auto_contrast, clahe, imdenormalize, imequalize, iminvert, imnormalize, imnormalize_, lut_transform, posterize, solarize) __all__ = [ 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale', 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size', 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate', 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend', 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize', 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting' ] ================================================ FILE: annotator/mmpkg/mmcv/image/colorspace.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import cv2 import numpy as np def imconvert(img, src, dst): """Convert an image from the src colorspace to dst colorspace. Args: img (ndarray): The input image. src (str): The source colorspace, e.g., 'rgb', 'hsv'. dst (str): The destination colorspace, e.g., 'rgb', 'hsv'. Returns: ndarray: The converted image. """ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') out_img = cv2.cvtColor(img, code) return out_img def bgr2gray(img, keepdim=False): """Convert a BGR image to grayscale image. Args: img (ndarray): The input image. keepdim (bool): If False (by default), then return the grayscale image with 2 dims, otherwise 3 dims. Returns: ndarray: The converted grayscale image. """ out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if keepdim: out_img = out_img[..., None] return out_img def rgb2gray(img, keepdim=False): """Convert a RGB image to grayscale image. Args: img (ndarray): The input image. keepdim (bool): If False (by default), then return the grayscale image with 2 dims, otherwise 3 dims. Returns: ndarray: The converted grayscale image. """ out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) if keepdim: out_img = out_img[..., None] return out_img def gray2bgr(img): """Convert a grayscale image to BGR image. Args: img (ndarray): The input image. Returns: ndarray: The converted BGR image. """ img = img[..., None] if img.ndim == 2 else img out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) return out_img def gray2rgb(img): """Convert a grayscale image to RGB image. Args: img (ndarray): The input image. Returns: ndarray: The converted RGB image. """ img = img[..., None] if img.ndim == 2 else img out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) return out_img def _convert_input_type_range(img): """Convert the type and range of the input image. It converts the input image to np.float32 type and range of [0, 1]. It is mainly used for pre-processing the input image in colorspace conversion functions such as rgb2ycbcr and ycbcr2rgb. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. Returns: (ndarray): The converted image with type of np.float32 and range of [0, 1]. """ img_type = img.dtype img = img.astype(np.float32) if img_type == np.float32: pass elif img_type == np.uint8: img /= 255. else: raise TypeError('The img type should be np.float32 or np.uint8, ' f'but got {img_type}') return img def _convert_output_type_range(img, dst_type): """Convert the type and range of the image according to dst_type. It converts the image to desired type and range. If `dst_type` is np.uint8, images will be converted to np.uint8 type with range [0, 255]. If `dst_type` is np.float32, it converts the image to np.float32 type with range [0, 1]. It is mainly used for post-processing images in colorspace conversion functions such as rgb2ycbcr and ycbcr2rgb. Args: img (ndarray): The image to be converted with np.float32 type and range [0, 255]. dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it converts the image to np.uint8 type with range [0, 255]. If dst_type is np.float32, it converts the image to np.float32 type with range [0, 1]. Returns: (ndarray): The converted image with desired type and range. """ if dst_type not in (np.uint8, np.float32): raise TypeError('The dst_type should be np.float32 or np.uint8, ' f'but got {dst_type}') if dst_type == np.uint8: img = img.round() else: img /= 255. return img.astype(dst_type) def rgb2ycbcr(img, y_only=False): """Convert a RGB image to YCbCr image. This function produces the same results as Matlab's `rgb2ycbcr` function. It implements the ITU-R BT.601 conversion for standard-definition television. See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. In OpenCV, it implements a JPEG conversion. See more details in https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. y_only (bool): Whether to only return Y channel. Default: False. Returns: ndarray: The converted YCbCr image. The output image has the same type and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) if y_only: out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0 else: out_img = np.matmul( img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], [24.966, 112.0, -18.214]]) + [16, 128, 128] out_img = _convert_output_type_range(out_img, img_type) return out_img def bgr2ycbcr(img, y_only=False): """Convert a BGR image to YCbCr image. The bgr version of rgb2ycbcr. It implements the ITU-R BT.601 conversion for standard-definition television. See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. In OpenCV, it implements a JPEG conversion. See more details in https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. y_only (bool): Whether to only return Y channel. Default: False. Returns: ndarray: The converted YCbCr image. The output image has the same type and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) if y_only: out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 else: out_img = np.matmul( img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], [65.481, -37.797, 112.0]]) + [16, 128, 128] out_img = _convert_output_type_range(out_img, img_type) return out_img def ycbcr2rgb(img): """Convert a YCbCr image to RGB image. This function produces the same results as Matlab's ycbcr2rgb function. It implements the ITU-R BT.601 conversion for standard-definition television. See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`. In OpenCV, it implements a JPEG conversion. See more details in https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. Returns: ndarray: The converted RGB image. The output image has the same type and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071], [0.00625893, -0.00318811, 0]]) * 255.0 + [ -222.921, 135.576, -276.836 ] out_img = _convert_output_type_range(out_img, img_type) return out_img def ycbcr2bgr(img): """Convert a YCbCr image to BGR image. The bgr version of ycbcr2rgb. It implements the ITU-R BT.601 conversion for standard-definition television. See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`. In OpenCV, it implements a JPEG conversion. See more details in https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. Returns: ndarray: The converted BGR image. The output image has the same type and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0.00791071, -0.00153632, 0], [0, -0.00318811, 0.00625893]]) * 255.0 + [ -276.836, 135.576, -222.921 ] out_img = _convert_output_type_range(out_img, img_type) return out_img def convert_color_factory(src, dst): code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') def convert_color(img): out_img = cv2.cvtColor(img, code) return out_img convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()} image. Args: img (ndarray or str): The input image. Returns: ndarray: The converted {dst.upper()} image. """ return convert_color bgr2rgb = convert_color_factory('bgr', 'rgb') rgb2bgr = convert_color_factory('rgb', 'bgr') bgr2hsv = convert_color_factory('bgr', 'hsv') hsv2bgr = convert_color_factory('hsv', 'bgr') bgr2hls = convert_color_factory('bgr', 'hls') hls2bgr = convert_color_factory('hls', 'bgr') ================================================ FILE: annotator/mmpkg/mmcv/image/geometric.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numbers import cv2 import numpy as np from ..utils import to_2tuple from .io import imread_backend try: from PIL import Image except ImportError: Image = None def _scale_size(size, scale): """Rescale a size by a ratio. Args: size (tuple[int]): (w, h). scale (float | tuple(float)): Scaling factor. Returns: tuple[int]: scaled size. """ if isinstance(scale, (float, int)): scale = (scale, scale) w, h = size return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) cv2_interp_codes = { 'nearest': cv2.INTER_NEAREST, 'bilinear': cv2.INTER_LINEAR, 'bicubic': cv2.INTER_CUBIC, 'area': cv2.INTER_AREA, 'lanczos': cv2.INTER_LANCZOS4 } if Image is not None: pillow_interp_codes = { 'nearest': Image.NEAREST, 'bilinear': Image.BILINEAR, 'bicubic': Image.BICUBIC, 'box': Image.BOX, 'lanczos': Image.LANCZOS, 'hamming': Image.HAMMING } def imresize(img, size, return_scale=False, interpolation='bilinear', out=None, backend=None): """Resize image to a given size. Args: img (ndarray): The input image. size (tuple[int]): Target size (w, h). return_scale (bool): Whether to return `w_scale` and `h_scale`. interpolation (str): Interpolation method, accepted values are "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' backend, "nearest", "bilinear" for 'pillow' backend. out (ndarray): The output destination. backend (str | None): The image resize backend type. Options are `cv2`, `pillow`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or `resized_img`. """ h, w = img.shape[:2] if backend is None: backend = imread_backend if backend not in ['cv2', 'pillow']: raise ValueError(f'backend: {backend} is not supported for resize.' f"Supported backends are 'cv2', 'pillow'") if backend == 'pillow': assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' pil_image = Image.fromarray(img) pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) resized_img = np.array(pil_image) else: resized_img = cv2.resize( img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) if not return_scale: return resized_img else: w_scale = size[0] / w h_scale = size[1] / h return resized_img, w_scale, h_scale def imresize_to_multiple(img, divisor, size=None, scale_factor=None, keep_ratio=False, return_scale=False, interpolation='bilinear', out=None, backend=None): """Resize image according to a given size or scale factor and then rounds up the the resized or rescaled image size to the nearest value that can be divided by the divisor. Args: img (ndarray): The input image. divisor (int | tuple): Resized image size will be a multiple of divisor. If divisor is a tuple, divisor should be (w_divisor, h_divisor). size (None | int | tuple[int]): Target size (w, h). Default: None. scale_factor (None | float | tuple[float]): Multiplier for spatial size. Should match input size if it is a tuple and the 2D style is (w_scale_factor, h_scale_factor). Default: None. keep_ratio (bool): Whether to keep the aspect ratio when resizing the image. Default: False. return_scale (bool): Whether to return `w_scale` and `h_scale`. interpolation (str): Interpolation method, accepted values are "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' backend, "nearest", "bilinear" for 'pillow' backend. out (ndarray): The output destination. backend (str | None): The image resize backend type. Options are `cv2`, `pillow`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or `resized_img`. """ h, w = img.shape[:2] if size is not None and scale_factor is not None: raise ValueError('only one of size or scale_factor should be defined') elif size is None and scale_factor is None: raise ValueError('one of size or scale_factor should be defined') elif size is not None: size = to_2tuple(size) if keep_ratio: size = rescale_size((w, h), size, return_scale=False) else: size = _scale_size((w, h), scale_factor) divisor = to_2tuple(divisor) size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)]) resized_img, w_scale, h_scale = imresize( img, size, return_scale=True, interpolation=interpolation, out=out, backend=backend) if return_scale: return resized_img, w_scale, h_scale else: return resized_img def imresize_like(img, dst_img, return_scale=False, interpolation='bilinear', backend=None): """Resize image to the same size of a given image. Args: img (ndarray): The input image. dst_img (ndarray): The target image. return_scale (bool): Whether to return `w_scale` and `h_scale`. interpolation (str): Same as :func:`resize`. backend (str | None): Same as :func:`resize`. Returns: tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or `resized_img`. """ h, w = dst_img.shape[:2] return imresize(img, (w, h), return_scale, interpolation, backend=backend) def rescale_size(old_size, scale, return_scale=False): """Calculate the new size to be rescaled to. Args: old_size (tuple[int]): The old size (w, h) of image. scale (float | tuple[int]): The scaling factor or maximum size. If it is a float number, then the image will be rescaled by this factor, else if it is a tuple of 2 integers, then the image will be rescaled as large as possible within the scale. return_scale (bool): Whether to return the scaling factor besides the rescaled image size. Returns: tuple[int]: The new rescaled image size. """ w, h = old_size if isinstance(scale, (float, int)): if scale <= 0: raise ValueError(f'Invalid scale {scale}, must be positive.') scale_factor = scale elif isinstance(scale, tuple): max_long_edge = max(scale) max_short_edge = min(scale) scale_factor = min(max_long_edge / max(h, w), max_short_edge / min(h, w)) else: raise TypeError( f'Scale must be a number or tuple of int, but got {type(scale)}') new_size = _scale_size((w, h), scale_factor) if return_scale: return new_size, scale_factor else: return new_size def imrescale(img, scale, return_scale=False, interpolation='bilinear', backend=None): """Resize image while keeping the aspect ratio. Args: img (ndarray): The input image. scale (float | tuple[int]): The scaling factor or maximum size. If it is a float number, then the image will be rescaled by this factor, else if it is a tuple of 2 integers, then the image will be rescaled as large as possible within the scale. return_scale (bool): Whether to return the scaling factor besides the rescaled image. interpolation (str): Same as :func:`resize`. backend (str | None): Same as :func:`resize`. Returns: ndarray: The rescaled image. """ h, w = img.shape[:2] new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) rescaled_img = imresize( img, new_size, interpolation=interpolation, backend=backend) if return_scale: return rescaled_img, scale_factor else: return rescaled_img def imflip(img, direction='horizontal'): """Flip an image horizontally or vertically. Args: img (ndarray): Image to be flipped. direction (str): The flip direction, either "horizontal" or "vertical" or "diagonal". Returns: ndarray: The flipped image. """ assert direction in ['horizontal', 'vertical', 'diagonal'] if direction == 'horizontal': return np.flip(img, axis=1) elif direction == 'vertical': return np.flip(img, axis=0) else: return np.flip(img, axis=(0, 1)) def imflip_(img, direction='horizontal'): """Inplace flip an image horizontally or vertically. Args: img (ndarray): Image to be flipped. direction (str): The flip direction, either "horizontal" or "vertical" or "diagonal". Returns: ndarray: The flipped image (inplace). """ assert direction in ['horizontal', 'vertical', 'diagonal'] if direction == 'horizontal': return cv2.flip(img, 1, img) elif direction == 'vertical': return cv2.flip(img, 0, img) else: return cv2.flip(img, -1, img) def imrotate(img, angle, center=None, scale=1.0, border_value=0, interpolation='bilinear', auto_bound=False): """Rotate an image. Args: img (ndarray): Image to be rotated. angle (float): Rotation angle in degrees, positive values mean clockwise rotation. center (tuple[float], optional): Center point (w, h) of the rotation in the source image. If not specified, the center of the image will be used. scale (float): Isotropic scale factor. border_value (int): Border value. interpolation (str): Same as :func:`resize`. auto_bound (bool): Whether to adjust the image size to cover the whole rotated image. Returns: ndarray: The rotated image. """ if center is not None and auto_bound: raise ValueError('`auto_bound` conflicts with `center`') h, w = img.shape[:2] if center is None: center = ((w - 1) * 0.5, (h - 1) * 0.5) assert isinstance(center, tuple) matrix = cv2.getRotationMatrix2D(center, -angle, scale) if auto_bound: cos = np.abs(matrix[0, 0]) sin = np.abs(matrix[0, 1]) new_w = h * sin + w * cos new_h = h * cos + w * sin matrix[0, 2] += (new_w - w) * 0.5 matrix[1, 2] += (new_h - h) * 0.5 w = int(np.round(new_w)) h = int(np.round(new_h)) rotated = cv2.warpAffine( img, matrix, (w, h), flags=cv2_interp_codes[interpolation], borderValue=border_value) return rotated def bbox_clip(bboxes, img_shape): """Clip bboxes to fit the image shape. Args: bboxes (ndarray): Shape (..., 4*k) img_shape (tuple[int]): (height, width) of the image. Returns: ndarray: Clipped bboxes. """ assert bboxes.shape[-1] % 4 == 0 cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype) cmin[0::2] = img_shape[1] - 1 cmin[1::2] = img_shape[0] - 1 clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0) return clipped_bboxes def bbox_scaling(bboxes, scale, clip_shape=None): """Scaling bboxes w.r.t the box center. Args: bboxes (ndarray): Shape(..., 4). scale (float): Scaling factor. clip_shape (tuple[int], optional): If specified, bboxes that exceed the boundary will be clipped according to the given shape (h, w). Returns: ndarray: Scaled bboxes. """ if float(scale) == 1.0: scaled_bboxes = bboxes.copy() else: w = bboxes[..., 2] - bboxes[..., 0] + 1 h = bboxes[..., 3] - bboxes[..., 1] + 1 dw = (w * (scale - 1)) * 0.5 dh = (h * (scale - 1)) * 0.5 scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1) if clip_shape is not None: return bbox_clip(scaled_bboxes, clip_shape) else: return scaled_bboxes def imcrop(img, bboxes, scale=1.0, pad_fill=None): """Crop image patches. 3 steps: scale the bboxes -> clip bboxes -> crop and pad. Args: img (ndarray): Image to be cropped. bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes. scale (float, optional): Scale ratio of bboxes, the default value 1.0 means no padding. pad_fill (Number | list[Number]): Value to be filled for padding. Default: None, which means no padding. Returns: list[ndarray] | ndarray: The cropped image patches. """ chn = 1 if img.ndim == 2 else img.shape[2] if pad_fill is not None: if isinstance(pad_fill, (int, float)): pad_fill = [pad_fill for _ in range(chn)] assert len(pad_fill) == chn _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32) clipped_bbox = bbox_clip(scaled_bboxes, img.shape) patches = [] for i in range(clipped_bbox.shape[0]): x1, y1, x2, y2 = tuple(clipped_bbox[i, :]) if pad_fill is None: patch = img[y1:y2 + 1, x1:x2 + 1, ...] else: _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :]) if chn == 1: patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1) else: patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn) patch = np.array( pad_fill, dtype=img.dtype) * np.ones( patch_shape, dtype=img.dtype) x_start = 0 if _x1 >= 0 else -_x1 y_start = 0 if _y1 >= 0 else -_y1 w = x2 - x1 + 1 h = y2 - y1 + 1 patch[y_start:y_start + h, x_start:x_start + w, ...] = img[y1:y1 + h, x1:x1 + w, ...] patches.append(patch) if bboxes.ndim == 1: return patches[0] else: return patches def impad(img, *, shape=None, padding=None, pad_val=0, padding_mode='constant'): """Pad the given image to a certain shape or pad on all sides with specified padding mode and padding value. Args: img (ndarray): Image to be padded. shape (tuple[int]): Expected padding shape (h, w). Default: None. padding (int or tuple[int]): Padding on each border. If a single int is provided this is used to pad all borders. If tuple of length 2 is provided this is the padding on left/right and top/bottom respectively. If a tuple of length 4 is provided this is the padding for the left, top, right and bottom borders respectively. Default: None. Note that `shape` and `padding` can not be both set. pad_val (Number | Sequence[Number]): Values to be filled in padding areas when padding_mode is 'constant'. Default: 0. padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default: constant. - constant: pads with a constant value, this value is specified with pad_val. - edge: pads with the last value at the edge of the image. - reflect: pads with reflection of image without repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode will result in [3, 2, 1, 2, 3, 4, 3, 2]. - symmetric: pads with reflection of image repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode will result in [2, 1, 1, 2, 3, 4, 4, 3] Returns: ndarray: The padded image. """ assert (shape is not None) ^ (padding is not None) if shape is not None: padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]) # check pad_val if isinstance(pad_val, tuple): assert len(pad_val) == img.shape[-1] elif not isinstance(pad_val, numbers.Number): raise TypeError('pad_val must be a int or a tuple. ' f'But received {type(pad_val)}') # check padding if isinstance(padding, tuple) and len(padding) in [2, 4]: if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) elif isinstance(padding, numbers.Number): padding = (padding, padding, padding, padding) else: raise ValueError('Padding must be a int or a 2, or 4 element tuple.' f'But received {padding}') # check padding mode assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] border_type = { 'constant': cv2.BORDER_CONSTANT, 'edge': cv2.BORDER_REPLICATE, 'reflect': cv2.BORDER_REFLECT_101, 'symmetric': cv2.BORDER_REFLECT } img = cv2.copyMakeBorder( img, padding[1], padding[3], padding[0], padding[2], border_type[padding_mode], value=pad_val) return img def impad_to_multiple(img, divisor, pad_val=0): """Pad an image to ensure each edge to be multiple to some number. Args: img (ndarray): Image to be padded. divisor (int): Padded image edges will be multiple to divisor. pad_val (Number | Sequence[Number]): Same as :func:`impad`. Returns: ndarray: The padded image. """ pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) def cutout(img, shape, pad_val=0): """Randomly cut out a rectangle from the original img. Args: img (ndarray): Image to be cutout. shape (int | tuple[int]): Expected cutout shape (h, w). If given as a int, the value will be used for both h and w. pad_val (int | float | tuple[int | float]): Values to be filled in the cut area. Defaults to 0. Returns: ndarray: The cutout image. """ channels = 1 if img.ndim == 2 else img.shape[2] if isinstance(shape, int): cut_h, cut_w = shape, shape else: assert isinstance(shape, tuple) and len(shape) == 2, \ f'shape must be a int or a tuple with length 2, but got type ' \ f'{type(shape)} instead.' cut_h, cut_w = shape if isinstance(pad_val, (int, float)): pad_val = tuple([pad_val] * channels) elif isinstance(pad_val, tuple): assert len(pad_val) == channels, \ 'Expected the num of elements in tuple equals the channels' \ 'of input image. Found {} vs {}'.format( len(pad_val), channels) else: raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`') img_h, img_w = img.shape[:2] y0 = np.random.uniform(img_h) x0 = np.random.uniform(img_w) y1 = int(max(0, y0 - cut_h / 2.)) x1 = int(max(0, x0 - cut_w / 2.)) y2 = min(img_h, y1 + cut_h) x2 = min(img_w, x1 + cut_w) if img.ndim == 2: patch_shape = (y2 - y1, x2 - x1) else: patch_shape = (y2 - y1, x2 - x1, channels) img_cutout = img.copy() patch = np.array( pad_val, dtype=img.dtype) * np.ones( patch_shape, dtype=img.dtype) img_cutout[y1:y2, x1:x2, ...] = patch return img_cutout def _get_shear_matrix(magnitude, direction='horizontal'): """Generate the shear matrix for transformation. Args: magnitude (int | float): The magnitude used for shear. direction (str): The flip direction, either "horizontal" or "vertical". Returns: ndarray: The shear matrix with dtype float32. """ if direction == 'horizontal': shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]]) elif direction == 'vertical': shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]]) return shear_matrix def imshear(img, magnitude, direction='horizontal', border_value=0, interpolation='bilinear'): """Shear an image. Args: img (ndarray): Image to be sheared with format (h, w) or (h, w, c). magnitude (int | float): The magnitude used for shear. direction (str): The flip direction, either "horizontal" or "vertical". border_value (int | tuple[int]): Value used in case of a constant border. interpolation (str): Same as :func:`resize`. Returns: ndarray: The sheared image. """ assert direction in ['horizontal', 'vertical'], f'Invalid direction: {direction}' height, width = img.shape[:2] if img.ndim == 2: channels = 1 elif img.ndim == 3: channels = img.shape[-1] if isinstance(border_value, int): border_value = tuple([border_value] * channels) elif isinstance(border_value, tuple): assert len(border_value) == channels, \ 'Expected the num of elements in tuple equals the channels' \ 'of input image. Found {} vs {}'.format( len(border_value), channels) else: raise ValueError( f'Invalid type {type(border_value)} for `border_value`') shear_matrix = _get_shear_matrix(magnitude, direction) sheared = cv2.warpAffine( img, shear_matrix, (width, height), # Note case when the number elements in `border_value` # greater than 3 (e.g. shearing masks whose channels large # than 3) will raise TypeError in `cv2.warpAffine`. # Here simply slice the first 3 values in `border_value`. borderValue=border_value[:3], flags=cv2_interp_codes[interpolation]) return sheared def _get_translate_matrix(offset, direction='horizontal'): """Generate the translate matrix. Args: offset (int | float): The offset used for translate. direction (str): The translate direction, either "horizontal" or "vertical". Returns: ndarray: The translate matrix with dtype float32. """ if direction == 'horizontal': translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]]) elif direction == 'vertical': translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]]) return translate_matrix def imtranslate(img, offset, direction='horizontal', border_value=0, interpolation='bilinear'): """Translate an image. Args: img (ndarray): Image to be translated with format (h, w) or (h, w, c). offset (int | float): The offset used for translate. direction (str): The translate direction, either "horizontal" or "vertical". border_value (int | tuple[int]): Value used in case of a constant border. interpolation (str): Same as :func:`resize`. Returns: ndarray: The translated image. """ assert direction in ['horizontal', 'vertical'], f'Invalid direction: {direction}' height, width = img.shape[:2] if img.ndim == 2: channels = 1 elif img.ndim == 3: channels = img.shape[-1] if isinstance(border_value, int): border_value = tuple([border_value] * channels) elif isinstance(border_value, tuple): assert len(border_value) == channels, \ 'Expected the num of elements in tuple equals the channels' \ 'of input image. Found {} vs {}'.format( len(border_value), channels) else: raise ValueError( f'Invalid type {type(border_value)} for `border_value`.') translate_matrix = _get_translate_matrix(offset, direction) translated = cv2.warpAffine( img, translate_matrix, (width, height), # Note case when the number elements in `border_value` # greater than 3 (e.g. translating masks whose channels # large than 3) will raise TypeError in `cv2.warpAffine`. # Here simply slice the first 3 values in `border_value`. borderValue=border_value[:3], flags=cv2_interp_codes[interpolation]) return translated ================================================ FILE: annotator/mmpkg/mmcv/image/io.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import io import os.path as osp from pathlib import Path import cv2 import numpy as np from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, IMREAD_UNCHANGED) from annotator.mmpkg.mmcv.utils import check_file_exist, is_str, mkdir_or_exist try: from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG except ImportError: TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None try: from PIL import Image, ImageOps except ImportError: Image = None try: import tifffile except ImportError: tifffile = None jpeg = None supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile'] imread_flags = { 'color': IMREAD_COLOR, 'grayscale': IMREAD_GRAYSCALE, 'unchanged': IMREAD_UNCHANGED, 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR, 'grayscale_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE } imread_backend = 'cv2' def use_backend(backend): """Select a backend for image decoding. Args: backend (str): The image decoding backend type. Options are `cv2`, `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG) and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg` file format. """ assert backend in supported_backends global imread_backend imread_backend = backend if imread_backend == 'turbojpeg': if TurboJPEG is None: raise ImportError('`PyTurboJPEG` is not installed') global jpeg if jpeg is None: jpeg = TurboJPEG() elif imread_backend == 'pillow': if Image is None: raise ImportError('`Pillow` is not installed') elif imread_backend == 'tifffile': if tifffile is None: raise ImportError('`tifffile` is not installed') def _jpegflag(flag='color', channel_order='bgr'): channel_order = channel_order.lower() if channel_order not in ['rgb', 'bgr']: raise ValueError('channel order must be either "rgb" or "bgr"') if flag == 'color': if channel_order == 'bgr': return TJPF_BGR elif channel_order == 'rgb': return TJCS_RGB elif flag == 'grayscale': return TJPF_GRAY else: raise ValueError('flag must be "color" or "grayscale"') def _pillow2array(img, flag='color', channel_order='bgr'): """Convert a pillow image to numpy array. Args: img (:obj:`PIL.Image.Image`): The image loaded using PIL flag (str): Flags specifying the color type of a loaded image, candidates are 'color', 'grayscale' and 'unchanged'. Default to 'color'. channel_order (str): The channel order of the output image array, candidates are 'bgr' and 'rgb'. Default to 'bgr'. Returns: np.ndarray: The converted numpy array """ channel_order = channel_order.lower() if channel_order not in ['rgb', 'bgr']: raise ValueError('channel order must be either "rgb" or "bgr"') if flag == 'unchanged': array = np.array(img) if array.ndim >= 3 and array.shape[2] >= 3: # color image array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR else: # Handle exif orientation tag if flag in ['color', 'grayscale']: img = ImageOps.exif_transpose(img) # If the image mode is not 'RGB', convert it to 'RGB' first. if img.mode != 'RGB': if img.mode != 'LA': # Most formats except 'LA' can be directly converted to RGB img = img.convert('RGB') else: # When the mode is 'LA', the default conversion will fill in # the canvas with black, which sometimes shadows black objects # in the foreground. # # Therefore, a random color (124, 117, 104) is used for canvas img_rgba = img.convert('RGBA') img = Image.new('RGB', img_rgba.size, (124, 117, 104)) img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha if flag in ['color', 'color_ignore_orientation']: array = np.array(img) if channel_order != 'rgb': array = array[:, :, ::-1] # RGB to BGR elif flag in ['grayscale', 'grayscale_ignore_orientation']: img = img.convert('L') array = np.array(img) else: raise ValueError( 'flag must be "color", "grayscale", "unchanged", ' f'"color_ignore_orientation" or "grayscale_ignore_orientation"' f' but got {flag}') return array def imread(img_or_path, flag='color', channel_order='bgr', backend=None): """Read an image. Args: img_or_path (ndarray or str or Path): Either a numpy array or str or pathlib.Path. If it is a numpy array (loaded image), then it will be returned as is. flag (str): Flags specifying the color type of a loaded image, candidates are `color`, `grayscale`, `unchanged`, `color_ignore_orientation` and `grayscale_ignore_orientation`. By default, `cv2` and `pillow` backend would rotate the image according to its EXIF info unless called with `unchanged` or `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend always ignore image's EXIF info regardless of the flag. The `turbojpeg` backend only supports `color` and `grayscale`. channel_order (str): Order of channel, candidates are `bgr` and `rgb`. backend (str | None): The image decoding backend type. Options are `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. Returns: ndarray: Loaded image array. """ if backend is None: backend = imread_backend if backend not in supported_backends: raise ValueError(f'backend: {backend} is not supported. Supported ' "backends are 'cv2', 'turbojpeg', 'pillow'") if isinstance(img_or_path, Path): img_or_path = str(img_or_path) if isinstance(img_or_path, np.ndarray): return img_or_path elif is_str(img_or_path): check_file_exist(img_or_path, f'img file does not exist: {img_or_path}') if backend == 'turbojpeg': with open(img_or_path, 'rb') as in_file: img = jpeg.decode(in_file.read(), _jpegflag(flag, channel_order)) if img.shape[-1] == 1: img = img[:, :, 0] return img elif backend == 'pillow': img = Image.open(img_or_path) img = _pillow2array(img, flag, channel_order) return img elif backend == 'tifffile': img = tifffile.imread(img_or_path) return img else: flag = imread_flags[flag] if is_str(flag) else flag img = cv2.imread(img_or_path, flag) if flag == IMREAD_COLOR and channel_order == 'rgb': cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) return img else: raise TypeError('"img" must be a numpy array or a str or ' 'a pathlib.Path object') def imfrombytes(content, flag='color', channel_order='bgr', backend=None): """Read an image from bytes. Args: content (bytes): Image bytes got from files or other streams. flag (str): Same as :func:`imread`. backend (str | None): The image decoding backend type. Options are `cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. Returns: ndarray: Loaded image array. """ if backend is None: backend = imread_backend if backend not in supported_backends: raise ValueError(f'backend: {backend} is not supported. Supported ' "backends are 'cv2', 'turbojpeg', 'pillow'") if backend == 'turbojpeg': img = jpeg.decode(content, _jpegflag(flag, channel_order)) if img.shape[-1] == 1: img = img[:, :, 0] return img elif backend == 'pillow': buff = io.BytesIO(content) img = Image.open(buff) img = _pillow2array(img, flag, channel_order) return img else: img_np = np.frombuffer(content, np.uint8) flag = imread_flags[flag] if is_str(flag) else flag img = cv2.imdecode(img_np, flag) if flag == IMREAD_COLOR and channel_order == 'rgb': cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) return img def imwrite(img, file_path, params=None, auto_mkdir=True): """Write image to file. Args: img (ndarray): Image array to be written. file_path (str): Image file path. params (None or list): Same as opencv :func:`imwrite` interface. auto_mkdir (bool): If the parent folder of `file_path` does not exist, whether to create it automatically. Returns: bool: Successful or not. """ if auto_mkdir: dir_name = osp.abspath(osp.dirname(file_path)) mkdir_or_exist(dir_name) return cv2.imwrite(file_path, img, params) ================================================ FILE: annotator/mmpkg/mmcv/image/misc.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import annotator.mmpkg.mmcv as mmcv try: import torch except ImportError: torch = None def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): """Convert tensor to 3-channel images. Args: tensor (torch.Tensor): Tensor that contains multiple images, shape ( N, C, H, W). mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0). std (tuple[float], optional): Standard deviation of images. Defaults to (1, 1, 1). to_rgb (bool, optional): Whether the tensor was converted to RGB format in the first place. If so, convert it back to BGR. Defaults to True. Returns: list[np.ndarray]: A list that contains multiple images. """ if torch is None: raise RuntimeError('pytorch is not installed') assert torch.is_tensor(tensor) and tensor.ndim == 4 assert len(mean) == 3 assert len(std) == 3 num_imgs = tensor.size(0) mean = np.array(mean, dtype=np.float32) std = np.array(std, dtype=np.float32) imgs = [] for img_id in range(num_imgs): img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) img = mmcv.imdenormalize( img, mean, std, to_bgr=to_rgb).astype(np.uint8) imgs.append(np.ascontiguousarray(img)) return imgs ================================================ FILE: annotator/mmpkg/mmcv/image/photometric.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import cv2 import numpy as np from ..utils import is_tuple_of from .colorspace import bgr2gray, gray2bgr def imnormalize(img, mean, std, to_rgb=True): """Normalize an image with mean and std. Args: img (ndarray): Image to be normalized. mean (ndarray): The mean to be used for normalize. std (ndarray): The std to be used for normalize. to_rgb (bool): Whether to convert to rgb. Returns: ndarray: The normalized image. """ img = img.copy().astype(np.float32) return imnormalize_(img, mean, std, to_rgb) def imnormalize_(img, mean, std, to_rgb=True): """Inplace normalize an image with mean and std. Args: img (ndarray): Image to be normalized. mean (ndarray): The mean to be used for normalize. std (ndarray): The std to be used for normalize. to_rgb (bool): Whether to convert to rgb. Returns: ndarray: The normalized image. """ # cv2 inplace normalization does not accept uint8 assert img.dtype != np.uint8 mean = np.float64(mean.reshape(1, -1)) stdinv = 1 / np.float64(std.reshape(1, -1)) if to_rgb: cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace cv2.subtract(img, mean, img) # inplace cv2.multiply(img, stdinv, img) # inplace return img def imdenormalize(img, mean, std, to_bgr=True): assert img.dtype != np.uint8 mean = mean.reshape(1, -1).astype(np.float64) std = std.reshape(1, -1).astype(np.float64) img = cv2.multiply(img, std) # make a copy cv2.add(img, mean, img) # inplace if to_bgr: cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace return img def iminvert(img): """Invert (negate) an image. Args: img (ndarray): Image to be inverted. Returns: ndarray: The inverted image. """ return np.full_like(img, 255) - img def solarize(img, thr=128): """Solarize an image (invert all pixel values above a threshold) Args: img (ndarray): Image to be solarized. thr (int): Threshold for solarizing (0 - 255). Returns: ndarray: The solarized image. """ img = np.where(img < thr, img, 255 - img) return img def posterize(img, bits): """Posterize an image (reduce the number of bits for each color channel) Args: img (ndarray): Image to be posterized. bits (int): Number of bits (1 to 8) to use for posterizing. Returns: ndarray: The posterized image. """ shift = 8 - bits img = np.left_shift(np.right_shift(img, shift), shift) return img def adjust_color(img, alpha=1, beta=None, gamma=0): r"""It blends the source image and its gray image: .. math:: output = img * alpha + gray\_img * beta + gamma Args: img (ndarray): The input source image. alpha (int | float): Weight for the source image. Default 1. beta (int | float): Weight for the converted gray image. If None, it's assigned the value (1 - `alpha`). gamma (int | float): Scalar added to each sum. Same as :func:`cv2.addWeighted`. Default 0. Returns: ndarray: Colored image which has the same size and dtype as input. """ gray_img = bgr2gray(img) gray_img = np.tile(gray_img[..., None], [1, 1, 3]) if beta is None: beta = 1 - alpha colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) if not colored_img.dtype == np.uint8: # Note when the dtype of `img` is not the default `np.uint8` # (e.g. np.float32), the value in `colored_img` got from cv2 # is not guaranteed to be in range [0, 255], so here clip # is needed. colored_img = np.clip(colored_img, 0, 255) return colored_img def imequalize(img): """Equalize the image histogram. This function applies a non-linear mapping to the input image, in order to create a uniform distribution of grayscale values in the output image. Args: img (ndarray): Image to be equalized. Returns: ndarray: The equalized image. """ def _scale_channel(im, c): """Scale the data in the corresponding channel.""" im = im[:, :, c] # Compute the histogram of the image channel. histo = np.histogram(im, 256, (0, 255))[0] # For computing the step, filter out the nonzeros. nonzero_histo = histo[histo > 0] step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 if not step: lut = np.array(range(256)) else: # Compute the cumulative sum, shifted by step // 2 # and then normalized by step. lut = (np.cumsum(histo) + (step // 2)) // step # Shift lut, prepending with 0. lut = np.concatenate([[0], lut[:-1]], 0) # handle potential integer overflow lut[lut > 255] = 255 # If step is zero, return the original image. # Otherwise, index from lut. return np.where(np.equal(step, 0), im, lut[im]) # Scales each channel independently and then stacks # the result. s1 = _scale_channel(img, 0) s2 = _scale_channel(img, 1) s3 = _scale_channel(img, 2) equalized_img = np.stack([s1, s2, s3], axis=-1) return equalized_img.astype(img.dtype) def adjust_brightness(img, factor=1.): """Adjust image brightness. This function controls the brightness of an image. An enhancement factor of 0.0 gives a black image. A factor of 1.0 gives the original image. This function blends the source image and the degenerated black image: .. math:: output = img * factor + degenerated * (1 - factor) Args: img (ndarray): Image to be brightened. factor (float): A value controls the enhancement. Factor 1.0 returns the original image, lower factors mean less color (brightness, contrast, etc), and higher values more. Default 1. Returns: ndarray: The brightened image. """ degenerated = np.zeros_like(img) # Note manually convert the dtype to np.float32, to # achieve as close results as PIL.ImageEnhance.Brightness. # Set beta=1-factor, and gamma=0 brightened_img = cv2.addWeighted( img.astype(np.float32), factor, degenerated.astype(np.float32), 1 - factor, 0) brightened_img = np.clip(brightened_img, 0, 255) return brightened_img.astype(img.dtype) def adjust_contrast(img, factor=1.): """Adjust image contrast. This function controls the contrast of an image. An enhancement factor of 0.0 gives a solid grey image. A factor of 1.0 gives the original image. It blends the source image and the degenerated mean image: .. math:: output = img * factor + degenerated * (1 - factor) Args: img (ndarray): Image to be contrasted. BGR order. factor (float): Same as :func:`mmcv.adjust_brightness`. Returns: ndarray: The contrasted image. """ gray_img = bgr2gray(img) hist = np.histogram(gray_img, 256, (0, 255))[0] mean = round(np.sum(gray_img) / np.sum(hist)) degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) degenerated = gray2bgr(degenerated) contrasted_img = cv2.addWeighted( img.astype(np.float32), factor, degenerated.astype(np.float32), 1 - factor, 0) contrasted_img = np.clip(contrasted_img, 0, 255) return contrasted_img.astype(img.dtype) def auto_contrast(img, cutoff=0): """Auto adjust image contrast. This function maximize (normalize) image contrast by first removing cutoff percent of the lightest and darkest pixels from the histogram and remapping the image so that the darkest pixel becomes black (0), and the lightest becomes white (255). Args: img (ndarray): Image to be contrasted. BGR order. cutoff (int | float | tuple): The cutoff percent of the lightest and darkest pixels to be removed. If given as tuple, it shall be (low, high). Otherwise, the single value will be used for both. Defaults to 0. Returns: ndarray: The contrasted image. """ def _auto_contrast_channel(im, c, cutoff): im = im[:, :, c] # Compute the histogram of the image channel. histo = np.histogram(im, 256, (0, 255))[0] # Remove cut-off percent pixels from histo histo_sum = np.cumsum(histo) cut_low = histo_sum[-1] * cutoff[0] // 100 cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100 histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0) # Compute mapping low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1] # If all the values have been cut off, return the origin img if low >= high: return im scale = 255.0 / (high - low) offset = -low * scale lut = np.array(range(256)) lut = lut * scale + offset lut = np.clip(lut, 0, 255) return lut[im] if isinstance(cutoff, (int, float)): cutoff = (cutoff, cutoff) else: assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \ f'float or tuple, but got {type(cutoff)} instead.' # Auto adjusts contrast for each channel independently and then stacks # the result. s1 = _auto_contrast_channel(img, 0, cutoff) s2 = _auto_contrast_channel(img, 1, cutoff) s3 = _auto_contrast_channel(img, 2, cutoff) contrasted_img = np.stack([s1, s2, s3], axis=-1) return contrasted_img.astype(img.dtype) def adjust_sharpness(img, factor=1., kernel=None): """Adjust image sharpness. This function controls the sharpness of an image. An enhancement factor of 0.0 gives a blurred image. A factor of 1.0 gives the original image. And a factor of 2.0 gives a sharpened image. It blends the source image and the degenerated mean image: .. math:: output = img * factor + degenerated * (1 - factor) Args: img (ndarray): Image to be sharpened. BGR order. factor (float): Same as :func:`mmcv.adjust_brightness`. kernel (np.ndarray, optional): Filter kernel to be applied on the img to obtain the degenerated img. Defaults to None. Note: No value sanity check is enforced on the kernel set by users. So with an inappropriate kernel, the ``adjust_sharpness`` may fail to perform the function its name indicates but end up performing whatever transform determined by the kernel. Returns: ndarray: The sharpened image. """ if kernel is None: # adopted from PIL.ImageFilter.SMOOTH kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13 assert isinstance(kernel, np.ndarray), \ f'kernel must be of type np.ndarray, but got {type(kernel)} instead.' assert kernel.ndim == 2, \ f'kernel must have a dimension of 2, but got {kernel.ndim} instead.' degenerated = cv2.filter2D(img, -1, kernel) sharpened_img = cv2.addWeighted( img.astype(np.float32), factor, degenerated.astype(np.float32), 1 - factor, 0) sharpened_img = np.clip(sharpened_img, 0, 255) return sharpened_img.astype(img.dtype) def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True): """AlexNet-style PCA jitter. This data augmentation is proposed in `ImageNet Classification with Deep Convolutional Neural Networks `_. Args: img (ndarray): Image to be adjusted lighting. BGR order. eigval (ndarray): the eigenvalue of the convariance matrix of pixel values, respectively. eigvec (ndarray): the eigenvector of the convariance matrix of pixel values, respectively. alphastd (float): The standard deviation for distribution of alpha. Defaults to 0.1 to_rgb (bool): Whether to convert img to rgb. Returns: ndarray: The adjusted image. """ assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \ f'eigval and eigvec should both be of type np.ndarray, got ' \ f'{type(eigval)} and {type(eigvec)} instead.' assert eigval.ndim == 1 and eigvec.ndim == 2 assert eigvec.shape == (3, eigval.shape[0]) n_eigval = eigval.shape[0] assert isinstance(alphastd, float), 'alphastd should be of type float, ' \ f'got {type(alphastd)} instead.' img = img.copy().astype(np.float32) if to_rgb: cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace alpha = np.random.normal(0, alphastd, n_eigval) alter = eigvec \ * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \ * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval)) alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape) img_adjusted = img + alter return img_adjusted def lut_transform(img, lut_table): """Transform array by look-up table. The function lut_transform fills the output array with values from the look-up table. Indices of the entries are taken from the input array. Args: img (ndarray): Image to be transformed. lut_table (ndarray): look-up table of 256 elements; in case of multi-channel input array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the input array. Returns: ndarray: The transformed image. """ assert isinstance(img, np.ndarray) assert 0 <= np.min(img) and np.max(img) <= 255 assert isinstance(lut_table, np.ndarray) assert lut_table.shape == (256, ) return cv2.LUT(np.array(img, dtype=np.uint8), lut_table) def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): """Use CLAHE method to process the image. See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. Graphics Gems, 1994:474-485.` for more information. Args: img (ndarray): Image to be processed. clip_limit (float): Threshold for contrast limiting. Default: 40.0. tile_grid_size (tuple[int]): Size of grid for histogram equalization. Input image will be divided into equally sized rectangular tiles. It defines the number of tiles in row and column. Default: (8, 8). Returns: ndarray: The processed image. """ assert isinstance(img, np.ndarray) assert img.ndim == 2 assert isinstance(clip_limit, (float, int)) assert is_tuple_of(tile_grid_size, int) assert len(tile_grid_size) == 2 clahe = cv2.createCLAHE(clip_limit, tile_grid_size) return clahe.apply(np.array(img, dtype=np.uint8)) ================================================ FILE: annotator/mmpkg/mmcv/model_zoo/deprecated.json ================================================ { "resnet50_caffe": "detectron/resnet50_caffe", "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", "resnet101_caffe": "detectron/resnet101_caffe", "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" } ================================================ FILE: annotator/mmpkg/mmcv/model_zoo/mmcls.json ================================================ { "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth", "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth", "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth", "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth", "vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth", "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth", "vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth", "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth", "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_batch256_imagenet_20200708-34ab8f90.pth", "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_batch256_imagenet_20200708-32ffb4f7.pth", "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth", "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.pth", "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.pth", "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_batch256_imagenet_20200708-1ad0ce94.pth", "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_batch256_imagenet_20200708-9cb302ef.pth", "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_batch256_imagenet_20200708-e79cb6a2.pth", "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth", "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth", "resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth", "resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth", "se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth", "se-resnet101": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth", "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth", "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth", "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth", "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth", "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth", "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth", "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth" } ================================================ FILE: annotator/mmpkg/mmcv/model_zoo/open_mmlab.json ================================================ { "vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth", "detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth", "detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth", "detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth", "detectron2/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth", "detectron2/resnext101_32x8d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth", "resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth", "resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth", "resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth", "contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth", "detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth", "detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth", "jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth", "jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth", "jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth", "jhu/resnext101_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth", "jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth", "jhu/resnext101_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth", "msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth", "msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth", "msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth", "msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth", "msra/hrnetv2_w48": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth", "bninception_caffe": "https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth", "kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth", "kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth", "res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth", "regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth", "regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth", "regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth", "regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth", "regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth", "regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth", "regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth", "regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth", "resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth", "resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth", "resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth", "mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth", "mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth", "mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth", "contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth", "contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth", "resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth", "resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth", "resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth", "darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth", "mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth" } ================================================ FILE: annotator/mmpkg/mmcv/ops/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .assign_score_withk import assign_score_withk from .ball_query import ball_query from .bbox import bbox_overlaps from .border_align import BorderAlign, border_align from .box_iou_rotated import box_iou_rotated from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive from .cc_attention import CrissCrossAttention from .contour_expand import contour_expand from .corner_pool import CornerPool from .correlation import Correlation from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack, ModulatedDeformRoIPoolPack, deform_roi_pool) from .deprecated_wrappers import Conv2d_deprecated as Conv2d from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d from .deprecated_wrappers import Linear_deprecated as Linear from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, sigmoid_focal_loss, softmax_focal_loss) from .furthest_point_sample import (furthest_point_sample, furthest_point_sample_with_dist) from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu from .gather_points import gather_points from .group_points import GroupAll, QueryAndGroup, grouping_operation from .info import (get_compiler_version, get_compiling_cuda_version, get_onnxruntime_op_path) from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev from .knn import knn from .masked_conv import MaskedConv2d, masked_conv2d from .modulated_deform_conv import (ModulatedDeformConv2d, ModulatedDeformConv2dPack, modulated_deform_conv2d) from .multi_scale_deform_attn import MultiScaleDeformableAttention from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms from .pixel_group import pixel_group from .point_sample import (SimpleRoIAlign, point_sample, rel_roi_point_to_rel_img_point) from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, points_in_boxes_part) from .points_sampler import PointsSampler from .psa_mask import PSAMask from .roi_align import RoIAlign, roi_align from .roi_align_rotated import RoIAlignRotated, roi_align_rotated from .roi_pool import RoIPool, roi_pool from .roiaware_pool3d import RoIAwarePool3d from .roipoint_pool3d import RoIPointPool3d from .saconv import SAConv2d from .scatter_points import DynamicScatter, dynamic_scatter from .sync_bn import SyncBatchNorm from .three_interpolate import three_interpolate from .three_nn import three_nn from .tin_shift import TINShift, tin_shift from .upfirdn2d import upfirdn2d from .voxelize import Voxelization, voxelization __all__ = [ 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe', 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack', 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', 'get_compiler_version', 'get_compiling_cuda_version', 'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d', 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn', 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align', 'gather_points', 'furthest_point_sample', 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', 'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all' ] ================================================ FILE: annotator/mmpkg/mmcv/ops/assign_score_withk.py ================================================ from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward']) class AssignScoreWithK(Function): r"""Perform weighted sum to generate output features according to scores. Modified from `PAConv `_. This is a memory-efficient CUDA implementation of assign_scores operation, which first transform all point features with weight bank, then assemble neighbor features with ``knn_idx`` and perform weighted sum of ``scores``. See the `paper `_ appendix Sec. D for more detailed descriptions. Note: This implementation assumes using ``neighbor`` kernel input, which is (point_features - center_features, point_features). See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/ pointnet2/paconv.py#L128 for more details. """ @staticmethod def forward(ctx, scores, point_features, center_features, knn_idx, aggregate='sum'): """ Args: scores (torch.Tensor): (B, npoint, K, M), predicted scores to aggregate weight matrices in the weight bank. ``npoint`` is the number of sampled centers. ``K`` is the number of queried neighbors. ``M`` is the number of weight matrices in the weight bank. point_features (torch.Tensor): (B, N, M, out_dim) Pre-computed point features to be aggregated. center_features (torch.Tensor): (B, N, M, out_dim) Pre-computed center features to be aggregated. knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN. We assume the first idx in each row is the idx of the center. aggregate (str, optional): Aggregation method. Can be 'sum', 'avg' or 'max'. Defaults: 'sum'. Returns: torch.Tensor: (B, out_dim, npoint, K), the aggregated features. """ agg = {'sum': 0, 'avg': 1, 'max': 2} B, N, M, out_dim = point_features.size() _, npoint, K, _ = scores.size() output = point_features.new_zeros((B, out_dim, npoint, K)) ext_module.assign_score_withk_forward( point_features.contiguous(), center_features.contiguous(), scores.contiguous(), knn_idx.contiguous(), output, B=B, N0=N, N1=npoint, M=M, K=K, O=out_dim, aggregate=agg[aggregate]) ctx.save_for_backward(output, point_features, center_features, scores, knn_idx) ctx.agg = agg[aggregate] return output @staticmethod def backward(ctx, grad_out): """ Args: grad_out (torch.Tensor): (B, out_dim, npoint, K) Returns: grad_scores (torch.Tensor): (B, npoint, K, M) grad_point_features (torch.Tensor): (B, N, M, out_dim) grad_center_features (torch.Tensor): (B, N, M, out_dim) """ _, point_features, center_features, scores, knn_idx = ctx.saved_tensors agg = ctx.agg B, N, M, out_dim = point_features.size() _, npoint, K, _ = scores.size() grad_point_features = point_features.new_zeros(point_features.shape) grad_center_features = center_features.new_zeros(center_features.shape) grad_scores = scores.new_zeros(scores.shape) ext_module.assign_score_withk_backward( grad_out.contiguous(), point_features.contiguous(), center_features.contiguous(), scores.contiguous(), knn_idx.contiguous(), grad_point_features, grad_center_features, grad_scores, B=B, N0=N, N1=npoint, M=M, K=K, O=out_dim, aggregate=agg) return grad_scores, grad_point_features, \ grad_center_features, None, None assign_score_withk = AssignScoreWithK.apply ================================================ FILE: annotator/mmpkg/mmcv/ops/ball_query.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['ball_query_forward']) class BallQuery(Function): """Find nearby points in spherical space.""" @staticmethod def forward(ctx, min_radius: float, max_radius: float, sample_num: int, xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor: """ Args: min_radius (float): minimum radius of the balls. max_radius (float): maximum radius of the balls. sample_num (int): maximum number of features in the balls. xyz (Tensor): (B, N, 3) xyz coordinates of the features. center_xyz (Tensor): (B, npoint, 3) centers of the ball query. Returns: Tensor: (B, npoint, nsample) tensor with the indices of the features that form the query balls. """ assert center_xyz.is_contiguous() assert xyz.is_contiguous() assert min_radius < max_radius B, N, _ = xyz.size() npoint = center_xyz.size(1) idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int) ext_module.ball_query_forward( center_xyz, xyz, idx, b=B, n=N, m=npoint, min_radius=min_radius, max_radius=max_radius, nsample=sample_num) if torch.__version__ != 'parrots': ctx.mark_non_differentiable(idx) return idx @staticmethod def backward(ctx, a=None): return None, None, None, None ball_query = BallQuery.apply ================================================ FILE: annotator/mmpkg/mmcv/ops/bbox.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps']) def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): """Calculate overlap between two set of bboxes. If ``aligned`` is ``False``, then calculate the ious between each bbox of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (Tensor): shape (m, 4) in format or empty. bboxes2 (Tensor): shape (n, 4) in format or empty. If aligned is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or iof (intersection over foreground). Returns: ious(Tensor): shape (m, n) if aligned == False else shape (m, 1) Example: >>> bboxes1 = torch.FloatTensor([ >>> [0, 0, 10, 10], >>> [10, 10, 20, 20], >>> [32, 32, 38, 42], >>> ]) >>> bboxes2 = torch.FloatTensor([ >>> [0, 0, 10, 20], >>> [0, 10, 10, 19], >>> [10, 10, 20, 20], >>> ]) >>> bbox_overlaps(bboxes1, bboxes2) tensor([[0.5000, 0.0000, 0.0000], [0.0000, 0.0000, 1.0000], [0.0000, 0.0000, 0.0000]]) Example: >>> empty = torch.FloatTensor([]) >>> nonempty = torch.FloatTensor([ >>> [0, 0, 10, 9], >>> ]) >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) """ mode_dict = {'iou': 0, 'iof': 1} assert mode in mode_dict.keys() mode_flag = mode_dict[mode] # Either the boxes are empty or the length of boxes' last dimension is 4 assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) assert offset == 1 or offset == 0 rows = bboxes1.size(0) cols = bboxes2.size(0) if aligned: assert rows == cols if rows * cols == 0: return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols) if aligned: ious = bboxes1.new_zeros(rows) else: ious = bboxes1.new_zeros((rows, cols)) ext_module.bbox_overlaps( bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset) return ious ================================================ FILE: annotator/mmpkg/mmcv/ops/border_align.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # modified from # https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py import torch import torch.nn as nn from torch.autograd import Function from torch.autograd.function import once_differentiable from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['border_align_forward', 'border_align_backward']) class BorderAlignFunction(Function): @staticmethod def symbolic(g, input, boxes, pool_size): return g.op( 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) @staticmethod def forward(ctx, input, boxes, pool_size): ctx.pool_size = pool_size ctx.input_shape = input.size() assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]' assert boxes.size(2) == 4, \ 'the last dimension of boxes must be (x1, y1, x2, y2)' assert input.size(1) % 4 == 0, \ 'the channel for input feature must be divisible by factor 4' # [B, C//4, H*W, 4] output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4) output = input.new_zeros(output_shape) # `argmax_idx` only used for backward argmax_idx = input.new_zeros(output_shape).to(torch.int) ext_module.border_align_forward( input, boxes, output, argmax_idx, pool_size=ctx.pool_size) ctx.save_for_backward(boxes, argmax_idx) return output @staticmethod @once_differentiable def backward(ctx, grad_output): boxes, argmax_idx = ctx.saved_tensors grad_input = grad_output.new_zeros(ctx.input_shape) # complex head architecture may cause grad_output uncontiguous grad_output = grad_output.contiguous() ext_module.border_align_backward( grad_output, boxes, argmax_idx, grad_input, pool_size=ctx.pool_size) return grad_input, None, None border_align = BorderAlignFunction.apply class BorderAlign(nn.Module): r"""Border align pooling layer. Applies border_align over the input feature based on predicted bboxes. The details were described in the paper `BorderDet: Border Feature for Dense Object Detection `_. For each border line (e.g. top, left, bottom or right) of each box, border_align does the following: 1. uniformly samples `pool_size`+1 positions on this line, involving \ the start and end points. 2. the corresponding features on these points are computed by \ bilinear interpolation. 3. max pooling over all the `pool_size`+1 positions are used for \ computing pooled feature. Args: pool_size (int): number of positions sampled over the boxes' borders (e.g. top, bottom, left, right). """ def __init__(self, pool_size): super(BorderAlign, self).__init__() self.pool_size = pool_size def forward(self, input, boxes): """ Args: input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, right features respectively. boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). Returns: Tensor: Pooled features with shape [N,C,H*W,4]. The order is (top,left,bottom,right) for the last dimension. """ return border_align(input, boxes, self.pool_size) def __repr__(self): s = self.__class__.__name__ s += f'(pool_size={self.pool_size})' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/box_iou_rotated.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): """Return intersection-over-union (Jaccard index) of boxes. Both sets of boxes are expected to be in (x_center, y_center, width, height, angle) format. If ``aligned`` is ``False``, then calculate the ious between each bbox of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. Arguments: boxes1 (Tensor): rotated bboxes 1. \ It has shape (N, 5), indicating (x, y, w, h, theta) for each row. Note that theta is in radian. boxes2 (Tensor): rotated bboxes 2. \ It has shape (M, 5), indicating (x, y, w, h, theta) for each row. Note that theta is in radian. mode (str): "iou" (intersection over union) or iof (intersection over foreground). Returns: ious(Tensor): shape (N, M) if aligned == False else shape (N,) """ assert mode in ['iou', 'iof'] mode_dict = {'iou': 0, 'iof': 1} mode_flag = mode_dict[mode] rows = bboxes1.size(0) cols = bboxes2.size(0) if aligned: ious = bboxes1.new_zeros(rows) else: ious = bboxes1.new_zeros((rows * cols)) bboxes1 = bboxes1.contiguous() bboxes2 = bboxes2.contiguous() ext_module.box_iou_rotated( bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) if not aligned: ious = ious.view(rows, cols) return ious ================================================ FILE: annotator/mmpkg/mmcv/ops/carafe.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from torch.autograd import Function from torch.nn.modules.module import Module from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward', 'carafe_backward' ]) class CARAFENaiveFunction(Function): @staticmethod def symbolic(g, features, masks, kernel_size, group_size, scale_factor): return g.op( 'mmcv::MMCVCARAFENaive', features, masks, kernel_size_i=kernel_size, group_size_i=group_size, scale_factor_f=scale_factor) @staticmethod def forward(ctx, features, masks, kernel_size, group_size, scale_factor): assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-2) == features.size(-2) * scale_factor assert features.size(1) % group_size == 0 assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 ctx.kernel_size = kernel_size ctx.group_size = group_size ctx.scale_factor = scale_factor ctx.feature_size = features.size() ctx.mask_size = masks.size() n, c, h, w = features.size() output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) ext_module.carafe_naive_forward( features, masks, output, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) if features.requires_grad or masks.requires_grad: ctx.save_for_backward(features, masks) return output @staticmethod def backward(ctx, grad_output): assert grad_output.is_cuda features, masks = ctx.saved_tensors kernel_size = ctx.kernel_size group_size = ctx.group_size scale_factor = ctx.scale_factor grad_input = torch.zeros_like(features) grad_masks = torch.zeros_like(masks) ext_module.carafe_naive_backward( grad_output.contiguous(), features, masks, grad_input, grad_masks, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) return grad_input, grad_masks, None, None, None carafe_naive = CARAFENaiveFunction.apply class CARAFENaive(Module): def __init__(self, kernel_size, group_size, scale_factor): super(CARAFENaive, self).__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) self.kernel_size = kernel_size self.group_size = group_size self.scale_factor = scale_factor def forward(self, features, masks): return carafe_naive(features, masks, self.kernel_size, self.group_size, self.scale_factor) class CARAFEFunction(Function): @staticmethod def symbolic(g, features, masks, kernel_size, group_size, scale_factor): return g.op( 'mmcv::MMCVCARAFE', features, masks, kernel_size_i=kernel_size, group_size_i=group_size, scale_factor_f=scale_factor) @staticmethod def forward(ctx, features, masks, kernel_size, group_size, scale_factor): assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-2) == features.size(-2) * scale_factor assert features.size(1) % group_size == 0 assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 ctx.kernel_size = kernel_size ctx.group_size = group_size ctx.scale_factor = scale_factor ctx.feature_size = features.size() ctx.mask_size = masks.size() n, c, h, w = features.size() output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) routput = features.new_zeros(output.size(), requires_grad=False) rfeatures = features.new_zeros(features.size(), requires_grad=False) rmasks = masks.new_zeros(masks.size(), requires_grad=False) ext_module.carafe_forward( features, masks, rfeatures, routput, rmasks, output, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) if features.requires_grad or masks.requires_grad: ctx.save_for_backward(features, masks, rfeatures) return output @staticmethod def backward(ctx, grad_output): assert grad_output.is_cuda features, masks, rfeatures = ctx.saved_tensors kernel_size = ctx.kernel_size group_size = ctx.group_size scale_factor = ctx.scale_factor rgrad_output = torch.zeros_like(grad_output, requires_grad=False) rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False) rgrad_input = torch.zeros_like(features, requires_grad=False) rgrad_masks = torch.zeros_like(masks, requires_grad=False) grad_input = torch.zeros_like(features, requires_grad=False) grad_masks = torch.zeros_like(masks, requires_grad=False) ext_module.carafe_backward( grad_output.contiguous(), rfeatures, masks, rgrad_output, rgrad_input_hs, rgrad_input, rgrad_masks, grad_input, grad_masks, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) return grad_input, grad_masks, None, None, None carafe = CARAFEFunction.apply class CARAFE(Module): """ CARAFE: Content-Aware ReAssembly of FEatures Please refer to https://arxiv.org/abs/1905.02188 for more details. Args: kernel_size (int): reassemble kernel size group_size (int): reassemble group size scale_factor (int): upsample ratio Returns: upsampled feature map """ def __init__(self, kernel_size, group_size, scale_factor): super(CARAFE, self).__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) self.kernel_size = kernel_size self.group_size = group_size self.scale_factor = scale_factor def forward(self, features, masks): return carafe(features, masks, self.kernel_size, self.group_size, self.scale_factor) @UPSAMPLE_LAYERS.register_module(name='carafe') class CARAFEPack(nn.Module): """A unified package of CARAFE upsampler that contains: 1) channel compressor 2) content encoder 3) CARAFE op. Official implementation of ICCV 2019 paper CARAFE: Content-Aware ReAssembly of FEatures Please refer to https://arxiv.org/abs/1905.02188 for more details. Args: channels (int): input feature channels scale_factor (int): upsample ratio up_kernel (int): kernel size of CARAFE op up_group (int): group size of CARAFE op encoder_kernel (int): kernel size of content encoder encoder_dilation (int): dilation of content encoder compressed_channels (int): output channels of channels compressor Returns: upsampled feature map """ def __init__(self, channels, scale_factor, up_kernel=5, up_group=1, encoder_kernel=3, encoder_dilation=1, compressed_channels=64): super(CARAFEPack, self).__init__() self.channels = channels self.scale_factor = scale_factor self.up_kernel = up_kernel self.up_group = up_group self.encoder_kernel = encoder_kernel self.encoder_dilation = encoder_dilation self.compressed_channels = compressed_channels self.channel_compressor = nn.Conv2d(channels, self.compressed_channels, 1) self.content_encoder = nn.Conv2d( self.compressed_channels, self.up_kernel * self.up_kernel * self.up_group * self.scale_factor * self.scale_factor, self.encoder_kernel, padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2), dilation=self.encoder_dilation, groups=1) self.init_weights() def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): xavier_init(m, distribution='uniform') normal_init(self.content_encoder, std=0.001) def kernel_normalizer(self, mask): mask = F.pixel_shuffle(mask, self.scale_factor) n, mask_c, h, w = mask.size() # use float division explicitly, # to void inconsistency while exporting to onnx mask_channel = int(mask_c / float(self.up_kernel**2)) mask = mask.view(n, mask_channel, -1, h, w) mask = F.softmax(mask, dim=2, dtype=mask.dtype) mask = mask.view(n, mask_c, h, w).contiguous() return mask def feature_reassemble(self, x, mask): x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) return x def forward(self, x): compressed_x = self.channel_compressor(x) mask = self.content_encoder(compressed_x) mask = self.kernel_normalizer(mask) x = self.feature_reassemble(x, mask) return x ================================================ FILE: annotator/mmpkg/mmcv/ops/cc_attention.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import PLUGIN_LAYERS, Scale def NEG_INF_DIAG(n, device): """Returns a diagonal matrix of size [n, n]. The diagonal are all "-inf". This is for avoiding calculating the overlapped element in the Criss-Cross twice. """ return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0) @PLUGIN_LAYERS.register_module() class CrissCrossAttention(nn.Module): """Criss-Cross Attention Module. .. note:: Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch to a pure PyTorch and equivalent implementation. For more details, please refer to https://github.com/open-mmlab/mmcv/pull/1201. Speed comparison for one forward pass - Input size: [2,512,97,97] - Device: 1 NVIDIA GeForce RTX 2080 Ti +-----------------------+---------------+------------+---------------+ | |PyTorch version|CUDA version|Relative speed | +=======================+===============+============+===============+ |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x | +-----------------------+---------------+------------+---------------+ |no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x | +-----------------------+---------------+------------+---------------+ Args: in_channels (int): Channels of the input feature map. """ def __init__(self, in_channels): super().__init__() self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.value_conv = nn.Conv2d(in_channels, in_channels, 1) self.gamma = Scale(0.) self.in_channels = in_channels def forward(self, x): """forward function of Criss-Cross Attention. Args: x (Tensor): Input feature. \ shape (batch_size, in_channels, height, width) Returns: Tensor: Output of the layer, with shape of \ (batch_size, in_channels, height, width) """ B, C, H, W = x.size() query = self.query_conv(x) key = self.key_conv(x) value = self.value_conv(x) energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG( H, query.device) energy_H = energy_H.transpose(1, 2) energy_W = torch.einsum('bchw,bchj->bhwj', query, key) attn = F.softmax( torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)] out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H]) out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:]) out = self.gamma(out) + x out = out.contiguous() return out def __repr__(self): s = self.__class__.__name__ s += f'(in_channels={self.in_channels})' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/contour_expand.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['contour_expand']) def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, kernel_num): """Expand kernel contours so that foreground pixels are assigned into instances. Arguments: kernel_mask (np.array or Tensor): The instance kernel mask with size hxw. internal_kernel_label (np.array or Tensor): The instance internal kernel label with size hxw. min_kernel_area (int): The minimum kernel area. kernel_num (int): The instance kernel number. Returns: label (list): The instance index map with size hxw. """ assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) assert isinstance(min_kernel_area, int) assert isinstance(kernel_num, int) if isinstance(kernel_mask, np.ndarray): kernel_mask = torch.from_numpy(kernel_mask) if isinstance(internal_kernel_label, np.ndarray): internal_kernel_label = torch.from_numpy(internal_kernel_label) if torch.__version__ == 'parrots': if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0: label = [] else: label = ext_module.contour_expand( kernel_mask, internal_kernel_label, min_kernel_area=min_kernel_area, kernel_num=kernel_num) label = label.tolist() else: label = ext_module.contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, kernel_num) return label ================================================ FILE: annotator/mmpkg/mmcv/ops/corner_pool.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ 'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward', 'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward', 'right_pool_forward', 'right_pool_backward' ]) _mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} class TopPoolFunction(Function): @staticmethod def symbolic(g, input): output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top'])) return output @staticmethod def forward(ctx, input): output = ext_module.top_pool_forward(input) ctx.save_for_backward(input) return output @staticmethod def backward(ctx, grad_output): input, = ctx.saved_tensors output = ext_module.top_pool_backward(input, grad_output) return output class BottomPoolFunction(Function): @staticmethod def symbolic(g, input): output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom'])) return output @staticmethod def forward(ctx, input): output = ext_module.bottom_pool_forward(input) ctx.save_for_backward(input) return output @staticmethod def backward(ctx, grad_output): input, = ctx.saved_tensors output = ext_module.bottom_pool_backward(input, grad_output) return output class LeftPoolFunction(Function): @staticmethod def symbolic(g, input): output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left'])) return output @staticmethod def forward(ctx, input): output = ext_module.left_pool_forward(input) ctx.save_for_backward(input) return output @staticmethod def backward(ctx, grad_output): input, = ctx.saved_tensors output = ext_module.left_pool_backward(input, grad_output) return output class RightPoolFunction(Function): @staticmethod def symbolic(g, input): output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right'])) return output @staticmethod def forward(ctx, input): output = ext_module.right_pool_forward(input) ctx.save_for_backward(input) return output @staticmethod def backward(ctx, grad_output): input, = ctx.saved_tensors output = ext_module.right_pool_backward(input, grad_output) return output class CornerPool(nn.Module): """Corner Pooling. Corner Pooling is a new type of pooling layer that helps a convolutional network better localize corners of bounding boxes. Please refer to https://arxiv.org/abs/1808.01244 for more details. Code is modified from https://github.com/princeton-vl/CornerNet-Lite. Args: mode(str): Pooling orientation for the pooling layer - 'bottom': Bottom Pooling - 'left': Left Pooling - 'right': Right Pooling - 'top': Top Pooling Returns: Feature map after pooling. """ pool_functions = { 'bottom': BottomPoolFunction, 'left': LeftPoolFunction, 'right': RightPoolFunction, 'top': TopPoolFunction, } cummax_dim_flip = { 'bottom': (2, False), 'left': (3, True), 'right': (3, False), 'top': (2, True), } def __init__(self, mode): super(CornerPool, self).__init__() assert mode in self.pool_functions self.mode = mode self.corner_pool = self.pool_functions[mode] def forward(self, x): if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0': if torch.onnx.is_in_onnx_export(): assert torch.__version__ >= '1.7.0', \ 'When `cummax` serves as an intermediate component whose '\ 'outputs is used as inputs for another modules, it\'s '\ 'expected that pytorch version must be >= 1.7.0, '\ 'otherwise Error appears like: `RuntimeError: tuple '\ 'appears in op that does not forward tuples, unsupported '\ 'kind: prim::PythonOp`.' dim, flip = self.cummax_dim_flip[self.mode] if flip: x = x.flip(dim) pool_tensor, _ = torch.cummax(x, dim=dim) if flip: pool_tensor = pool_tensor.flip(dim) return pool_tensor else: return self.corner_pool.apply(x) ================================================ FILE: annotator/mmpkg/mmcv/ops/correlation.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import Tensor, nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['correlation_forward', 'correlation_backward']) class CorrelationFunction(Function): @staticmethod def forward(ctx, input1, input2, kernel_size=1, max_displacement=1, stride=1, padding=1, dilation=1, dilation_patch=1): ctx.save_for_backward(input1, input2) kH, kW = ctx.kernel_size = _pair(kernel_size) patch_size = max_displacement * 2 + 1 ctx.patch_size = patch_size dH, dW = ctx.stride = _pair(stride) padH, padW = ctx.padding = _pair(padding) dilationH, dilationW = ctx.dilation = _pair(dilation) dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair( dilation_patch) output_size = CorrelationFunction._output_size(ctx, input1) output = input1.new_zeros(output_size) ext_module.correlation_forward( input1, input2, output, kH=kH, kW=kW, patchH=patch_size, patchW=patch_size, padH=padH, padW=padW, dilationH=dilationH, dilationW=dilationW, dilation_patchH=dilation_patchH, dilation_patchW=dilation_patchW, dH=dH, dW=dW) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input1, input2 = ctx.saved_tensors kH, kW = ctx.kernel_size patch_size = ctx.patch_size padH, padW = ctx.padding dilationH, dilationW = ctx.dilation dilation_patchH, dilation_patchW = ctx.dilation_patch dH, dW = ctx.stride grad_input1 = torch.zeros_like(input1) grad_input2 = torch.zeros_like(input2) ext_module.correlation_backward( grad_output, input1, input2, grad_input1, grad_input2, kH=kH, kW=kW, patchH=patch_size, patchW=patch_size, padH=padH, padW=padW, dilationH=dilationH, dilationW=dilationW, dilation_patchH=dilation_patchH, dilation_patchW=dilation_patchW, dH=dH, dW=dW) return grad_input1, grad_input2, None, None, None, None, None, None @staticmethod def _output_size(ctx, input1): iH, iW = input1.size(2), input1.size(3) batch_size = input1.size(0) kH, kW = ctx.kernel_size patch_size = ctx.patch_size dH, dW = ctx.stride padH, padW = ctx.padding dilationH, dilationW = ctx.dilation dilatedKH = (kH - 1) * dilationH + 1 dilatedKW = (kW - 1) * dilationW + 1 oH = int((iH + 2 * padH - dilatedKH) / dH + 1) oW = int((iW + 2 * padW - dilatedKW) / dW + 1) output_size = (batch_size, patch_size, patch_size, oH, oW) return output_size class Correlation(nn.Module): r"""Correlation operator This correlation operator works for optical flow correlation computation. There are two batched tensors with shape :math:`(N, C, H, W)`, and the correlation output's shape is :math:`(N, max\_displacement \times 2 + 1, max\_displacement * 2 + 1, H_{out}, W_{out})` where .. math:: H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding - dilation \times (kernel\_size - 1) - 1} {stride} + 1\right\rfloor .. math:: W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation \times (kernel\_size - 1) - 1} {stride} + 1\right\rfloor the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding window convolution between input1 and shifted input2, .. math:: Corr(N_i, dx, dy) = \sum_{c=0}^{C-1} input1(N_i, c) \star \mathcal{S}(input2(N_i, c), dy, dx) where :math:`\star` is the valid 2d sliding window convolution operator, and :math:`\mathcal{S}` means shifting the input features (auto-complete zero marginal), and :math:`dx, dy` are shifting distance, :math:`dx, dy \in [-max\_displacement \times dilation\_patch, max\_displacement \times dilation\_patch]`. Args: kernel_size (int): The size of sliding window i.e. local neighborhood representing the center points and involved in correlation computation. Defaults to 1. max_displacement (int): The radius for computing correlation volume, but the actual working space can be dilated by dilation_patch. Defaults to 1. stride (int): The stride of the sliding blocks in the input spatial dimensions. Defaults to 1. padding (int): Zero padding added to all four sides of the input1. Defaults to 0. dilation (int): The spacing of local neighborhood that will involved in correlation. Defaults to 1. dilation_patch (int): The spacing between position need to compute correlation. Defaults to 1. """ def __init__(self, kernel_size: int = 1, max_displacement: int = 1, stride: int = 1, padding: int = 0, dilation: int = 1, dilation_patch: int = 1) -> None: super().__init__() self.kernel_size = kernel_size self.max_displacement = max_displacement self.stride = stride self.padding = padding self.dilation = dilation self.dilation_patch = dilation_patch def forward(self, input1: Tensor, input2: Tensor) -> Tensor: return CorrelationFunction.apply(input1, input2, self.kernel_size, self.max_displacement, self.stride, self.padding, self.dilation, self.dilation_patch) def __repr__(self) -> str: s = self.__class__.__name__ s += f'(kernel_size={self.kernel_size}, ' s += f'max_displacement={self.max_displacement}, ' s += f'stride={self.stride}, ' s += f'padding={self.padding}, ' s += f'dilation={self.dilation}, ' s += f'dilation_patch={self.dilation_patch})' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/deform_conv.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple, Union import torch import torch.nn as nn import torch.nn.functional as F from torch import Tensor from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair, _single from annotator.mmpkg.mmcv.utils import deprecated_api_warning from ..cnn import CONV_LAYERS from ..utils import ext_loader, print_log ext_module = ext_loader.load_ext('_ext', [ 'deform_conv_forward', 'deform_conv_backward_input', 'deform_conv_backward_parameters' ]) class DeformConv2dFunction(Function): @staticmethod def symbolic(g, input, offset, weight, stride, padding, dilation, groups, deform_groups, bias=False, im2col_step=32): return g.op( 'mmcv::MMCVDeformConv2d', input, offset, weight, stride_i=stride, padding_i=padding, dilation_i=dilation, groups_i=groups, deform_groups_i=deform_groups, bias_i=bias, im2col_step_i=im2col_step) @staticmethod def forward(ctx, input, offset, weight, stride=1, padding=0, dilation=1, groups=1, deform_groups=1, bias=False, im2col_step=32): if input is not None and input.dim() != 4: raise ValueError( f'Expected 4D tensor as input, got {input.dim()}D tensor \ instead.') assert bias is False, 'Only support bias is False.' ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.groups = groups ctx.deform_groups = deform_groups ctx.im2col_step = im2col_step # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; # amp won't cast the type of model (float32), but "offset" is cast # to float16 by nn.Conv2d automatically, leading to the type # mismatch with input (when it is float32) or weight. # The flag for whether to use fp16 or amp is the type of "offset", # we cast weight and input to temporarily support fp16 and amp # whatever the pytorch version is. input = input.type_as(offset) weight = weight.type_as(input) ctx.save_for_backward(input, offset, weight) output = input.new_empty( DeformConv2dFunction._output_size(ctx, input, weight)) ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones cur_im2col_step = min(ctx.im2col_step, input.size(0)) assert (input.size(0) % cur_im2col_step) == 0, 'im2col step must divide batchsize' ext_module.deform_conv_forward( input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], kW=weight.size(3), kH=weight.size(2), dW=ctx.stride[1], dH=ctx.stride[0], padW=ctx.padding[1], padH=ctx.padding[0], dilationW=ctx.dilation[1], dilationH=ctx.dilation[0], group=ctx.groups, deformable_group=ctx.deform_groups, im2col_step=cur_im2col_step) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, offset, weight = ctx.saved_tensors grad_input = grad_offset = grad_weight = None cur_im2col_step = min(ctx.im2col_step, input.size(0)) assert (input.size(0) % cur_im2col_step ) == 0, 'batch size must be divisible by im2col_step' grad_output = grad_output.contiguous() if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: grad_input = torch.zeros_like(input) grad_offset = torch.zeros_like(offset) ext_module.deform_conv_backward_input( input, offset, grad_output, grad_input, grad_offset, weight, ctx.bufs_[0], kW=weight.size(3), kH=weight.size(2), dW=ctx.stride[1], dH=ctx.stride[0], padW=ctx.padding[1], padH=ctx.padding[0], dilationW=ctx.dilation[1], dilationH=ctx.dilation[0], group=ctx.groups, deformable_group=ctx.deform_groups, im2col_step=cur_im2col_step) if ctx.needs_input_grad[2]: grad_weight = torch.zeros_like(weight) ext_module.deform_conv_backward_parameters( input, offset, grad_output, grad_weight, ctx.bufs_[0], ctx.bufs_[1], kW=weight.size(3), kH=weight.size(2), dW=ctx.stride[1], dH=ctx.stride[0], padW=ctx.padding[1], padH=ctx.padding[0], dilationW=ctx.dilation[1], dilationH=ctx.dilation[0], group=ctx.groups, deformable_group=ctx.deform_groups, scale=1, im2col_step=cur_im2col_step) return grad_input, grad_offset, grad_weight, \ None, None, None, None, None, None, None @staticmethod def _output_size(ctx, input, weight): channels = weight.size(0) output_size = (input.size(0), channels) for d in range(input.dim() - 2): in_size = input.size(d + 2) pad = ctx.padding[d] kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 stride_ = ctx.stride[d] output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) if not all(map(lambda s: s > 0, output_size)): raise ValueError( 'convolution input is too small (output would be ' + 'x'.join(map(str, output_size)) + ')') return output_size deform_conv2d = DeformConv2dFunction.apply class DeformConv2d(nn.Module): r"""Deformable 2D convolution. Applies a deformable 2D convolution over an input signal composed of several input planes. DeformConv2d was described in the paper `Deformable Convolutional Networks `_ Note: The argument ``im2col_step`` was added in version 1.3.17, which means number of samples processed by the ``im2col_cuda_kernel`` per call. It enables users to define ``batch_size`` and ``im2col_step`` more flexibly and solved `issue mmcv#1440 `_. Args: in_channels (int): Number of channels in the input image. out_channels (int): Number of channels produced by the convolution. kernel_size(int, tuple): Size of the convolving kernel. stride(int, tuple): Stride of the convolution. Default: 1. padding (int or tuple): Zero-padding added to both sides of the input. Default: 0. dilation (int or tuple): Spacing between kernel elements. Default: 1. groups (int): Number of blocked connections from input. channels to output channels. Default: 1. deform_groups (int): Number of deformable group partitions. bias (bool): If True, adds a learnable bias to the output. Default: False. im2col_step (int): Number of samples processed by im2col_cuda_kernel per call. It will work when ``batch_size`` > ``im2col_step``, but ``batch_size`` must be divisible by ``im2col_step``. Default: 32. `New in version 1.3.17.` """ @deprecated_api_warning({'deformable_groups': 'deform_groups'}, cls_name='DeformConv2d') def __init__(self, in_channels: int, out_channels: int, kernel_size: Union[int, Tuple[int, ...]], stride: Union[int, Tuple[int, ...]] = 1, padding: Union[int, Tuple[int, ...]] = 0, dilation: Union[int, Tuple[int, ...]] = 1, groups: int = 1, deform_groups: int = 1, bias: bool = False, im2col_step: int = 32) -> None: super(DeformConv2d, self).__init__() assert not bias, \ f'bias={bias} is not supported in DeformConv2d.' assert in_channels % groups == 0, \ f'in_channels {in_channels} cannot be divisible by groups {groups}' assert out_channels % groups == 0, \ f'out_channels {out_channels} cannot be divisible by groups \ {groups}' self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = _pair(stride) self.padding = _pair(padding) self.dilation = _pair(dilation) self.groups = groups self.deform_groups = deform_groups self.im2col_step = im2col_step # enable compatibility with nn.Conv2d self.transposed = False self.output_padding = _single(0) # only weight, no bias self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)) self.reset_parameters() def reset_parameters(self): # switch the initialization of `self.weight` to the standard kaiming # method described in `Delving deep into rectifiers: Surpassing # human-level performance on ImageNet classification` - He, K. et al. # (2015), using a uniform distribution nn.init.kaiming_uniform_(self.weight, nonlinearity='relu') def forward(self, x: Tensor, offset: Tensor) -> Tensor: """Deformable Convolutional forward function. Args: x (Tensor): Input feature, shape (B, C_in, H_in, W_in) offset (Tensor): Offset for deformable convolution, shape (B, deform_groups*kernel_size[0]*kernel_size[1]*2, H_out, W_out), H_out, W_out are equal to the output's. An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. The spatial arrangement is like: .. code:: text (x0, y0) (x1, y1) (x2, y2) (x3, y3) (x4, y4) (x5, y5) (x6, y6) (x7, y7) (x8, y8) Returns: Tensor: Output of the layer. """ # To fix an assert error in deform_conv_cuda.cpp:128 # input image is smaller than kernel input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) < self.kernel_size[1]) if input_pad: pad_h = max(self.kernel_size[0] - x.size(2), 0) pad_w = max(self.kernel_size[1] - x.size(3), 0) x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous() offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0) offset = offset.contiguous() out = deform_conv2d(x, offset, self.weight, self.stride, self.padding, self.dilation, self.groups, self.deform_groups, False, self.im2col_step) if input_pad: out = out[:, :, :out.size(2) - pad_h, :out.size(3) - pad_w].contiguous() return out def __repr__(self): s = self.__class__.__name__ s += f'(in_channels={self.in_channels},\n' s += f'out_channels={self.out_channels},\n' s += f'kernel_size={self.kernel_size},\n' s += f'stride={self.stride},\n' s += f'padding={self.padding},\n' s += f'dilation={self.dilation},\n' s += f'groups={self.groups},\n' s += f'deform_groups={self.deform_groups},\n' # bias is not supported in DeformConv2d. s += 'bias=False)' return s @CONV_LAYERS.register_module('DCN') class DeformConv2dPack(DeformConv2d): """A Deformable Conv Encapsulation that acts as normal Conv layers. The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. The spatial arrangement is like: .. code:: text (x0, y0) (x1, y1) (x2, y2) (x3, y3) (x4, y4) (x5, y5) (x6, y6) (x7, y7) (x8, y8) Args: in_channels (int): Same as nn.Conv2d. out_channels (int): Same as nn.Conv2d. kernel_size (int or tuple[int]): Same as nn.Conv2d. stride (int or tuple[int]): Same as nn.Conv2d. padding (int or tuple[int]): Same as nn.Conv2d. dilation (int or tuple[int]): Same as nn.Conv2d. groups (int): Same as nn.Conv2d. bias (bool or str): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if norm_cfg is None, otherwise False. """ _version = 2 def __init__(self, *args, **kwargs): super(DeformConv2dPack, self).__init__(*args, **kwargs) self.conv_offset = nn.Conv2d( self.in_channels, self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1], kernel_size=self.kernel_size, stride=_pair(self.stride), padding=_pair(self.padding), dilation=_pair(self.dilation), bias=True) self.init_offset() def init_offset(self): self.conv_offset.weight.data.zero_() self.conv_offset.bias.data.zero_() def forward(self, x): offset = self.conv_offset(x) return deform_conv2d(x, offset, self.weight, self.stride, self.padding, self.dilation, self.groups, self.deform_groups, False, self.im2col_step) def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): version = local_metadata.get('version', None) if version is None or version < 2: # the key is different in early versions # In version < 2, DeformConvPack loads previous benchmark models. if (prefix + 'conv_offset.weight' not in state_dict and prefix[:-1] + '_offset.weight' in state_dict): state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( prefix[:-1] + '_offset.weight') if (prefix + 'conv_offset.bias' not in state_dict and prefix[:-1] + '_offset.bias' in state_dict): state_dict[prefix + 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + '_offset.bias') if version is not None and version > 1: print_log( f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to ' 'version 2.', logger='root') super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) ================================================ FILE: annotator/mmpkg/mmcv/ops/deform_roi_pool.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['deform_roi_pool_forward', 'deform_roi_pool_backward']) class DeformRoIPoolFunction(Function): @staticmethod def symbolic(g, input, rois, offset, output_size, spatial_scale, sampling_ratio, gamma): return g.op( 'mmcv::MMCVDeformRoIPool', input, rois, offset, pooled_height_i=output_size[0], pooled_width_i=output_size[1], spatial_scale_f=spatial_scale, sampling_ratio_f=sampling_ratio, gamma_f=gamma) @staticmethod def forward(ctx, input, rois, offset, output_size, spatial_scale=1.0, sampling_ratio=0, gamma=0.1): if offset is None: offset = input.new_zeros(0) ctx.output_size = _pair(output_size) ctx.spatial_scale = float(spatial_scale) ctx.sampling_ratio = int(sampling_ratio) ctx.gamma = float(gamma) assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' output_shape = (rois.size(0), input.size(1), ctx.output_size[0], ctx.output_size[1]) output = input.new_zeros(output_shape) ext_module.deform_roi_pool_forward( input, rois, offset, output, pooled_height=ctx.output_size[0], pooled_width=ctx.output_size[1], spatial_scale=ctx.spatial_scale, sampling_ratio=ctx.sampling_ratio, gamma=ctx.gamma) ctx.save_for_backward(input, rois, offset) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, rois, offset = ctx.saved_tensors grad_input = grad_output.new_zeros(input.shape) grad_offset = grad_output.new_zeros(offset.shape) ext_module.deform_roi_pool_backward( grad_output, input, rois, offset, grad_input, grad_offset, pooled_height=ctx.output_size[0], pooled_width=ctx.output_size[1], spatial_scale=ctx.spatial_scale, sampling_ratio=ctx.sampling_ratio, gamma=ctx.gamma) if grad_offset.numel() == 0: grad_offset = None return grad_input, None, grad_offset, None, None, None, None deform_roi_pool = DeformRoIPoolFunction.apply class DeformRoIPool(nn.Module): def __init__(self, output_size, spatial_scale=1.0, sampling_ratio=0, gamma=0.1): super(DeformRoIPool, self).__init__() self.output_size = _pair(output_size) self.spatial_scale = float(spatial_scale) self.sampling_ratio = int(sampling_ratio) self.gamma = float(gamma) def forward(self, input, rois, offset=None): return deform_roi_pool(input, rois, offset, self.output_size, self.spatial_scale, self.sampling_ratio, self.gamma) class DeformRoIPoolPack(DeformRoIPool): def __init__(self, output_size, output_channels, deform_fc_channels=1024, spatial_scale=1.0, sampling_ratio=0, gamma=0.1): super(DeformRoIPoolPack, self).__init__(output_size, spatial_scale, sampling_ratio, gamma) self.output_channels = output_channels self.deform_fc_channels = deform_fc_channels self.offset_fc = nn.Sequential( nn.Linear( self.output_size[0] * self.output_size[1] * self.output_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.output_size[0] * self.output_size[1] * 2)) self.offset_fc[-1].weight.data.zero_() self.offset_fc[-1].bias.data.zero_() def forward(self, input, rois): assert input.size(1) == self.output_channels x = deform_roi_pool(input, rois, None, self.output_size, self.spatial_scale, self.sampling_ratio, self.gamma) rois_num = rois.size(0) offset = self.offset_fc(x.view(rois_num, -1)) offset = offset.view(rois_num, 2, self.output_size[0], self.output_size[1]) return deform_roi_pool(input, rois, offset, self.output_size, self.spatial_scale, self.sampling_ratio, self.gamma) class ModulatedDeformRoIPoolPack(DeformRoIPool): def __init__(self, output_size, output_channels, deform_fc_channels=1024, spatial_scale=1.0, sampling_ratio=0, gamma=0.1): super(ModulatedDeformRoIPoolPack, self).__init__(output_size, spatial_scale, sampling_ratio, gamma) self.output_channels = output_channels self.deform_fc_channels = deform_fc_channels self.offset_fc = nn.Sequential( nn.Linear( self.output_size[0] * self.output_size[1] * self.output_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.output_size[0] * self.output_size[1] * 2)) self.offset_fc[-1].weight.data.zero_() self.offset_fc[-1].bias.data.zero_() self.mask_fc = nn.Sequential( nn.Linear( self.output_size[0] * self.output_size[1] * self.output_channels, self.deform_fc_channels), nn.ReLU(inplace=True), nn.Linear(self.deform_fc_channels, self.output_size[0] * self.output_size[1] * 1), nn.Sigmoid()) self.mask_fc[2].weight.data.zero_() self.mask_fc[2].bias.data.zero_() def forward(self, input, rois): assert input.size(1) == self.output_channels x = deform_roi_pool(input, rois, None, self.output_size, self.spatial_scale, self.sampling_ratio, self.gamma) rois_num = rois.size(0) offset = self.offset_fc(x.view(rois_num, -1)) offset = offset.view(rois_num, 2, self.output_size[0], self.output_size[1]) mask = self.mask_fc(x.view(rois_num, -1)) mask = mask.view(rois_num, 1, self.output_size[0], self.output_size[1]) d = deform_roi_pool(input, rois, offset, self.output_size, self.spatial_scale, self.sampling_ratio, self.gamma) return d * mask ================================================ FILE: annotator/mmpkg/mmcv/ops/deprecated_wrappers.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # This file is for backward compatibility. # Module wrappers for empty tensor have been moved to mmcv.cnn.bricks. import warnings from ..cnn.bricks.wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d class Conv2d_deprecated(Conv2d): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) warnings.warn( 'Importing Conv2d wrapper from "mmcv.ops" will be deprecated in' ' the future. Please import them from "mmcv.cnn" instead') class ConvTranspose2d_deprecated(ConvTranspose2d): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) warnings.warn( 'Importing ConvTranspose2d wrapper from "mmcv.ops" will be ' 'deprecated in the future. Please import them from "mmcv.cnn" ' 'instead') class MaxPool2d_deprecated(MaxPool2d): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) warnings.warn( 'Importing MaxPool2d wrapper from "mmcv.ops" will be deprecated in' ' the future. Please import them from "mmcv.cnn" instead') class Linear_deprecated(Linear): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) warnings.warn( 'Importing Linear wrapper from "mmcv.ops" will be deprecated in' ' the future. Please import them from "mmcv.cnn" instead') ================================================ FILE: annotator/mmpkg/mmcv/ops/focal_loss.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn from torch.autograd import Function from torch.autograd.function import once_differentiable from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ 'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward', 'softmax_focal_loss_forward', 'softmax_focal_loss_backward' ]) class SigmoidFocalLossFunction(Function): @staticmethod def symbolic(g, input, target, gamma, alpha, weight, reduction): return g.op( 'mmcv::MMCVSigmoidFocalLoss', input, target, gamma_f=gamma, alpha_f=alpha, weight_f=weight, reduction_s=reduction) @staticmethod def forward(ctx, input, target, gamma=2.0, alpha=0.25, weight=None, reduction='mean'): assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) assert input.dim() == 2 assert target.dim() == 1 assert input.size(0) == target.size(0) if weight is None: weight = input.new_empty(0) else: assert weight.dim() == 1 assert input.size(1) == weight.size(0) ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} assert reduction in ctx.reduction_dict.keys() ctx.gamma = float(gamma) ctx.alpha = float(alpha) ctx.reduction = ctx.reduction_dict[reduction] output = input.new_zeros(input.size()) ext_module.sigmoid_focal_loss_forward( input, target, weight, output, gamma=ctx.gamma, alpha=ctx.alpha) if ctx.reduction == ctx.reduction_dict['mean']: output = output.sum() / input.size(0) elif ctx.reduction == ctx.reduction_dict['sum']: output = output.sum() ctx.save_for_backward(input, target, weight) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, target, weight = ctx.saved_tensors grad_input = input.new_zeros(input.size()) ext_module.sigmoid_focal_loss_backward( input, target, weight, grad_input, gamma=ctx.gamma, alpha=ctx.alpha) grad_input *= grad_output if ctx.reduction == ctx.reduction_dict['mean']: grad_input /= input.size(0) return grad_input, None, None, None, None, None sigmoid_focal_loss = SigmoidFocalLossFunction.apply class SigmoidFocalLoss(nn.Module): def __init__(self, gamma, alpha, weight=None, reduction='mean'): super(SigmoidFocalLoss, self).__init__() self.gamma = gamma self.alpha = alpha self.register_buffer('weight', weight) self.reduction = reduction def forward(self, input, target): return sigmoid_focal_loss(input, target, self.gamma, self.alpha, self.weight, self.reduction) def __repr__(self): s = self.__class__.__name__ s += f'(gamma={self.gamma}, ' s += f'alpha={self.alpha}, ' s += f'reduction={self.reduction})' return s class SoftmaxFocalLossFunction(Function): @staticmethod def symbolic(g, input, target, gamma, alpha, weight, reduction): return g.op( 'mmcv::MMCVSoftmaxFocalLoss', input, target, gamma_f=gamma, alpha_f=alpha, weight_f=weight, reduction_s=reduction) @staticmethod def forward(ctx, input, target, gamma=2.0, alpha=0.25, weight=None, reduction='mean'): assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) assert input.dim() == 2 assert target.dim() == 1 assert input.size(0) == target.size(0) if weight is None: weight = input.new_empty(0) else: assert weight.dim() == 1 assert input.size(1) == weight.size(0) ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} assert reduction in ctx.reduction_dict.keys() ctx.gamma = float(gamma) ctx.alpha = float(alpha) ctx.reduction = ctx.reduction_dict[reduction] channel_stats, _ = torch.max(input, dim=1) input_softmax = input - channel_stats.unsqueeze(1).expand_as(input) input_softmax.exp_() channel_stats = input_softmax.sum(dim=1) input_softmax /= channel_stats.unsqueeze(1).expand_as(input) output = input.new_zeros(input.size(0)) ext_module.softmax_focal_loss_forward( input_softmax, target, weight, output, gamma=ctx.gamma, alpha=ctx.alpha) if ctx.reduction == ctx.reduction_dict['mean']: output = output.sum() / input.size(0) elif ctx.reduction == ctx.reduction_dict['sum']: output = output.sum() ctx.save_for_backward(input_softmax, target, weight) return output @staticmethod def backward(ctx, grad_output): input_softmax, target, weight = ctx.saved_tensors buff = input_softmax.new_zeros(input_softmax.size(0)) grad_input = input_softmax.new_zeros(input_softmax.size()) ext_module.softmax_focal_loss_backward( input_softmax, target, weight, buff, grad_input, gamma=ctx.gamma, alpha=ctx.alpha) grad_input *= grad_output if ctx.reduction == ctx.reduction_dict['mean']: grad_input /= input_softmax.size(0) return grad_input, None, None, None, None, None softmax_focal_loss = SoftmaxFocalLossFunction.apply class SoftmaxFocalLoss(nn.Module): def __init__(self, gamma, alpha, weight=None, reduction='mean'): super(SoftmaxFocalLoss, self).__init__() self.gamma = gamma self.alpha = alpha self.register_buffer('weight', weight) self.reduction = reduction def forward(self, input, target): return softmax_focal_loss(input, target, self.gamma, self.alpha, self.weight, self.reduction) def __repr__(self): s = self.__class__.__name__ s += f'(gamma={self.gamma}, ' s += f'alpha={self.alpha}, ' s += f'reduction={self.reduction})' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/furthest_point_sample.py ================================================ import torch from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ 'furthest_point_sampling_forward', 'furthest_point_sampling_with_dist_forward' ]) class FurthestPointSampling(Function): """Uses iterative furthest point sampling to select a set of features whose corresponding points have the furthest distance.""" @staticmethod def forward(ctx, points_xyz: torch.Tensor, num_points: int) -> torch.Tensor: """ Args: points_xyz (Tensor): (B, N, 3) where N > num_points. num_points (int): Number of points in the sampled set. Returns: Tensor: (B, num_points) indices of the sampled points. """ assert points_xyz.is_contiguous() B, N = points_xyz.size()[:2] output = torch.cuda.IntTensor(B, num_points) temp = torch.cuda.FloatTensor(B, N).fill_(1e10) ext_module.furthest_point_sampling_forward( points_xyz, temp, output, b=B, n=N, m=num_points, ) if torch.__version__ != 'parrots': ctx.mark_non_differentiable(output) return output @staticmethod def backward(xyz, a=None): return None, None class FurthestPointSamplingWithDist(Function): """Uses iterative furthest point sampling to select a set of features whose corresponding points have the furthest distance.""" @staticmethod def forward(ctx, points_dist: torch.Tensor, num_points: int) -> torch.Tensor: """ Args: points_dist (Tensor): (B, N, N) Distance between each point pair. num_points (int): Number of points in the sampled set. Returns: Tensor: (B, num_points) indices of the sampled points. """ assert points_dist.is_contiguous() B, N, _ = points_dist.size() output = points_dist.new_zeros([B, num_points], dtype=torch.int32) temp = points_dist.new_zeros([B, N]).fill_(1e10) ext_module.furthest_point_sampling_with_dist_forward( points_dist, temp, output, b=B, n=N, m=num_points) if torch.__version__ != 'parrots': ctx.mark_non_differentiable(output) return output @staticmethod def backward(xyz, a=None): return None, None furthest_point_sample = FurthestPointSampling.apply furthest_point_sample_with_dist = FurthestPointSamplingWithDist.apply ================================================ FILE: annotator/mmpkg/mmcv/ops/fused_bias_leakyrelu.py ================================================ # modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py # noqa:E501 # Copyright (c) 2021, NVIDIA Corporation. All rights reserved. # NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator # Augmentation (ADA) # ======================================================================= # 1. Definitions # "Licensor" means any person or entity that distributes its Work. # "Software" means the original work of authorship made available under # this License. # "Work" means the Software and any additions to or derivative works of # the Software that are made available under this License. # The terms "reproduce," "reproduction," "derivative works," and # "distribution" have the meaning as provided under U.S. copyright law; # provided, however, that for the purposes of this License, derivative # works shall not include works that remain separable from, or merely # link (or bind by name) to the interfaces of, the Work. # Works, including the Software, are "made available" under this License # by including in or with the Work either (a) a copyright notice # referencing the applicability of this License to the Work, or (b) a # copy of this License. # 2. License Grants # 2.1 Copyright Grant. Subject to the terms and conditions of this # License, each Licensor grants to you a perpetual, worldwide, # non-exclusive, royalty-free, copyright license to reproduce, # prepare derivative works of, publicly display, publicly perform, # sublicense and distribute its Work and any resulting derivative # works in any form. # 3. Limitations # 3.1 Redistribution. You may reproduce or distribute the Work only # if (a) you do so under this License, (b) you include a complete # copy of this License with your distribution, and (c) you retain # without modification any copyright, patent, trademark, or # attribution notices that are present in the Work. # 3.2 Derivative Works. You may specify that additional or different # terms apply to the use, reproduction, and distribution of your # derivative works of the Work ("Your Terms") only if (a) Your Terms # provide that the use limitation in Section 3.3 applies to your # derivative works, and (b) you identify the specific derivative # works that are subject to Your Terms. Notwithstanding Your Terms, # this License (including the redistribution requirements in Section # 3.1) will continue to apply to the Work itself. # 3.3 Use Limitation. The Work and any derivative works thereof only # may be used or intended for use non-commercially. Notwithstanding # the foregoing, NVIDIA and its affiliates may use the Work and any # derivative works commercially. As used herein, "non-commercially" # means for research or evaluation purposes only. # 3.4 Patent Claims. If you bring or threaten to bring a patent claim # against any Licensor (including any claim, cross-claim or # counterclaim in a lawsuit) to enforce any patents that you allege # are infringed by any Work, then your rights under this License from # such Licensor (including the grant in Section 2.1) will terminate # immediately. # 3.5 Trademarks. This License does not grant any rights to use any # Licensor’s or its affiliates’ names, logos, or trademarks, except # as necessary to reproduce the notices described in this License. # 3.6 Termination. If you violate any term of this License, then your # rights under this License (including the grant in Section 2.1) will # terminate immediately. # 4. Disclaimer of Warranty. # THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR # NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER # THIS LICENSE. # 5. Limitation of Liability. # EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL # THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE # SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, # INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF # OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK # (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, # LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER # COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF # THE POSSIBILITY OF SUCH DAMAGES. # ======================================================================= import torch import torch.nn.functional as F from torch import nn from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['fused_bias_leakyrelu']) class FusedBiasLeakyReLUFunctionBackward(Function): """Calculate second order deviation. This function is to compute the second order deviation for the fused leaky relu operation. """ @staticmethod def forward(ctx, grad_output, out, negative_slope, scale): ctx.save_for_backward(out) ctx.negative_slope = negative_slope ctx.scale = scale empty = grad_output.new_empty(0) grad_input = ext_module.fused_bias_leakyrelu( grad_output, empty, out, act=3, grad=1, alpha=negative_slope, scale=scale) dim = [0] if grad_input.ndim > 2: dim += list(range(2, grad_input.ndim)) grad_bias = grad_input.sum(dim).detach() return grad_input, grad_bias @staticmethod def backward(ctx, gradgrad_input, gradgrad_bias): out, = ctx.saved_tensors # The second order deviation, in fact, contains two parts, while the # the first part is zero. Thus, we direct consider the second part # which is similar with the first order deviation in implementation. gradgrad_out = ext_module.fused_bias_leakyrelu( gradgrad_input, gradgrad_bias.to(out.dtype), out, act=3, grad=1, alpha=ctx.negative_slope, scale=ctx.scale) return gradgrad_out, None, None, None class FusedBiasLeakyReLUFunction(Function): @staticmethod def forward(ctx, input, bias, negative_slope, scale): empty = input.new_empty(0) out = ext_module.fused_bias_leakyrelu( input, bias, empty, act=3, grad=0, alpha=negative_slope, scale=scale) ctx.save_for_backward(out) ctx.negative_slope = negative_slope ctx.scale = scale return out @staticmethod def backward(ctx, grad_output): out, = ctx.saved_tensors grad_input, grad_bias = FusedBiasLeakyReLUFunctionBackward.apply( grad_output, out, ctx.negative_slope, ctx.scale) return grad_input, grad_bias, None, None class FusedBiasLeakyReLU(nn.Module): """Fused bias leaky ReLU. This function is introduced in the StyleGAN2: http://arxiv.org/abs/1912.04958 The bias term comes from the convolution operation. In addition, to keep the variance of the feature map or gradients unchanged, they also adopt a scale similarly with Kaiming initialization. However, since the :math:`1+{alpha}^2` : is too small, we can just ignore it. Therefore, the final scale is just :math:`\sqrt{2}`:. Of course, you may change it with # noqa: W605, E501 your own scale. TODO: Implement the CPU version. Args: channel (int): The channel number of the feature map. negative_slope (float, optional): Same as nn.LeakyRelu. Defaults to 0.2. scale (float, optional): A scalar to adjust the variance of the feature map. Defaults to 2**0.5. """ def __init__(self, num_channels, negative_slope=0.2, scale=2**0.5): super(FusedBiasLeakyReLU, self).__init__() self.bias = nn.Parameter(torch.zeros(num_channels)) self.negative_slope = negative_slope self.scale = scale def forward(self, input): return fused_bias_leakyrelu(input, self.bias, self.negative_slope, self.scale) def fused_bias_leakyrelu(input, bias, negative_slope=0.2, scale=2**0.5): """Fused bias leaky ReLU function. This function is introduced in the StyleGAN2: http://arxiv.org/abs/1912.04958 The bias term comes from the convolution operation. In addition, to keep the variance of the feature map or gradients unchanged, they also adopt a scale similarly with Kaiming initialization. However, since the :math:`1+{alpha}^2` : is too small, we can just ignore it. Therefore, the final scale is just :math:`\sqrt{2}`:. Of course, you may change it with # noqa: W605, E501 your own scale. Args: input (torch.Tensor): Input feature map. bias (nn.Parameter): The bias from convolution operation. negative_slope (float, optional): Same as nn.LeakyRelu. Defaults to 0.2. scale (float, optional): A scalar to adjust the variance of the feature map. Defaults to 2**0.5. Returns: torch.Tensor: Feature map after non-linear activation. """ if not input.is_cuda: return bias_leakyrelu_ref(input, bias, negative_slope, scale) return FusedBiasLeakyReLUFunction.apply(input, bias.to(input.dtype), negative_slope, scale) def bias_leakyrelu_ref(x, bias, negative_slope=0.2, scale=2**0.5): if bias is not None: assert bias.ndim == 1 assert bias.shape[0] == x.shape[1] x = x + bias.reshape([-1 if i == 1 else 1 for i in range(x.ndim)]) x = F.leaky_relu(x, negative_slope) if scale != 1: x = x * scale return x ================================================ FILE: annotator/mmpkg/mmcv/ops/gather_points.py ================================================ import torch from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['gather_points_forward', 'gather_points_backward']) class GatherPoints(Function): """Gather points with given index.""" @staticmethod def forward(ctx, features: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: """ Args: features (Tensor): (B, C, N) features to gather. indices (Tensor): (B, M) where M is the number of points. Returns: Tensor: (B, C, M) where M is the number of points. """ assert features.is_contiguous() assert indices.is_contiguous() B, npoint = indices.size() _, C, N = features.size() output = torch.cuda.FloatTensor(B, C, npoint) ext_module.gather_points_forward( features, indices, output, b=B, c=C, n=N, npoints=npoint) ctx.for_backwards = (indices, C, N) if torch.__version__ != 'parrots': ctx.mark_non_differentiable(indices) return output @staticmethod def backward(ctx, grad_out): idx, C, N = ctx.for_backwards B, npoint = idx.size() grad_features = torch.cuda.FloatTensor(B, C, N).zero_() grad_out_data = grad_out.data.contiguous() ext_module.gather_points_backward( grad_out_data, idx, grad_features.data, b=B, c=C, n=N, npoints=npoint) return grad_features, None gather_points = GatherPoints.apply ================================================ FILE: annotator/mmpkg/mmcv/ops/group_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple import torch from torch import nn as nn from torch.autograd import Function from ..utils import ext_loader from .ball_query import ball_query from .knn import knn ext_module = ext_loader.load_ext( '_ext', ['group_points_forward', 'group_points_backward']) class QueryAndGroup(nn.Module): """Groups points with a ball query of radius. Args: max_radius (float): The maximum radius of the balls. If None is given, we will use kNN sampling instead of ball query. sample_num (int): Maximum number of features to gather in the ball. min_radius (float, optional): The minimum radius of the balls. Default: 0. use_xyz (bool, optional): Whether to use xyz. Default: True. return_grouped_xyz (bool, optional): Whether to return grouped xyz. Default: False. normalize_xyz (bool, optional): Whether to normalize xyz. Default: False. uniform_sample (bool, optional): Whether to sample uniformly. Default: False return_unique_cnt (bool, optional): Whether to return the count of unique samples. Default: False. return_grouped_idx (bool, optional): Whether to return grouped idx. Default: False. """ def __init__(self, max_radius, sample_num, min_radius=0, use_xyz=True, return_grouped_xyz=False, normalize_xyz=False, uniform_sample=False, return_unique_cnt=False, return_grouped_idx=False): super().__init__() self.max_radius = max_radius self.min_radius = min_radius self.sample_num = sample_num self.use_xyz = use_xyz self.return_grouped_xyz = return_grouped_xyz self.normalize_xyz = normalize_xyz self.uniform_sample = uniform_sample self.return_unique_cnt = return_unique_cnt self.return_grouped_idx = return_grouped_idx if self.return_unique_cnt: assert self.uniform_sample, \ 'uniform_sample should be True when ' \ 'returning the count of unique samples' if self.max_radius is None: assert not self.normalize_xyz, \ 'can not normalize grouped xyz when max_radius is None' def forward(self, points_xyz, center_xyz, features=None): """ Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. center_xyz (Tensor): (B, npoint, 3) coordinates of the centriods. features (Tensor): (B, C, N) Descriptors of the features. Returns: Tensor: (B, 3 + C, npoint, sample_num) Grouped feature. """ # if self.max_radius is None, we will perform kNN instead of ball query # idx is of shape [B, npoint, sample_num] if self.max_radius is None: idx = knn(self.sample_num, points_xyz, center_xyz, False) idx = idx.transpose(1, 2).contiguous() else: idx = ball_query(self.min_radius, self.max_radius, self.sample_num, points_xyz, center_xyz) if self.uniform_sample: unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) for i_batch in range(idx.shape[0]): for i_region in range(idx.shape[1]): unique_ind = torch.unique(idx[i_batch, i_region, :]) num_unique = unique_ind.shape[0] unique_cnt[i_batch, i_region] = num_unique sample_ind = torch.randint( 0, num_unique, (self.sample_num - num_unique, ), dtype=torch.long) all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) idx[i_batch, i_region, :] = all_ind xyz_trans = points_xyz.transpose(1, 2).contiguous() # (B, 3, npoint, sample_num) grouped_xyz = grouping_operation(xyz_trans, idx) grouped_xyz_diff = grouped_xyz - \ center_xyz.transpose(1, 2).unsqueeze(-1) # relative offsets if self.normalize_xyz: grouped_xyz_diff /= self.max_radius if features is not None: grouped_features = grouping_operation(features, idx) if self.use_xyz: # (B, C + 3, npoint, sample_num) new_features = torch.cat([grouped_xyz_diff, grouped_features], dim=1) else: new_features = grouped_features else: assert (self.use_xyz ), 'Cannot have not features and not use xyz as a feature!' new_features = grouped_xyz_diff ret = [new_features] if self.return_grouped_xyz: ret.append(grouped_xyz) if self.return_unique_cnt: ret.append(unique_cnt) if self.return_grouped_idx: ret.append(idx) if len(ret) == 1: return ret[0] else: return tuple(ret) class GroupAll(nn.Module): """Group xyz with feature. Args: use_xyz (bool): Whether to use xyz. """ def __init__(self, use_xyz: bool = True): super().__init__() self.use_xyz = use_xyz def forward(self, xyz: torch.Tensor, new_xyz: torch.Tensor, features: torch.Tensor = None): """ Args: xyz (Tensor): (B, N, 3) xyz coordinates of the features. new_xyz (Tensor): new xyz coordinates of the features. features (Tensor): (B, C, N) features to group. Returns: Tensor: (B, C + 3, 1, N) Grouped feature. """ grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) if features is not None: grouped_features = features.unsqueeze(2) if self.use_xyz: # (B, 3 + C, 1, N) new_features = torch.cat([grouped_xyz, grouped_features], dim=1) else: new_features = grouped_features else: new_features = grouped_xyz return new_features class GroupingOperation(Function): """Group feature with given index.""" @staticmethod def forward(ctx, features: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: """ Args: features (Tensor): (B, C, N) tensor of features to group. indices (Tensor): (B, npoint, nsample) the indices of features to group with. Returns: Tensor: (B, C, npoint, nsample) Grouped features. """ features = features.contiguous() indices = indices.contiguous() B, nfeatures, nsample = indices.size() _, C, N = features.size() output = torch.cuda.FloatTensor(B, C, nfeatures, nsample) ext_module.group_points_forward(B, C, N, nfeatures, nsample, features, indices, output) ctx.for_backwards = (indices, N) return output @staticmethod def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Args: grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients of the output from forward. Returns: Tensor: (B, C, N) gradient of the features. """ idx, N = ctx.for_backwards B, C, npoint, nsample = grad_out.size() grad_features = torch.cuda.FloatTensor(B, C, N).zero_() grad_out_data = grad_out.data.contiguous() ext_module.group_points_backward(B, C, N, npoint, nsample, grad_out_data, idx, grad_features.data) return grad_features, None grouping_operation = GroupingOperation.apply ================================================ FILE: annotator/mmpkg/mmcv/ops/info.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import glob import os import torch if torch.__version__ == 'parrots': import parrots def get_compiler_version(): return 'GCC ' + parrots.version.compiler def get_compiling_cuda_version(): return parrots.version.cuda else: from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['get_compiler_version', 'get_compiling_cuda_version']) def get_compiler_version(): return ext_module.get_compiler_version() def get_compiling_cuda_version(): return ext_module.get_compiling_cuda_version() def get_onnxruntime_op_path(): wildcard = os.path.join( os.path.abspath(os.path.dirname(os.path.dirname(__file__))), '_ext_ort.*.so') paths = glob.glob(wildcard) if len(paths) > 0: return paths[0] else: return '' ================================================ FILE: annotator/mmpkg/mmcv/ops/iou3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ 'iou3d_boxes_iou_bev_forward', 'iou3d_nms_forward', 'iou3d_nms_normal_forward' ]) def boxes_iou_bev(boxes_a, boxes_b): """Calculate boxes IoU in the Bird's Eye View. Args: boxes_a (torch.Tensor): Input boxes a with shape (M, 5). boxes_b (torch.Tensor): Input boxes b with shape (N, 5). Returns: ans_iou (torch.Tensor): IoU result with shape (M, N). """ ans_iou = boxes_a.new_zeros( torch.Size((boxes_a.shape[0], boxes_b.shape[0]))) ext_module.iou3d_boxes_iou_bev_forward(boxes_a.contiguous(), boxes_b.contiguous(), ans_iou) return ans_iou def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None): """NMS function GPU implementation (for BEV boxes). The overlap of two boxes for IoU calculation is defined as the exact overlapping area of the two boxes. In this function, one can also set ``pre_max_size`` and ``post_max_size``. Args: boxes (torch.Tensor): Input boxes with the shape of [N, 5] ([x1, y1, x2, y2, ry]). scores (torch.Tensor): Scores of boxes with the shape of [N]. thresh (float): Overlap threshold of NMS. pre_max_size (int, optional): Max size of boxes before NMS. Default: None. post_max_size (int, optional): Max size of boxes after NMS. Default: None. Returns: torch.Tensor: Indexes after NMS. """ assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]' order = scores.sort(0, descending=True)[1] if pre_max_size is not None: order = order[:pre_max_size] boxes = boxes[order].contiguous() keep = torch.zeros(boxes.size(0), dtype=torch.long) num_out = ext_module.iou3d_nms_forward(boxes, keep, thresh) keep = order[keep[:num_out].cuda(boxes.device)].contiguous() if post_max_size is not None: keep = keep[:post_max_size] return keep def nms_normal_bev(boxes, scores, thresh): """Normal NMS function GPU implementation (for BEV boxes). The overlap of two boxes for IoU calculation is defined as the exact overlapping area of the two boxes WITH their yaw angle set to 0. Args: boxes (torch.Tensor): Input boxes with shape (N, 5). scores (torch.Tensor): Scores of predicted boxes with shape (N). thresh (float): Overlap threshold of NMS. Returns: torch.Tensor: Remaining indices with scores in descending order. """ assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]' order = scores.sort(0, descending=True)[1] boxes = boxes[order].contiguous() keep = torch.zeros(boxes.size(0), dtype=torch.long) num_out = ext_module.iou3d_nms_normal_forward(boxes, keep, thresh) return order[keep[:num_out].cuda(boxes.device)].contiguous() ================================================ FILE: annotator/mmpkg/mmcv/ops/knn.py ================================================ import torch from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['knn_forward']) class KNN(Function): r"""KNN (CUDA) based on heap data structure. Modified from `PAConv `_. Find k-nearest points. """ @staticmethod def forward(ctx, k: int, xyz: torch.Tensor, center_xyz: torch.Tensor = None, transposed: bool = False) -> torch.Tensor: """ Args: k (int): number of nearest neighbors. xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N). xyz coordinates of the features. center_xyz (Tensor, optional): (B, npoint, 3) if transposed == False, else (B, 3, npoint). centers of the knn query. Default: None. transposed (bool, optional): whether the input tensors are transposed. Should not explicitly use this keyword when calling knn (=KNN.apply), just add the fourth param. Default: False. Returns: Tensor: (B, k, npoint) tensor with the indices of the features that form k-nearest neighbours. """ assert (k > 0) & (k < 100), 'k should be in range(0, 100)' if center_xyz is None: center_xyz = xyz if transposed: xyz = xyz.transpose(2, 1).contiguous() center_xyz = center_xyz.transpose(2, 1).contiguous() assert xyz.is_contiguous() # [B, N, 3] assert center_xyz.is_contiguous() # [B, npoint, 3] center_xyz_device = center_xyz.get_device() assert center_xyz_device == xyz.get_device(), \ 'center_xyz and xyz should be put on the same device' if torch.cuda.current_device() != center_xyz_device: torch.cuda.set_device(center_xyz_device) B, npoint, _ = center_xyz.shape N = xyz.shape[1] idx = center_xyz.new_zeros((B, npoint, k)).int() dist2 = center_xyz.new_zeros((B, npoint, k)).float() ext_module.knn_forward( xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k) # idx shape to [B, k, npoint] idx = idx.transpose(2, 1).contiguous() if torch.__version__ != 'parrots': ctx.mark_non_differentiable(idx) return idx @staticmethod def backward(ctx, a=None): return None, None, None knn = KNN.apply ================================================ FILE: annotator/mmpkg/mmcv/ops/masked_conv.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math import torch import torch.nn as nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['masked_im2col_forward', 'masked_col2im_forward']) class MaskedConv2dFunction(Function): @staticmethod def symbolic(g, features, mask, weight, bias, padding, stride): return g.op( 'mmcv::MMCVMaskedConv2d', features, mask, weight, bias, padding_i=padding, stride_i=stride) @staticmethod def forward(ctx, features, mask, weight, bias, padding=0, stride=1): assert mask.dim() == 3 and mask.size(0) == 1 assert features.dim() == 4 and features.size(0) == 1 assert features.size()[2:] == mask.size()[1:] pad_h, pad_w = _pair(padding) stride_h, stride_w = _pair(stride) if stride_h != 1 or stride_w != 1: raise ValueError( 'Stride could not only be 1 in masked_conv2d currently.') out_channel, in_channel, kernel_h, kernel_w = weight.size() batch_size = features.size(0) out_h = int( math.floor((features.size(2) + 2 * pad_h - (kernel_h - 1) - 1) / stride_h + 1)) out_w = int( math.floor((features.size(3) + 2 * pad_w - (kernel_h - 1) - 1) / stride_w + 1)) mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False) output = features.new_zeros(batch_size, out_channel, out_h, out_w) if mask_inds.numel() > 0: mask_h_idx = mask_inds[:, 0].contiguous() mask_w_idx = mask_inds[:, 1].contiguous() data_col = features.new_zeros(in_channel * kernel_h * kernel_w, mask_inds.size(0)) ext_module.masked_im2col_forward( features, mask_h_idx, mask_w_idx, data_col, kernel_h=kernel_h, kernel_w=kernel_w, pad_h=pad_h, pad_w=pad_w) masked_output = torch.addmm(1, bias[:, None], 1, weight.view(out_channel, -1), data_col) ext_module.masked_col2im_forward( masked_output, mask_h_idx, mask_w_idx, output, height=out_h, width=out_w, channels=out_channel) return output @staticmethod @once_differentiable def backward(ctx, grad_output): return (None, ) * 5 masked_conv2d = MaskedConv2dFunction.apply class MaskedConv2d(nn.Conv2d): """A MaskedConv2d which inherits the official Conv2d. The masked forward doesn't implement the backward function and only supports the stride parameter to be 1 currently. """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True): super(MaskedConv2d, self).__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) def forward(self, input, mask=None): if mask is None: # fallback to the normal Conv2d return super(MaskedConv2d, self).forward(input) else: return masked_conv2d(input, mask, self.weight, self.bias, self.padding) ================================================ FILE: annotator/mmpkg/mmcv/ops/merge_cells.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from abc import abstractmethod import torch import torch.nn as nn import torch.nn.functional as F from ..cnn import ConvModule class BaseMergeCell(nn.Module): """The basic class for cells used in NAS-FPN and NAS-FCOS. BaseMergeCell takes 2 inputs. After applying convolution on them, they are resized to the target size. Then, they go through binary_op, which depends on the type of cell. If with_out_conv is True, the result of output will go through another convolution layer. Args: in_channels (int): number of input channels in out_conv layer. out_channels (int): number of output channels in out_conv layer. with_out_conv (bool): Whether to use out_conv layer out_conv_cfg (dict): Config dict for convolution layer, which should contain "groups", "kernel_size", "padding", "bias" to build out_conv layer. out_norm_cfg (dict): Config dict for normalization layer in out_conv. out_conv_order (tuple): The order of conv/norm/activation layers in out_conv. with_input1_conv (bool): Whether to use convolution on input1. with_input2_conv (bool): Whether to use convolution on input2. input_conv_cfg (dict): Config dict for building input1_conv layer and input2_conv layer, which is expected to contain the type of convolution. Default: None, which means using conv2d. input_norm_cfg (dict): Config dict for normalization layer in input1_conv and input2_conv layer. Default: None. upsample_mode (str): Interpolation method used to resize the output of input1_conv and input2_conv to target size. Currently, we support ['nearest', 'bilinear']. Default: 'nearest'. """ def __init__(self, fused_channels=256, out_channels=256, with_out_conv=True, out_conv_cfg=dict( groups=1, kernel_size=3, padding=1, bias=True), out_norm_cfg=None, out_conv_order=('act', 'conv', 'norm'), with_input1_conv=False, with_input2_conv=False, input_conv_cfg=None, input_norm_cfg=None, upsample_mode='nearest'): super(BaseMergeCell, self).__init__() assert upsample_mode in ['nearest', 'bilinear'] self.with_out_conv = with_out_conv self.with_input1_conv = with_input1_conv self.with_input2_conv = with_input2_conv self.upsample_mode = upsample_mode if self.with_out_conv: self.out_conv = ConvModule( fused_channels, out_channels, **out_conv_cfg, norm_cfg=out_norm_cfg, order=out_conv_order) self.input1_conv = self._build_input_conv( out_channels, input_conv_cfg, input_norm_cfg) if with_input1_conv else nn.Sequential() self.input2_conv = self._build_input_conv( out_channels, input_conv_cfg, input_norm_cfg) if with_input2_conv else nn.Sequential() def _build_input_conv(self, channel, conv_cfg, norm_cfg): return ConvModule( channel, channel, 3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True) @abstractmethod def _binary_op(self, x1, x2): pass def _resize(self, x, size): if x.shape[-2:] == size: return x elif x.shape[-2:] < size: return F.interpolate(x, size=size, mode=self.upsample_mode) else: assert x.shape[-2] % size[-2] == 0 and x.shape[-1] % size[-1] == 0 kernel_size = x.shape[-1] // size[-1] x = F.max_pool2d(x, kernel_size=kernel_size, stride=kernel_size) return x def forward(self, x1, x2, out_size=None): assert x1.shape[:2] == x2.shape[:2] assert out_size is None or len(out_size) == 2 if out_size is None: # resize to larger one out_size = max(x1.size()[2:], x2.size()[2:]) x1 = self.input1_conv(x1) x2 = self.input2_conv(x2) x1 = self._resize(x1, out_size) x2 = self._resize(x2, out_size) x = self._binary_op(x1, x2) if self.with_out_conv: x = self.out_conv(x) return x class SumCell(BaseMergeCell): def __init__(self, in_channels, out_channels, **kwargs): super(SumCell, self).__init__(in_channels, out_channels, **kwargs) def _binary_op(self, x1, x2): return x1 + x2 class ConcatCell(BaseMergeCell): def __init__(self, in_channels, out_channels, **kwargs): super(ConcatCell, self).__init__(in_channels * 2, out_channels, **kwargs) def _binary_op(self, x1, x2): ret = torch.cat([x1, x2], dim=1) return ret class GlobalPoolingCell(BaseMergeCell): def __init__(self, in_channels=None, out_channels=None, **kwargs): super().__init__(in_channels, out_channels, **kwargs) self.global_pool = nn.AdaptiveAvgPool2d((1, 1)) def _binary_op(self, x1, x2): x2_att = self.global_pool(x2).sigmoid() return x2 + x2_att * x1 ================================================ FILE: annotator/mmpkg/mmcv/ops/modulated_deform_conv.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math import torch import torch.nn as nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair, _single from annotator.mmpkg.mmcv.utils import deprecated_api_warning from ..cnn import CONV_LAYERS from ..utils import ext_loader, print_log ext_module = ext_loader.load_ext( '_ext', ['modulated_deform_conv_forward', 'modulated_deform_conv_backward']) class ModulatedDeformConv2dFunction(Function): @staticmethod def symbolic(g, input, offset, mask, weight, bias, stride, padding, dilation, groups, deform_groups): input_tensors = [input, offset, mask, weight] if bias is not None: input_tensors.append(bias) return g.op( 'mmcv::MMCVModulatedDeformConv2d', *input_tensors, stride_i=stride, padding_i=padding, dilation_i=dilation, groups_i=groups, deform_groups_i=deform_groups) @staticmethod def forward(ctx, input, offset, mask, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, deform_groups=1): if input is not None and input.dim() != 4: raise ValueError( f'Expected 4D tensor as input, got {input.dim()}D tensor \ instead.') ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.groups = groups ctx.deform_groups = deform_groups ctx.with_bias = bias is not None if not ctx.with_bias: bias = input.new_empty(0) # fake tensor # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; # amp won't cast the type of model (float32), but "offset" is cast # to float16 by nn.Conv2d automatically, leading to the type # mismatch with input (when it is float32) or weight. # The flag for whether to use fp16 or amp is the type of "offset", # we cast weight and input to temporarily support fp16 and amp # whatever the pytorch version is. input = input.type_as(offset) weight = weight.type_as(input) ctx.save_for_backward(input, offset, mask, weight, bias) output = input.new_empty( ModulatedDeformConv2dFunction._output_size(ctx, input, weight)) ctx._bufs = [input.new_empty(0), input.new_empty(0)] ext_module.modulated_deform_conv_forward( input, weight, bias, ctx._bufs[0], offset, mask, output, ctx._bufs[1], kernel_h=weight.size(2), kernel_w=weight.size(3), stride_h=ctx.stride[0], stride_w=ctx.stride[1], pad_h=ctx.padding[0], pad_w=ctx.padding[1], dilation_h=ctx.dilation[0], dilation_w=ctx.dilation[1], group=ctx.groups, deformable_group=ctx.deform_groups, with_bias=ctx.with_bias) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, offset, mask, weight, bias = ctx.saved_tensors grad_input = torch.zeros_like(input) grad_offset = torch.zeros_like(offset) grad_mask = torch.zeros_like(mask) grad_weight = torch.zeros_like(weight) grad_bias = torch.zeros_like(bias) grad_output = grad_output.contiguous() ext_module.modulated_deform_conv_backward( input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], grad_input, grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h=weight.size(2), kernel_w=weight.size(3), stride_h=ctx.stride[0], stride_w=ctx.stride[1], pad_h=ctx.padding[0], pad_w=ctx.padding[1], dilation_h=ctx.dilation[0], dilation_w=ctx.dilation[1], group=ctx.groups, deformable_group=ctx.deform_groups, with_bias=ctx.with_bias) if not ctx.with_bias: grad_bias = None return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, None, None, None, None, None) @staticmethod def _output_size(ctx, input, weight): channels = weight.size(0) output_size = (input.size(0), channels) for d in range(input.dim() - 2): in_size = input.size(d + 2) pad = ctx.padding[d] kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 stride_ = ctx.stride[d] output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) if not all(map(lambda s: s > 0, output_size)): raise ValueError( 'convolution input is too small (output would be ' + 'x'.join(map(str, output_size)) + ')') return output_size modulated_deform_conv2d = ModulatedDeformConv2dFunction.apply class ModulatedDeformConv2d(nn.Module): @deprecated_api_warning({'deformable_groups': 'deform_groups'}, cls_name='ModulatedDeformConv2d') def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deform_groups=1, bias=True): super(ModulatedDeformConv2d, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = _pair(stride) self.padding = _pair(padding) self.dilation = _pair(dilation) self.groups = groups self.deform_groups = deform_groups # enable compatibility with nn.Conv2d self.transposed = False self.output_padding = _single(0) self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)) if bias: self.bias = nn.Parameter(torch.Tensor(out_channels)) else: self.register_parameter('bias', None) self.init_weights() def init_weights(self): n = self.in_channels for k in self.kernel_size: n *= k stdv = 1. / math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) if self.bias is not None: self.bias.data.zero_() def forward(self, x, offset, mask): return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.deform_groups) @CONV_LAYERS.register_module('DCNv2') class ModulatedDeformConv2dPack(ModulatedDeformConv2d): """A ModulatedDeformable Conv Encapsulation that acts as normal Conv layers. Args: in_channels (int): Same as nn.Conv2d. out_channels (int): Same as nn.Conv2d. kernel_size (int or tuple[int]): Same as nn.Conv2d. stride (int): Same as nn.Conv2d, while tuple is not supported. padding (int): Same as nn.Conv2d, while tuple is not supported. dilation (int): Same as nn.Conv2d, while tuple is not supported. groups (int): Same as nn.Conv2d. bias (bool or str): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if norm_cfg is None, otherwise False. """ _version = 2 def __init__(self, *args, **kwargs): super(ModulatedDeformConv2dPack, self).__init__(*args, **kwargs) self.conv_offset = nn.Conv2d( self.in_channels, self.deform_groups * 3 * self.kernel_size[0] * self.kernel_size[1], kernel_size=self.kernel_size, stride=self.stride, padding=self.padding, dilation=self.dilation, bias=True) self.init_weights() def init_weights(self): super(ModulatedDeformConv2dPack, self).init_weights() if hasattr(self, 'conv_offset'): self.conv_offset.weight.data.zero_() self.conv_offset.bias.data.zero_() def forward(self, x): out = self.conv_offset(x) o1, o2, mask = torch.chunk(out, 3, dim=1) offset = torch.cat((o1, o2), dim=1) mask = torch.sigmoid(mask) return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.deform_groups) def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): version = local_metadata.get('version', None) if version is None or version < 2: # the key is different in early versions # In version < 2, ModulatedDeformConvPack # loads previous benchmark models. if (prefix + 'conv_offset.weight' not in state_dict and prefix[:-1] + '_offset.weight' in state_dict): state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( prefix[:-1] + '_offset.weight') if (prefix + 'conv_offset.bias' not in state_dict and prefix[:-1] + '_offset.bias' in state_dict): state_dict[prefix + 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + '_offset.bias') if version is not None and version > 1: print_log( f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to ' 'version 2.', logger='root') super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) ================================================ FILE: annotator/mmpkg/mmcv/ops/multi_scale_deform_attn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math import warnings import torch import torch.nn as nn import torch.nn.functional as F from torch.autograd.function import Function, once_differentiable from annotator.mmpkg.mmcv import deprecated_api_warning from annotator.mmpkg.mmcv.cnn import constant_init, xavier_init from annotator.mmpkg.mmcv.cnn.bricks.registry import ATTENTION from annotator.mmpkg.mmcv.runner import BaseModule from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) class MultiScaleDeformableAttnFunction(Function): @staticmethod def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. """ value, value_spatial_shapes, value_level_start_index,\ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): """CPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), Returns: Tensor: has shape (bs, num_queries, embed_dims) """ bs, _, num_heads, embed_dims = value.shape _, num_queries, num_heads, num_levels, num_points, _ =\ sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level, (H_, W_) in enumerate(value_spatial_shapes): # bs, H_*W_, num_heads, embed_dims -> # bs, H_*W_, num_heads*embed_dims -> # bs, num_heads*embed_dims, H_*W_ -> # bs*num_heads, embed_dims, H_, W_ value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( bs * num_heads, embed_dims, H_, W_) # bs, num_queries, num_heads, num_points, 2 -> # bs, num_heads, num_queries, num_points, 2 -> # bs*num_heads, num_queries, num_points, 2 sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) # bs*num_heads, embed_dims, num_queries, num_points sampling_value_l_ = F.grid_sample( value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (bs, num_queries, num_heads, num_levels, num_points) -> # (bs, num_heads, num_queries, num_levels, num_points) -> # (bs, num_heads, 1, num_queries, num_levels*num_points) attention_weights = attention_weights.transpose(1, 2).reshape( bs * num_heads, 1, num_queries, num_levels * num_points) output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(bs, num_heads * embed_dims, num_queries) return output.transpose(1, 2).contiguous() @ATTENTION.register_module() class MultiScaleDeformableAttention(BaseModule): """An attention module used in Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, im2col_step=64, dropout=0.1, batch_first=False, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiScaleDeformableAttention') def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) sampling_offsets = self.sampling_offsets(query).view( bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) if reference_points.shape[-1] == 2: offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets \ / offset_normalizer[None, None, None, :, None, :] elif reference_points.shape[-1] == 4: sampling_locations = reference_points[:, :, None, :, None, :2] \ + sampling_offsets / self.num_points \ * reference_points[:, :, None, :, None, 2:] \ * 0.5 else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') if torch.cuda.is_available() and value.is_cuda: output = MultiScaleDeformableAttnFunction.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) if not self.batch_first: # (num_query, bs ,embed_dims) output = output.permute(1, 0, 2) return self.dropout(output) + identity ================================================ FILE: annotator/mmpkg/mmcv/ops/nms.py ================================================ import os import numpy as np import torch from annotator.mmpkg.mmcv.utils import deprecated_api_warning from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['nms', 'softnms', 'nms_match', 'nms_rotated']) # This function is modified from: https://github.com/pytorch/vision/ class NMSop(torch.autograd.Function): @staticmethod def forward(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num): is_filtering_by_score = score_threshold > 0 if is_filtering_by_score: valid_mask = scores > score_threshold bboxes, scores = bboxes[valid_mask], scores[valid_mask] valid_inds = torch.nonzero( valid_mask, as_tuple=False).squeeze(dim=1) inds = ext_module.nms( bboxes, scores, iou_threshold=float(iou_threshold), offset=offset) if max_num > 0: inds = inds[:max_num] if is_filtering_by_score: inds = valid_inds[inds] return inds @staticmethod def symbolic(g, bboxes, scores, iou_threshold, offset, score_threshold, max_num): from ..onnx import is_custom_op_loaded has_custom_op = is_custom_op_loaded() # TensorRT nms plugin is aligned with original nms in ONNXRuntime is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT' if has_custom_op and (not is_trt_backend): return g.op( 'mmcv::NonMaxSuppression', bboxes, scores, iou_threshold_f=float(iou_threshold), offset_i=int(offset)) else: from torch.onnx.symbolic_opset9 import select, squeeze, unsqueeze from ..onnx.onnx_utils.symbolic_helper import _size_helper boxes = unsqueeze(g, bboxes, 0) scores = unsqueeze(g, unsqueeze(g, scores, 0), 0) if max_num > 0: max_num = g.op( 'Constant', value_t=torch.tensor(max_num, dtype=torch.long)) else: dim = g.op('Constant', value_t=torch.tensor(0)) max_num = _size_helper(g, bboxes, dim) max_output_per_class = max_num iou_threshold = g.op( 'Constant', value_t=torch.tensor([iou_threshold], dtype=torch.float)) score_threshold = g.op( 'Constant', value_t=torch.tensor([score_threshold], dtype=torch.float)) nms_out = g.op('NonMaxSuppression', boxes, scores, max_output_per_class, iou_threshold, score_threshold) return squeeze( g, select( g, nms_out, 1, g.op( 'Constant', value_t=torch.tensor([2], dtype=torch.long))), 1) class SoftNMSop(torch.autograd.Function): @staticmethod def forward(ctx, boxes, scores, iou_threshold, sigma, min_score, method, offset): dets = boxes.new_empty((boxes.size(0), 5), device='cpu') inds = ext_module.softnms( boxes.cpu(), scores.cpu(), dets.cpu(), iou_threshold=float(iou_threshold), sigma=float(sigma), min_score=float(min_score), method=int(method), offset=int(offset)) return dets, inds @staticmethod def symbolic(g, boxes, scores, iou_threshold, sigma, min_score, method, offset): from packaging import version assert version.parse(torch.__version__) >= version.parse('1.7.0') nms_out = g.op( 'mmcv::SoftNonMaxSuppression', boxes, scores, iou_threshold_f=float(iou_threshold), sigma_f=float(sigma), min_score_f=float(min_score), method_i=int(method), offset_i=int(offset), outputs=2) return nms_out @deprecated_api_warning({'iou_thr': 'iou_threshold'}) def nms(boxes, scores, iou_threshold, offset=0, score_threshold=0, max_num=-1): """Dispatch to either CPU or GPU NMS implementations. The input can be either torch tensor or numpy array. GPU NMS will be used if the input is gpu tensor, otherwise CPU NMS will be used. The returned type will always be the same as inputs. Arguments: boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4). scores (torch.Tensor or np.ndarray): scores in shape (N, ). iou_threshold (float): IoU threshold for NMS. offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). score_threshold (float): score threshold for NMS. max_num (int): maximum number of boxes after NMS. Returns: tuple: kept dets(boxes and scores) and indice, which is always the \ same data type as the input. Example: >>> boxes = np.array([[49.1, 32.4, 51.0, 35.9], >>> [49.3, 32.9, 51.0, 35.3], >>> [49.2, 31.8, 51.0, 35.4], >>> [35.1, 11.5, 39.1, 15.7], >>> [35.6, 11.8, 39.3, 14.2], >>> [35.3, 11.5, 39.9, 14.5], >>> [35.2, 11.7, 39.7, 15.7]], dtype=np.float32) >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],\ dtype=np.float32) >>> iou_threshold = 0.6 >>> dets, inds = nms(boxes, scores, iou_threshold) >>> assert len(inds) == len(dets) == 3 """ assert isinstance(boxes, (torch.Tensor, np.ndarray)) assert isinstance(scores, (torch.Tensor, np.ndarray)) is_numpy = False if isinstance(boxes, np.ndarray): is_numpy = True boxes = torch.from_numpy(boxes) if isinstance(scores, np.ndarray): scores = torch.from_numpy(scores) assert boxes.size(1) == 4 assert boxes.size(0) == scores.size(0) assert offset in (0, 1) if torch.__version__ == 'parrots': indata_list = [boxes, scores] indata_dict = { 'iou_threshold': float(iou_threshold), 'offset': int(offset) } inds = ext_module.nms(*indata_list, **indata_dict) else: inds = NMSop.apply(boxes, scores, iou_threshold, offset, score_threshold, max_num) dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1) if is_numpy: dets = dets.cpu().numpy() inds = inds.cpu().numpy() return dets, inds @deprecated_api_warning({'iou_thr': 'iou_threshold'}) def soft_nms(boxes, scores, iou_threshold=0.3, sigma=0.5, min_score=1e-3, method='linear', offset=0): """Dispatch to only CPU Soft NMS implementations. The input can be either a torch tensor or numpy array. The returned type will always be the same as inputs. Arguments: boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4). scores (torch.Tensor or np.ndarray): scores in shape (N, ). iou_threshold (float): IoU threshold for NMS. sigma (float): hyperparameter for gaussian method min_score (float): score filter threshold method (str): either 'linear' or 'gaussian' offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). Returns: tuple: kept dets(boxes and scores) and indice, which is always the \ same data type as the input. Example: >>> boxes = np.array([[4., 3., 5., 3.], >>> [4., 3., 5., 4.], >>> [3., 1., 3., 1.], >>> [3., 1., 3., 1.], >>> [3., 1., 3., 1.], >>> [3., 1., 3., 1.]], dtype=np.float32) >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0], dtype=np.float32) >>> iou_threshold = 0.6 >>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5) >>> assert len(inds) == len(dets) == 5 """ assert isinstance(boxes, (torch.Tensor, np.ndarray)) assert isinstance(scores, (torch.Tensor, np.ndarray)) is_numpy = False if isinstance(boxes, np.ndarray): is_numpy = True boxes = torch.from_numpy(boxes) if isinstance(scores, np.ndarray): scores = torch.from_numpy(scores) assert boxes.size(1) == 4 assert boxes.size(0) == scores.size(0) assert offset in (0, 1) method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2} assert method in method_dict.keys() if torch.__version__ == 'parrots': dets = boxes.new_empty((boxes.size(0), 5), device='cpu') indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()] indata_dict = { 'iou_threshold': float(iou_threshold), 'sigma': float(sigma), 'min_score': min_score, 'method': method_dict[method], 'offset': int(offset) } inds = ext_module.softnms(*indata_list, **indata_dict) else: dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(), float(iou_threshold), float(sigma), float(min_score), method_dict[method], int(offset)) dets = dets[:inds.size(0)] if is_numpy: dets = dets.cpu().numpy() inds = inds.cpu().numpy() return dets, inds else: return dets.to(device=boxes.device), inds.to(device=boxes.device) def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False): """Performs non-maximum suppression in a batched fashion. Modified from https://github.com/pytorch/vision/blob /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39. In order to perform NMS independently per class, we add an offset to all the boxes. The offset is dependent only on the class idx, and is large enough so that boxes from different classes do not overlap. Arguments: boxes (torch.Tensor): boxes in shape (N, 4). scores (torch.Tensor): scores in shape (N, ). idxs (torch.Tensor): each index value correspond to a bbox cluster, and NMS will not be applied between elements of different idxs, shape (N, ). nms_cfg (dict): specify nms type and other parameters like iou_thr. Possible keys includes the following. - iou_thr (float): IoU threshold used for NMS. - split_thr (float): threshold number of boxes. In some cases the number of boxes is large (e.g., 200k). To avoid OOM during training, the users could set `split_thr` to a small value. If the number of boxes is greater than the threshold, it will perform NMS on each group of boxes separately and sequentially. Defaults to 10000. class_agnostic (bool): if true, nms is class agnostic, i.e. IoU thresholding happens over all boxes, regardless of the predicted class. Returns: tuple: kept dets and indice. """ nms_cfg_ = nms_cfg.copy() class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic) if class_agnostic: boxes_for_nms = boxes else: max_coordinate = boxes.max() offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes)) boxes_for_nms = boxes + offsets[:, None] nms_type = nms_cfg_.pop('type', 'nms') nms_op = eval(nms_type) split_thr = nms_cfg_.pop('split_thr', 10000) # Won't split to multiple nms nodes when exporting to onnx if boxes_for_nms.shape[0] < split_thr or torch.onnx.is_in_onnx_export(): dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_) boxes = boxes[keep] # -1 indexing works abnormal in TensorRT # This assumes `dets` has 5 dimensions where # the last dimension is score. # TODO: more elegant way to handle the dimension issue. # Some type of nms would reweight the score, such as SoftNMS scores = dets[:, 4] else: max_num = nms_cfg_.pop('max_num', -1) total_mask = scores.new_zeros(scores.size(), dtype=torch.bool) # Some type of nms would reweight the score, such as SoftNMS scores_after_nms = scores.new_zeros(scores.size()) for id in torch.unique(idxs): mask = (idxs == id).nonzero(as_tuple=False).view(-1) dets, keep = nms_op(boxes_for_nms[mask], scores[mask], **nms_cfg_) total_mask[mask[keep]] = True scores_after_nms[mask[keep]] = dets[:, -1] keep = total_mask.nonzero(as_tuple=False).view(-1) scores, inds = scores_after_nms[keep].sort(descending=True) keep = keep[inds] boxes = boxes[keep] if max_num > 0: keep = keep[:max_num] boxes = boxes[:max_num] scores = scores[:max_num] return torch.cat([boxes, scores[:, None]], -1), keep def nms_match(dets, iou_threshold): """Matched dets into different groups by NMS. NMS match is Similar to NMS but when a bbox is suppressed, nms match will record the indice of suppressed bbox and form a group with the indice of kept bbox. In each group, indice is sorted as score order. Arguments: dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5). iou_thr (float): IoU thresh for NMS. Returns: List[torch.Tensor | np.ndarray]: The outer list corresponds different matched group, the inner Tensor corresponds the indices for a group in score order. """ if dets.shape[0] == 0: matched = [] else: assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \ f'but get {dets.shape}' if isinstance(dets, torch.Tensor): dets_t = dets.detach().cpu() else: dets_t = torch.from_numpy(dets) indata_list = [dets_t] indata_dict = {'iou_threshold': float(iou_threshold)} matched = ext_module.nms_match(*indata_list, **indata_dict) if torch.__version__ == 'parrots': matched = matched.tolist() if isinstance(dets, torch.Tensor): return [dets.new_tensor(m, dtype=torch.long) for m in matched] else: return [np.array(m, dtype=np.int) for m in matched] def nms_rotated(dets, scores, iou_threshold, labels=None): """Performs non-maximum suppression (NMS) on the rotated boxes according to their intersection-over-union (IoU). Rotated NMS iteratively removes lower scoring rotated boxes which have an IoU greater than iou_threshold with another (higher scoring) rotated box. Args: boxes (Tensor): Rotated boxes in shape (N, 5). They are expected to \ be in (x_ctr, y_ctr, width, height, angle_radian) format. scores (Tensor): scores in shape (N, ). iou_threshold (float): IoU thresh for NMS. labels (Tensor): boxes' label in shape (N,). Returns: tuple: kept dets(boxes and scores) and indice, which is always the \ same data type as the input. """ if dets.shape[0] == 0: return dets, None multi_label = labels is not None if multi_label: dets_wl = torch.cat((dets, labels.unsqueeze(1)), 1) else: dets_wl = dets _, order = scores.sort(0, descending=True) dets_sorted = dets_wl.index_select(0, order) if torch.__version__ == 'parrots': keep_inds = ext_module.nms_rotated( dets_wl, scores, order, dets_sorted, iou_threshold=iou_threshold, multi_label=multi_label) else: keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted, iou_threshold, multi_label) dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)), dim=1) return dets, keep_inds ================================================ FILE: annotator/mmpkg/mmcv/ops/pixel_group.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['pixel_group']) def pixel_group(score, mask, embedding, kernel_label, kernel_contour, kernel_region_num, distance_threshold): """Group pixels into text instances, which is widely used text detection methods. Arguments: score (np.array or Tensor): The foreground score with size hxw. mask (np.array or Tensor): The foreground mask with size hxw. embedding (np.array or Tensor): The embedding with size hxwxc to distinguish instances. kernel_label (np.array or Tensor): The instance kernel index with size hxw. kernel_contour (np.array or Tensor): The kernel contour with size hxw. kernel_region_num (int): The instance kernel region number. distance_threshold (float): The embedding distance threshold between kernel and pixel in one instance. Returns: pixel_assignment (List[List[float]]): The instance coordinate list. Each element consists of averaged confidence, pixel number, and coordinates (x_i, y_i for all pixels) in order. """ assert isinstance(score, (torch.Tensor, np.ndarray)) assert isinstance(mask, (torch.Tensor, np.ndarray)) assert isinstance(embedding, (torch.Tensor, np.ndarray)) assert isinstance(kernel_label, (torch.Tensor, np.ndarray)) assert isinstance(kernel_contour, (torch.Tensor, np.ndarray)) assert isinstance(kernel_region_num, int) assert isinstance(distance_threshold, float) if isinstance(score, np.ndarray): score = torch.from_numpy(score) if isinstance(mask, np.ndarray): mask = torch.from_numpy(mask) if isinstance(embedding, np.ndarray): embedding = torch.from_numpy(embedding) if isinstance(kernel_label, np.ndarray): kernel_label = torch.from_numpy(kernel_label) if isinstance(kernel_contour, np.ndarray): kernel_contour = torch.from_numpy(kernel_contour) if torch.__version__ == 'parrots': label = ext_module.pixel_group( score, mask, embedding, kernel_label, kernel_contour, kernel_region_num=kernel_region_num, distance_threshold=distance_threshold) label = label.tolist() label = label[0] list_index = kernel_region_num pixel_assignment = [] for x in range(kernel_region_num): pixel_assignment.append( np.array( label[list_index:list_index + int(label[x])], dtype=np.float)) list_index = list_index + int(label[x]) else: pixel_assignment = ext_module.pixel_group(score, mask, embedding, kernel_label, kernel_contour, kernel_region_num, distance_threshold) return pixel_assignment ================================================ FILE: annotator/mmpkg/mmcv/ops/point_sample.py ================================================ # Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend # noqa from os import path as osp import torch import torch.nn as nn import torch.nn.functional as F from torch.nn.modules.utils import _pair from torch.onnx.operators import shape_as_tensor def bilinear_grid_sample(im, grid, align_corners=False): """Given an input and a flow-field grid, computes the output using input values and pixel locations from grid. Supported only bilinear interpolation method to sample the input pixels. Args: im (torch.Tensor): Input feature map, shape (N, C, H, W) grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2) align_corners {bool}: If set to True, the extrema (-1 and 1) are considered as referring to the center points of the input’s corner pixels. If set to False, they are instead considered as referring to the corner points of the input’s corner pixels, making the sampling more resolution agnostic. Returns: torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg) """ n, c, h, w = im.shape gn, gh, gw, _ = grid.shape assert n == gn x = grid[:, :, :, 0] y = grid[:, :, :, 1] if align_corners: x = ((x + 1) / 2) * (w - 1) y = ((y + 1) / 2) * (h - 1) else: x = ((x + 1) * w - 1) / 2 y = ((y + 1) * h - 1) / 2 x = x.view(n, -1) y = y.view(n, -1) x0 = torch.floor(x).long() y0 = torch.floor(y).long() x1 = x0 + 1 y1 = y0 + 1 wa = ((x1 - x) * (y1 - y)).unsqueeze(1) wb = ((x1 - x) * (y - y0)).unsqueeze(1) wc = ((x - x0) * (y1 - y)).unsqueeze(1) wd = ((x - x0) * (y - y0)).unsqueeze(1) # Apply default for grid_sample function zero padding im_padded = F.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0) padded_h = h + 2 padded_w = w + 2 # save points positions after padding x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1 # Clip coordinates to padded image size x0 = torch.where(x0 < 0, torch.tensor(0), x0) x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1), x0) x1 = torch.where(x1 < 0, torch.tensor(0), x1) x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1), x1) y0 = torch.where(y0 < 0, torch.tensor(0), y0) y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1), y0) y1 = torch.where(y1 < 0, torch.tensor(0), y1) y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1), y1) im_padded = im_padded.view(n, c, -1) x0_y0 = (x0 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1) x0_y1 = (x0 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1) x1_y0 = (x1 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1) x1_y1 = (x1 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1) Ia = torch.gather(im_padded, 2, x0_y0) Ib = torch.gather(im_padded, 2, x0_y1) Ic = torch.gather(im_padded, 2, x1_y0) Id = torch.gather(im_padded, 2, x1_y1) return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw) def is_in_onnx_export_without_custom_ops(): from annotator.mmpkg.mmcv.ops import get_onnxruntime_op_path ort_custom_op_path = get_onnxruntime_op_path() return torch.onnx.is_in_onnx_export( ) and not osp.exists(ort_custom_op_path) def normalize(grid): """Normalize input grid from [-1, 1] to [0, 1] Args: grid (Tensor): The grid to be normalize, range [-1, 1]. Returns: Tensor: Normalized grid, range [0, 1]. """ return (grid + 1.0) / 2.0 def denormalize(grid): """Denormalize input grid from range [0, 1] to [-1, 1] Args: grid (Tensor): The grid to be denormalize, range [0, 1]. Returns: Tensor: Denormalized grid, range [-1, 1]. """ return grid * 2.0 - 1.0 def generate_grid(num_grid, size, device): """Generate regular square grid of points in [0, 1] x [0, 1] coordinate space. Args: num_grid (int): The number of grids to sample, one for each region. size (tuple(int, int)): The side size of the regular grid. device (torch.device): Desired device of returned tensor. Returns: (torch.Tensor): A tensor of shape (num_grid, size[0]*size[1], 2) that contains coordinates for the regular grids. """ affine_trans = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device) grid = F.affine_grid( affine_trans, torch.Size((1, 1, *size)), align_corners=False) grid = normalize(grid) return grid.view(1, -1, 2).expand(num_grid, -1, -1) def rel_roi_point_to_abs_img_point(rois, rel_roi_points): """Convert roi based relative point coordinates to image based absolute point coordinates. Args: rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5) rel_roi_points (Tensor): Point coordinates inside RoI, relative to RoI, location, range (0, 1), shape (N, P, 2) Returns: Tensor: Image based absolute point coordinates, shape (N, P, 2) """ with torch.no_grad(): assert rel_roi_points.size(0) == rois.size(0) assert rois.dim() == 2 assert rel_roi_points.dim() == 3 assert rel_roi_points.size(2) == 2 # remove batch idx if rois.size(1) == 5: rois = rois[:, 1:] abs_img_points = rel_roi_points.clone() # To avoid an error during exporting to onnx use independent # variables instead inplace computation xs = abs_img_points[:, :, 0] * (rois[:, None, 2] - rois[:, None, 0]) ys = abs_img_points[:, :, 1] * (rois[:, None, 3] - rois[:, None, 1]) xs += rois[:, None, 0] ys += rois[:, None, 1] abs_img_points = torch.stack([xs, ys], dim=2) return abs_img_points def get_shape_from_feature_map(x): """Get spatial resolution of input feature map considering exporting to onnx mode. Args: x (torch.Tensor): Input tensor, shape (N, C, H, W) Returns: torch.Tensor: Spatial resolution (width, height), shape (1, 1, 2) """ if torch.onnx.is_in_onnx_export(): img_shape = shape_as_tensor(x)[2:].flip(0).view(1, 1, 2).to( x.device).float() else: img_shape = torch.tensor(x.shape[2:]).flip(0).view(1, 1, 2).to( x.device).float() return img_shape def abs_img_point_to_rel_img_point(abs_img_points, img, spatial_scale=1.): """Convert image based absolute point coordinates to image based relative coordinates for sampling. Args: abs_img_points (Tensor): Image based absolute point coordinates, shape (N, P, 2) img (tuple/Tensor): (height, width) of image or feature map. spatial_scale (float): Scale points by this factor. Default: 1. Returns: Tensor: Image based relative point coordinates for sampling, shape (N, P, 2) """ assert (isinstance(img, tuple) and len(img) == 2) or \ (isinstance(img, torch.Tensor) and len(img.shape) == 4) if isinstance(img, tuple): h, w = img scale = torch.tensor([w, h], dtype=torch.float, device=abs_img_points.device) scale = scale.view(1, 1, 2) else: scale = get_shape_from_feature_map(img) return abs_img_points / scale * spatial_scale def rel_roi_point_to_rel_img_point(rois, rel_roi_points, img, spatial_scale=1.): """Convert roi based relative point coordinates to image based absolute point coordinates. Args: rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5) rel_roi_points (Tensor): Point coordinates inside RoI, relative to RoI, location, range (0, 1), shape (N, P, 2) img (tuple/Tensor): (height, width) of image or feature map. spatial_scale (float): Scale points by this factor. Default: 1. Returns: Tensor: Image based relative point coordinates for sampling, shape (N, P, 2) """ abs_img_point = rel_roi_point_to_abs_img_point(rois, rel_roi_points) rel_img_point = abs_img_point_to_rel_img_point(abs_img_point, img, spatial_scale) return rel_img_point def point_sample(input, points, align_corners=False, **kwargs): """A wrapper around :func:`grid_sample` to support 3D point_coords tensors Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to lie inside ``[0, 1] x [0, 1]`` square. Args: input (Tensor): Feature map, shape (N, C, H, W). points (Tensor): Image based absolute point coordinates (normalized), range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2). align_corners (bool): Whether align_corners. Default: False Returns: Tensor: Features of `point` on `input`, shape (N, C, P) or (N, C, Hgrid, Wgrid). """ add_dim = False if points.dim() == 3: add_dim = True points = points.unsqueeze(2) if is_in_onnx_export_without_custom_ops(): # If custom ops for onnx runtime not compiled use python # implementation of grid_sample function to make onnx graph # with supported nodes output = bilinear_grid_sample( input, denormalize(points), align_corners=align_corners) else: output = F.grid_sample( input, denormalize(points), align_corners=align_corners, **kwargs) if add_dim: output = output.squeeze(3) return output class SimpleRoIAlign(nn.Module): def __init__(self, output_size, spatial_scale, aligned=True): """Simple RoI align in PointRend, faster than standard RoIAlign. Args: output_size (tuple[int]): h, w spatial_scale (float): scale the input boxes by this number aligned (bool): if False, use the legacy implementation in MMDetection, align_corners=True will be used in F.grid_sample. If True, align the results more perfectly. """ super(SimpleRoIAlign, self).__init__() self.output_size = _pair(output_size) self.spatial_scale = float(spatial_scale) # to be consistent with other RoI ops self.use_torchvision = False self.aligned = aligned def forward(self, features, rois): num_imgs = features.size(0) num_rois = rois.size(0) rel_roi_points = generate_grid( num_rois, self.output_size, device=rois.device) if torch.onnx.is_in_onnx_export(): rel_img_points = rel_roi_point_to_rel_img_point( rois, rel_roi_points, features, self.spatial_scale) rel_img_points = rel_img_points.reshape(num_imgs, -1, *rel_img_points.shape[1:]) point_feats = point_sample( features, rel_img_points, align_corners=not self.aligned) point_feats = point_feats.transpose(1, 2) else: point_feats = [] for batch_ind in range(num_imgs): # unravel batch dim feat = features[batch_ind].unsqueeze(0) inds = (rois[:, 0].long() == batch_ind) if inds.any(): rel_img_points = rel_roi_point_to_rel_img_point( rois[inds], rel_roi_points[inds], feat, self.spatial_scale).unsqueeze(0) point_feat = point_sample( feat, rel_img_points, align_corners=not self.aligned) point_feat = point_feat.squeeze(0).transpose(0, 1) point_feats.append(point_feat) point_feats = torch.cat(point_feats, dim=0) channels = features.size(1) roi_feats = point_feats.reshape(num_rois, channels, *self.output_size) return roi_feats def __repr__(self): format_str = self.__class__.__name__ format_str += '(output_size={}, spatial_scale={}'.format( self.output_size, self.spatial_scale) return format_str ================================================ FILE: annotator/mmpkg/mmcv/ops/points_in_boxes.py ================================================ import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ 'points_in_boxes_part_forward', 'points_in_boxes_cpu_forward', 'points_in_boxes_all_forward' ]) def points_in_boxes_part(points, boxes): """Find the box in which each point is (CUDA). Args: points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate boxes (torch.Tensor): [B, T, 7], num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in LiDAR/DEPTH coordinate, (x, y, z) is the bottom center Returns: box_idxs_of_pts (torch.Tensor): (B, M), default background = -1 """ assert points.shape[0] == boxes.shape[0], \ 'Points and boxes should have the same batch size, ' \ f'but got {points.shape[0]} and {boxes.shape[0]}' assert boxes.shape[2] == 7, \ 'boxes dimension should be 7, ' \ f'but got unexpected shape {boxes.shape[2]}' assert points.shape[2] == 3, \ 'points dimension should be 3, ' \ f'but got unexpected shape {points.shape[2]}' batch_size, num_points, _ = points.shape box_idxs_of_pts = points.new_zeros((batch_size, num_points), dtype=torch.int).fill_(-1) # If manually put the tensor 'points' or 'boxes' on a device # which is not the current device, some temporary variables # will be created on the current device in the cuda op, # and the output will be incorrect. # Therefore, we force the current device to be the same # as the device of the tensors if it was not. # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305 # for the incorrect output before the fix. points_device = points.get_device() assert points_device == boxes.get_device(), \ 'Points and boxes should be put on the same device' if torch.cuda.current_device() != points_device: torch.cuda.set_device(points_device) ext_module.points_in_boxes_part_forward(boxes.contiguous(), points.contiguous(), box_idxs_of_pts) return box_idxs_of_pts def points_in_boxes_cpu(points, boxes): """Find all boxes in which each point is (CPU). The CPU version of :meth:`points_in_boxes_all`. Args: points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate boxes (torch.Tensor): [B, T, 7], num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], (x, y, z) is the bottom center. Returns: box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0. """ assert points.shape[0] == boxes.shape[0], \ 'Points and boxes should have the same batch size, ' \ f'but got {points.shape[0]} and {boxes.shape[0]}' assert boxes.shape[2] == 7, \ 'boxes dimension should be 7, ' \ f'but got unexpected shape {boxes.shape[2]}' assert points.shape[2] == 3, \ 'points dimension should be 3, ' \ f'but got unexpected shape {points.shape[2]}' batch_size, num_points, _ = points.shape num_boxes = boxes.shape[1] point_indices = points.new_zeros((batch_size, num_boxes, num_points), dtype=torch.int) for b in range(batch_size): ext_module.points_in_boxes_cpu_forward(boxes[b].float().contiguous(), points[b].float().contiguous(), point_indices[b]) point_indices = point_indices.transpose(1, 2) return point_indices def points_in_boxes_all(points, boxes): """Find all boxes in which each point is (CUDA). Args: points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate boxes (torch.Tensor): [B, T, 7], num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], (x, y, z) is the bottom center. Returns: box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0. """ assert boxes.shape[0] == points.shape[0], \ 'Points and boxes should have the same batch size, ' \ f'but got {boxes.shape[0]} and {boxes.shape[0]}' assert boxes.shape[2] == 7, \ 'boxes dimension should be 7, ' \ f'but got unexpected shape {boxes.shape[2]}' assert points.shape[2] == 3, \ 'points dimension should be 3, ' \ f'but got unexpected shape {points.shape[2]}' batch_size, num_points, _ = points.shape num_boxes = boxes.shape[1] box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes), dtype=torch.int).fill_(0) # Same reason as line 25-32 points_device = points.get_device() assert points_device == boxes.get_device(), \ 'Points and boxes should be put on the same device' if torch.cuda.current_device() != points_device: torch.cuda.set_device(points_device) ext_module.points_in_boxes_all_forward(boxes.contiguous(), points.contiguous(), box_idxs_of_pts) return box_idxs_of_pts ================================================ FILE: annotator/mmpkg/mmcv/ops/points_sampler.py ================================================ from typing import List import torch from torch import nn as nn from annotator.mmpkg.mmcv.runner import force_fp32 from .furthest_point_sample import (furthest_point_sample, furthest_point_sample_with_dist) def calc_square_dist(point_feat_a, point_feat_b, norm=True): """Calculating square distance between a and b. Args: point_feat_a (Tensor): (B, N, C) Feature vector of each point. point_feat_b (Tensor): (B, M, C) Feature vector of each point. norm (Bool, optional): Whether to normalize the distance. Default: True. Returns: Tensor: (B, N, M) Distance between each pair points. """ num_channel = point_feat_a.shape[-1] # [bs, n, 1] a_square = torch.sum(point_feat_a.unsqueeze(dim=2).pow(2), dim=-1) # [bs, 1, m] b_square = torch.sum(point_feat_b.unsqueeze(dim=1).pow(2), dim=-1) corr_matrix = torch.matmul(point_feat_a, point_feat_b.transpose(1, 2)) dist = a_square + b_square - 2 * corr_matrix if norm: dist = torch.sqrt(dist) / num_channel return dist def get_sampler_cls(sampler_type): """Get the type and mode of points sampler. Args: sampler_type (str): The type of points sampler. The valid value are "D-FPS", "F-FPS", or "FS". Returns: class: Points sampler type. """ sampler_mappings = { 'D-FPS': DFPSSampler, 'F-FPS': FFPSSampler, 'FS': FSSampler, } try: return sampler_mappings[sampler_type] except KeyError: raise KeyError( f'Supported `sampler_type` are {sampler_mappings.keys()}, but got \ {sampler_type}') class PointsSampler(nn.Module): """Points sampling. Args: num_point (list[int]): Number of sample points. fps_mod_list (list[str], optional): Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. F-FPS: using feature distances for FPS. D-FPS: using Euclidean distances of points for FPS. FS: using F-FPS and D-FPS simultaneously. fps_sample_range_list (list[int], optional): Range of points to apply FPS. Default: [-1]. """ def __init__(self, num_point: List[int], fps_mod_list: List[str] = ['D-FPS'], fps_sample_range_list: List[int] = [-1]): super().__init__() # FPS would be applied to different fps_mod in the list, # so the length of the num_point should be equal to # fps_mod_list and fps_sample_range_list. assert len(num_point) == len(fps_mod_list) == len( fps_sample_range_list) self.num_point = num_point self.fps_sample_range_list = fps_sample_range_list self.samplers = nn.ModuleList() for fps_mod in fps_mod_list: self.samplers.append(get_sampler_cls(fps_mod)()) self.fp16_enabled = False @force_fp32() def forward(self, points_xyz, features): """ Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. features (Tensor): (B, C, N) Descriptors of the features. Returns: Tensor: (B, npoint, sample_num) Indices of sampled points. """ indices = [] last_fps_end_index = 0 for fps_sample_range, sampler, npoint in zip( self.fps_sample_range_list, self.samplers, self.num_point): assert fps_sample_range < points_xyz.shape[1] if fps_sample_range == -1: sample_points_xyz = points_xyz[:, last_fps_end_index:] if features is not None: sample_features = features[:, :, last_fps_end_index:] else: sample_features = None else: sample_points_xyz = \ points_xyz[:, last_fps_end_index:fps_sample_range] if features is not None: sample_features = features[:, :, last_fps_end_index: fps_sample_range] else: sample_features = None fps_idx = sampler(sample_points_xyz.contiguous(), sample_features, npoint) indices.append(fps_idx + last_fps_end_index) last_fps_end_index += fps_sample_range indices = torch.cat(indices, dim=1) return indices class DFPSSampler(nn.Module): """Using Euclidean distances of points for FPS.""" def __init__(self): super().__init__() def forward(self, points, features, npoint): """Sampling points with D-FPS.""" fps_idx = furthest_point_sample(points.contiguous(), npoint) return fps_idx class FFPSSampler(nn.Module): """Using feature distances for FPS.""" def __init__(self): super().__init__() def forward(self, points, features, npoint): """Sampling points with F-FPS.""" assert features is not None, \ 'feature input to FFPS_Sampler should not be None' features_for_fps = torch.cat([points, features.transpose(1, 2)], dim=2) features_dist = calc_square_dist( features_for_fps, features_for_fps, norm=False) fps_idx = furthest_point_sample_with_dist(features_dist, npoint) return fps_idx class FSSampler(nn.Module): """Using F-FPS and D-FPS simultaneously.""" def __init__(self): super().__init__() def forward(self, points, features, npoint): """Sampling points with FS_Sampling.""" assert features is not None, \ 'feature input to FS_Sampler should not be None' ffps_sampler = FFPSSampler() dfps_sampler = DFPSSampler() fps_idx_ffps = ffps_sampler(points, features, npoint) fps_idx_dfps = dfps_sampler(points, features, npoint) fps_idx = torch.cat([fps_idx_ffps, fps_idx_dfps], dim=1) return fps_idx ================================================ FILE: annotator/mmpkg/mmcv/ops/psa_mask.py ================================================ # Modified from https://github.com/hszhao/semseg/blob/master/lib/psa from torch import nn from torch.autograd import Function from torch.nn.modules.utils import _pair from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['psamask_forward', 'psamask_backward']) class PSAMaskFunction(Function): @staticmethod def symbolic(g, input, psa_type, mask_size): return g.op( 'mmcv::MMCVPSAMask', input, psa_type_i=psa_type, mask_size_i=mask_size) @staticmethod def forward(ctx, input, psa_type, mask_size): ctx.psa_type = psa_type ctx.mask_size = _pair(mask_size) ctx.save_for_backward(input) h_mask, w_mask = ctx.mask_size batch_size, channels, h_feature, w_feature = input.size() assert channels == h_mask * w_mask output = input.new_zeros( (batch_size, h_feature * w_feature, h_feature, w_feature)) ext_module.psamask_forward( input, output, psa_type=psa_type, num_=batch_size, h_feature=h_feature, w_feature=w_feature, h_mask=h_mask, w_mask=w_mask, half_h_mask=(h_mask - 1) // 2, half_w_mask=(w_mask - 1) // 2) return output @staticmethod def backward(ctx, grad_output): input = ctx.saved_tensors[0] psa_type = ctx.psa_type h_mask, w_mask = ctx.mask_size batch_size, channels, h_feature, w_feature = input.size() grad_input = grad_output.new_zeros( (batch_size, channels, h_feature, w_feature)) ext_module.psamask_backward( grad_output, grad_input, psa_type=psa_type, num_=batch_size, h_feature=h_feature, w_feature=w_feature, h_mask=h_mask, w_mask=w_mask, half_h_mask=(h_mask - 1) // 2, half_w_mask=(w_mask - 1) // 2) return grad_input, None, None, None psa_mask = PSAMaskFunction.apply class PSAMask(nn.Module): def __init__(self, psa_type, mask_size=None): super(PSAMask, self).__init__() assert psa_type in ['collect', 'distribute'] if psa_type == 'collect': psa_type_enum = 0 else: psa_type_enum = 1 self.psa_type_enum = psa_type_enum self.mask_size = mask_size self.psa_type = psa_type def forward(self, input): return psa_mask(input, self.psa_type_enum, self.mask_size) def __repr__(self): s = self.__class__.__name__ s += f'(psa_type={self.psa_type}, ' s += f'mask_size={self.mask_size})' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/roi_align.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from ..utils import deprecated_api_warning, ext_loader ext_module = ext_loader.load_ext('_ext', ['roi_align_forward', 'roi_align_backward']) class RoIAlignFunction(Function): @staticmethod def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio, pool_mode, aligned): from ..onnx import is_custom_op_loaded has_custom_op = is_custom_op_loaded() if has_custom_op: return g.op( 'mmcv::MMCVRoiAlign', input, rois, output_height_i=output_size[0], output_width_i=output_size[1], spatial_scale_f=spatial_scale, sampling_ratio_i=sampling_ratio, mode_s=pool_mode, aligned_i=aligned) else: from torch.onnx.symbolic_opset9 import sub, squeeze from torch.onnx.symbolic_helper import _slice_helper from torch.onnx import TensorProtoDataType # batch_indices = rois[:, 0].long() batch_indices = _slice_helper( g, rois, axes=[1], starts=[0], ends=[1]) batch_indices = squeeze(g, batch_indices, 1) batch_indices = g.op( 'Cast', batch_indices, to_i=TensorProtoDataType.INT64) # rois = rois[:, 1:] rois = _slice_helper(g, rois, axes=[1], starts=[1], ends=[5]) if aligned: # rois -= 0.5/spatial_scale aligned_offset = g.op( 'Constant', value_t=torch.tensor([0.5 / spatial_scale], dtype=torch.float32)) rois = sub(g, rois, aligned_offset) # roi align return g.op( 'RoiAlign', input, rois, batch_indices, output_height_i=output_size[0], output_width_i=output_size[1], spatial_scale_f=spatial_scale, sampling_ratio_i=max(0, sampling_ratio), mode_s=pool_mode) @staticmethod def forward(ctx, input, rois, output_size, spatial_scale=1.0, sampling_ratio=0, pool_mode='avg', aligned=True): ctx.output_size = _pair(output_size) ctx.spatial_scale = spatial_scale ctx.sampling_ratio = sampling_ratio assert pool_mode in ('max', 'avg') ctx.pool_mode = 0 if pool_mode == 'max' else 1 ctx.aligned = aligned ctx.input_shape = input.size() assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' output_shape = (rois.size(0), input.size(1), ctx.output_size[0], ctx.output_size[1]) output = input.new_zeros(output_shape) if ctx.pool_mode == 0: argmax_y = input.new_zeros(output_shape) argmax_x = input.new_zeros(output_shape) else: argmax_y = input.new_zeros(0) argmax_x = input.new_zeros(0) ext_module.roi_align_forward( input, rois, output, argmax_y, argmax_x, aligned_height=ctx.output_size[0], aligned_width=ctx.output_size[1], spatial_scale=ctx.spatial_scale, sampling_ratio=ctx.sampling_ratio, pool_mode=ctx.pool_mode, aligned=ctx.aligned) ctx.save_for_backward(rois, argmax_y, argmax_x) return output @staticmethod @once_differentiable def backward(ctx, grad_output): rois, argmax_y, argmax_x = ctx.saved_tensors grad_input = grad_output.new_zeros(ctx.input_shape) # complex head architecture may cause grad_output uncontiguous. grad_output = grad_output.contiguous() ext_module.roi_align_backward( grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height=ctx.output_size[0], aligned_width=ctx.output_size[1], spatial_scale=ctx.spatial_scale, sampling_ratio=ctx.sampling_ratio, pool_mode=ctx.pool_mode, aligned=ctx.aligned) return grad_input, None, None, None, None, None, None roi_align = RoIAlignFunction.apply class RoIAlign(nn.Module): """RoI align pooling layer. Args: output_size (tuple): h, w spatial_scale (float): scale the input boxes by this number sampling_ratio (int): number of inputs samples to take for each output sample. 0 to take samples densely for current models. pool_mode (str, 'avg' or 'max'): pooling mode in each bin. aligned (bool): if False, use the legacy implementation in MMDetection. If True, align the results more perfectly. use_torchvision (bool): whether to use roi_align from torchvision. Note: The implementation of RoIAlign when aligned=True is modified from https://github.com/facebookresearch/detectron2/ The meaning of aligned=True: Given a continuous coordinate c, its two neighboring pixel indices (in our pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled from the underlying signal at continuous coordinates 0.5 and 1.5). But the original roi_align (aligned=False) does not subtract the 0.5 when computing neighboring pixel indices and therefore it uses pixels with a slightly incorrect alignment (relative to our pixel model) when performing bilinear interpolation. With `aligned=True`, we first appropriately scale the ROI and then shift it by -0.5 prior to calling roi_align. This produces the correct neighbors; The difference does not make a difference to the model's performance if ROIAlign is used together with conv layers. """ @deprecated_api_warning( { 'out_size': 'output_size', 'sample_num': 'sampling_ratio' }, cls_name='RoIAlign') def __init__(self, output_size, spatial_scale=1.0, sampling_ratio=0, pool_mode='avg', aligned=True, use_torchvision=False): super(RoIAlign, self).__init__() self.output_size = _pair(output_size) self.spatial_scale = float(spatial_scale) self.sampling_ratio = int(sampling_ratio) self.pool_mode = pool_mode self.aligned = aligned self.use_torchvision = use_torchvision def forward(self, input, rois): """ Args: input: NCHW images rois: Bx5 boxes. First column is the index into N.\ The other 4 columns are xyxy. """ if self.use_torchvision: from torchvision.ops import roi_align as tv_roi_align if 'aligned' in tv_roi_align.__code__.co_varnames: return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned) else: if self.aligned: rois -= rois.new_tensor([0.] + [0.5 / self.spatial_scale] * 4) return tv_roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio) else: return roi_align(input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.pool_mode, self.aligned) def __repr__(self): s = self.__class__.__name__ s += f'(output_size={self.output_size}, ' s += f'spatial_scale={self.spatial_scale}, ' s += f'sampling_ratio={self.sampling_ratio}, ' s += f'pool_mode={self.pool_mode}, ' s += f'aligned={self.aligned}, ' s += f'use_torchvision={self.use_torchvision})' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/roi_align_rotated.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['roi_align_rotated_forward', 'roi_align_rotated_backward']) class RoIAlignRotatedFunction(Function): @staticmethod def symbolic(g, features, rois, out_size, spatial_scale, sample_num, aligned, clockwise): if isinstance(out_size, int): out_h = out_size out_w = out_size elif isinstance(out_size, tuple): assert len(out_size) == 2 assert isinstance(out_size[0], int) assert isinstance(out_size[1], int) out_h, out_w = out_size else: raise TypeError( '"out_size" must be an integer or tuple of integers') return g.op( 'mmcv::MMCVRoIAlignRotated', features, rois, output_height_i=out_h, output_width_i=out_h, spatial_scale_f=spatial_scale, sampling_ratio_i=sample_num, aligned_i=aligned, clockwise_i=clockwise) @staticmethod def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0, aligned=True, clockwise=False): if isinstance(out_size, int): out_h = out_size out_w = out_size elif isinstance(out_size, tuple): assert len(out_size) == 2 assert isinstance(out_size[0], int) assert isinstance(out_size[1], int) out_h, out_w = out_size else: raise TypeError( '"out_size" must be an integer or tuple of integers') ctx.spatial_scale = spatial_scale ctx.sample_num = sample_num ctx.aligned = aligned ctx.clockwise = clockwise ctx.save_for_backward(rois) ctx.feature_size = features.size() batch_size, num_channels, data_height, data_width = features.size() num_rois = rois.size(0) output = features.new_zeros(num_rois, num_channels, out_h, out_w) ext_module.roi_align_rotated_forward( features, rois, output, pooled_height=out_h, pooled_width=out_w, spatial_scale=spatial_scale, sample_num=sample_num, aligned=aligned, clockwise=clockwise) return output @staticmethod def backward(ctx, grad_output): feature_size = ctx.feature_size spatial_scale = ctx.spatial_scale aligned = ctx.aligned clockwise = ctx.clockwise sample_num = ctx.sample_num rois = ctx.saved_tensors[0] assert feature_size is not None batch_size, num_channels, data_height, data_width = feature_size out_w = grad_output.size(3) out_h = grad_output.size(2) grad_input = grad_rois = None if ctx.needs_input_grad[0]: grad_input = rois.new_zeros(batch_size, num_channels, data_height, data_width) ext_module.roi_align_rotated_backward( grad_output.contiguous(), rois, grad_input, pooled_height=out_h, pooled_width=out_w, spatial_scale=spatial_scale, sample_num=sample_num, aligned=aligned, clockwise=clockwise) return grad_input, grad_rois, None, None, None, None, None roi_align_rotated = RoIAlignRotatedFunction.apply class RoIAlignRotated(nn.Module): """RoI align pooling layer for rotated proposals. It accepts a feature map of shape (N, C, H, W) and rois with shape (n, 6) with each roi decoded as (batch_index, center_x, center_y, w, h, angle). The angle is in radian. Args: out_size (tuple): h, w spatial_scale (float): scale the input boxes by this number sample_num (int): number of inputs samples to take for each output sample. 0 to take samples densely for current models. aligned (bool): if False, use the legacy implementation in MMDetection. If True, align the results more perfectly. Default: True. clockwise (bool): If True, the angle in each proposal follows a clockwise fashion in image space, otherwise, the angle is counterclockwise. Default: False. Note: The implementation of RoIAlign when aligned=True is modified from https://github.com/facebookresearch/detectron2/ The meaning of aligned=True: Given a continuous coordinate c, its two neighboring pixel indices (in our pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled from the underlying signal at continuous coordinates 0.5 and 1.5). But the original roi_align (aligned=False) does not subtract the 0.5 when computing neighboring pixel indices and therefore it uses pixels with a slightly incorrect alignment (relative to our pixel model) when performing bilinear interpolation. With `aligned=True`, we first appropriately scale the ROI and then shift it by -0.5 prior to calling roi_align. This produces the correct neighbors; The difference does not make a difference to the model's performance if ROIAlign is used together with conv layers. """ def __init__(self, out_size, spatial_scale, sample_num=0, aligned=True, clockwise=False): super(RoIAlignRotated, self).__init__() self.out_size = out_size self.spatial_scale = float(spatial_scale) self.sample_num = int(sample_num) self.aligned = aligned self.clockwise = clockwise def forward(self, features, rois): return RoIAlignRotatedFunction.apply(features, rois, self.out_size, self.spatial_scale, self.sample_num, self.aligned, self.clockwise) ================================================ FILE: annotator/mmpkg/mmcv/ops/roi_pool.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['roi_pool_forward', 'roi_pool_backward']) class RoIPoolFunction(Function): @staticmethod def symbolic(g, input, rois, output_size, spatial_scale): return g.op( 'MaxRoiPool', input, rois, pooled_shape_i=output_size, spatial_scale_f=spatial_scale) @staticmethod def forward(ctx, input, rois, output_size, spatial_scale=1.0): ctx.output_size = _pair(output_size) ctx.spatial_scale = spatial_scale ctx.input_shape = input.size() assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' output_shape = (rois.size(0), input.size(1), ctx.output_size[0], ctx.output_size[1]) output = input.new_zeros(output_shape) argmax = input.new_zeros(output_shape, dtype=torch.int) ext_module.roi_pool_forward( input, rois, output, argmax, pooled_height=ctx.output_size[0], pooled_width=ctx.output_size[1], spatial_scale=ctx.spatial_scale) ctx.save_for_backward(rois, argmax) return output @staticmethod @once_differentiable def backward(ctx, grad_output): rois, argmax = ctx.saved_tensors grad_input = grad_output.new_zeros(ctx.input_shape) ext_module.roi_pool_backward( grad_output, rois, argmax, grad_input, pooled_height=ctx.output_size[0], pooled_width=ctx.output_size[1], spatial_scale=ctx.spatial_scale) return grad_input, None, None, None roi_pool = RoIPoolFunction.apply class RoIPool(nn.Module): def __init__(self, output_size, spatial_scale=1.0): super(RoIPool, self).__init__() self.output_size = _pair(output_size) self.spatial_scale = float(spatial_scale) def forward(self, input, rois): return roi_pool(input, rois, self.output_size, self.spatial_scale) def __repr__(self): s = self.__class__.__name__ s += f'(output_size={self.output_size}, ' s += f'spatial_scale={self.spatial_scale})' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/roiaware_pool3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn as nn from torch.autograd import Function import annotator.mmpkg.mmcv as mmcv from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['roiaware_pool3d_forward', 'roiaware_pool3d_backward']) class RoIAwarePool3d(nn.Module): """Encode the geometry-specific features of each 3D proposal. Please refer to `PartA2 `_ for more details. Args: out_size (int or tuple): The size of output features. n or [n1, n2, n3]. max_pts_per_voxel (int, optional): The maximum number of points per voxel. Default: 128. mode (str, optional): Pooling method of RoIAware, 'max' or 'avg'. Default: 'max'. """ def __init__(self, out_size, max_pts_per_voxel=128, mode='max'): super().__init__() self.out_size = out_size self.max_pts_per_voxel = max_pts_per_voxel assert mode in ['max', 'avg'] pool_mapping = {'max': 0, 'avg': 1} self.mode = pool_mapping[mode] def forward(self, rois, pts, pts_feature): """ Args: rois (torch.Tensor): [N, 7], in LiDAR coordinate, (x, y, z) is the bottom center of rois. pts (torch.Tensor): [npoints, 3], coordinates of input points. pts_feature (torch.Tensor): [npoints, C], features of input points. Returns: pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C] """ return RoIAwarePool3dFunction.apply(rois, pts, pts_feature, self.out_size, self.max_pts_per_voxel, self.mode) class RoIAwarePool3dFunction(Function): @staticmethod def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel, mode): """ Args: rois (torch.Tensor): [N, 7], in LiDAR coordinate, (x, y, z) is the bottom center of rois. pts (torch.Tensor): [npoints, 3], coordinates of input points. pts_feature (torch.Tensor): [npoints, C], features of input points. out_size (int or tuple): The size of output features. n or [n1, n2, n3]. max_pts_per_voxel (int): The maximum number of points per voxel. Default: 128. mode (int): Pooling method of RoIAware, 0 (max pool) or 1 (average pool). Returns: pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C], output pooled features. """ if isinstance(out_size, int): out_x = out_y = out_z = out_size else: assert len(out_size) == 3 assert mmcv.is_tuple_of(out_size, int) out_x, out_y, out_z = out_size num_rois = rois.shape[0] num_channels = pts_feature.shape[-1] num_pts = pts.shape[0] pooled_features = pts_feature.new_zeros( (num_rois, out_x, out_y, out_z, num_channels)) argmax = pts_feature.new_zeros( (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int) pts_idx_of_voxels = pts_feature.new_zeros( (num_rois, out_x, out_y, out_z, max_pts_per_voxel), dtype=torch.int) ext_module.roiaware_pool3d_forward(rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features, mode) ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode, num_pts, num_channels) return pooled_features @staticmethod def backward(ctx, grad_out): ret = ctx.roiaware_pool3d_for_backward pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret grad_in = grad_out.new_zeros((num_pts, num_channels)) ext_module.roiaware_pool3d_backward(pts_idx_of_voxels, argmax, grad_out.contiguous(), grad_in, mode) return None, None, grad_in, None, None, None ================================================ FILE: annotator/mmpkg/mmcv/ops/roipoint_pool3d.py ================================================ from torch import nn as nn from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['roipoint_pool3d_forward']) class RoIPointPool3d(nn.Module): """Encode the geometry-specific features of each 3D proposal. Please refer to `Paper of PartA2 `_ for more details. Args: num_sampled_points (int, optional): Number of samples in each roi. Default: 512. """ def __init__(self, num_sampled_points=512): super().__init__() self.num_sampled_points = num_sampled_points def forward(self, points, point_features, boxes3d): """ Args: points (torch.Tensor): Input points whose shape is (B, N, C). point_features (torch.Tensor): Features of input points whose shape is (B, N, C). boxes3d (B, M, 7), Input bounding boxes whose shape is (B, M, 7). Returns: pooled_features (torch.Tensor): The output pooled features whose shape is (B, M, 512, 3 + C). pooled_empty_flag (torch.Tensor): Empty flag whose shape is (B, M). """ return RoIPointPool3dFunction.apply(points, point_features, boxes3d, self.num_sampled_points) class RoIPointPool3dFunction(Function): @staticmethod def forward(ctx, points, point_features, boxes3d, num_sampled_points=512): """ Args: points (torch.Tensor): Input points whose shape is (B, N, C). point_features (torch.Tensor): Features of input points whose shape is (B, N, C). boxes3d (B, M, 7), Input bounding boxes whose shape is (B, M, 7). num_sampled_points (int, optional): The num of sampled points. Default: 512. Returns: pooled_features (torch.Tensor): The output pooled features whose shape is (B, M, 512, 3 + C). pooled_empty_flag (torch.Tensor): Empty flag whose shape is (B, M). """ assert len(points.shape) == 3 and points.shape[2] == 3 batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[ 1], point_features.shape[2] pooled_boxes3d = boxes3d.view(batch_size, -1, 7) pooled_features = point_features.new_zeros( (batch_size, boxes_num, num_sampled_points, 3 + feature_len)) pooled_empty_flag = point_features.new_zeros( (batch_size, boxes_num)).int() ext_module.roipoint_pool3d_forward(points.contiguous(), pooled_boxes3d.contiguous(), point_features.contiguous(), pooled_features, pooled_empty_flag) return pooled_features, pooled_empty_flag @staticmethod def backward(ctx, grad_out): raise NotImplementedError ================================================ FILE: annotator/mmpkg/mmcv/ops/saconv.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import CONV_LAYERS, ConvAWS2d, constant_init from annotator.mmpkg.mmcv.ops.deform_conv import deform_conv2d from annotator.mmpkg.mmcv.utils import TORCH_VERSION, digit_version @CONV_LAYERS.register_module(name='SAC') class SAConv2d(ConvAWS2d): """SAC (Switchable Atrous Convolution) This is an implementation of SAC in DetectoRS (https://arxiv.org/pdf/2006.02334.pdf). Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution kernel_size (int or tuple): Size of the convolving kernel stride (int or tuple, optional): Stride of the convolution. Default: 1 padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 padding_mode (string, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` use_deform: If ``True``, replace convolution with deformable convolution. Default: ``False``. """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, use_deform=False): super().__init__( in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) self.use_deform = use_deform self.switch = nn.Conv2d( self.in_channels, 1, kernel_size=1, stride=stride, bias=True) self.weight_diff = nn.Parameter(torch.Tensor(self.weight.size())) self.pre_context = nn.Conv2d( self.in_channels, self.in_channels, kernel_size=1, bias=True) self.post_context = nn.Conv2d( self.out_channels, self.out_channels, kernel_size=1, bias=True) if self.use_deform: self.offset_s = nn.Conv2d( self.in_channels, 18, kernel_size=3, padding=1, stride=stride, bias=True) self.offset_l = nn.Conv2d( self.in_channels, 18, kernel_size=3, padding=1, stride=stride, bias=True) self.init_weights() def init_weights(self): constant_init(self.switch, 0, bias=1) self.weight_diff.data.zero_() constant_init(self.pre_context, 0) constant_init(self.post_context, 0) if self.use_deform: constant_init(self.offset_s, 0) constant_init(self.offset_l, 0) def forward(self, x): # pre-context avg_x = F.adaptive_avg_pool2d(x, output_size=1) avg_x = self.pre_context(avg_x) avg_x = avg_x.expand_as(x) x = x + avg_x # switch avg_x = F.pad(x, pad=(2, 2, 2, 2), mode='reflect') avg_x = F.avg_pool2d(avg_x, kernel_size=5, stride=1, padding=0) switch = self.switch(avg_x) # sac weight = self._get_weight(self.weight) zero_bias = torch.zeros( self.out_channels, device=weight.device, dtype=weight.dtype) if self.use_deform: offset = self.offset_s(avg_x) out_s = deform_conv2d(x, offset, weight, self.stride, self.padding, self.dilation, self.groups, 1) else: if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.5.0')): out_s = super().conv2d_forward(x, weight) elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'): # bias is a required argument of _conv_forward in torch 1.8.0 out_s = super()._conv_forward(x, weight, zero_bias) else: out_s = super()._conv_forward(x, weight) ori_p = self.padding ori_d = self.dilation self.padding = tuple(3 * p for p in self.padding) self.dilation = tuple(3 * d for d in self.dilation) weight = weight + self.weight_diff if self.use_deform: offset = self.offset_l(avg_x) out_l = deform_conv2d(x, offset, weight, self.stride, self.padding, self.dilation, self.groups, 1) else: if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.5.0')): out_l = super().conv2d_forward(x, weight) elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'): # bias is a required argument of _conv_forward in torch 1.8.0 out_l = super()._conv_forward(x, weight, zero_bias) else: out_l = super()._conv_forward(x, weight) out = switch * out_s + (1 - switch) * out_l self.padding = ori_p self.dilation = ori_d # post-context avg_x = F.adaptive_avg_pool2d(out, output_size=1) avg_x = self.post_context(avg_x) avg_x = avg_x.expand_as(out) out = out + avg_x return out ================================================ FILE: annotator/mmpkg/mmcv/ops/scatter_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['dynamic_point_to_voxel_forward', 'dynamic_point_to_voxel_backward']) class _DynamicScatter(Function): @staticmethod def forward(ctx, feats, coors, reduce_type='max'): """convert kitti points(N, >=3) to voxels. Args: feats (torch.Tensor): [N, C]. Points features to be reduced into voxels. coors (torch.Tensor): [N, ndim]. Corresponding voxel coordinates (specifically multi-dim voxel index) of each points. reduce_type (str, optional): Reduce op. support 'max', 'sum' and 'mean'. Default: 'max'. Returns: voxel_feats (torch.Tensor): [M, C]. Reduced features, input features that shares the same voxel coordinates are reduced to one row. voxel_coors (torch.Tensor): [M, ndim]. Voxel coordinates. """ results = ext_module.dynamic_point_to_voxel_forward( feats, coors, reduce_type) (voxel_feats, voxel_coors, point2voxel_map, voxel_points_count) = results ctx.reduce_type = reduce_type ctx.save_for_backward(feats, voxel_feats, point2voxel_map, voxel_points_count) ctx.mark_non_differentiable(voxel_coors) return voxel_feats, voxel_coors @staticmethod def backward(ctx, grad_voxel_feats, grad_voxel_coors=None): (feats, voxel_feats, point2voxel_map, voxel_points_count) = ctx.saved_tensors grad_feats = torch.zeros_like(feats) # TODO: whether to use index put or use cuda_backward # To use index put, need point to voxel index ext_module.dynamic_point_to_voxel_backward( grad_feats, grad_voxel_feats.contiguous(), feats, voxel_feats, point2voxel_map, voxel_points_count, ctx.reduce_type) return grad_feats, None, None dynamic_scatter = _DynamicScatter.apply class DynamicScatter(nn.Module): """Scatters points into voxels, used in the voxel encoder with dynamic voxelization. Note: The CPU and GPU implementation get the same output, but have numerical difference after summation and division (e.g., 5e-7). Args: voxel_size (list): list [x, y, z] size of three dimension. point_cloud_range (list): The coordinate range of points, [x_min, y_min, z_min, x_max, y_max, z_max]. average_points (bool): whether to use avg pooling to scatter points into voxel. """ def __init__(self, voxel_size, point_cloud_range, average_points: bool): super().__init__() self.voxel_size = voxel_size self.point_cloud_range = point_cloud_range self.average_points = average_points def forward_single(self, points, coors): """Scatters points into voxels. Args: points (torch.Tensor): Points to be reduced into voxels. coors (torch.Tensor): Corresponding voxel coordinates (specifically multi-dim voxel index) of each points. Returns: voxel_feats (torch.Tensor): Reduced features, input features that shares the same voxel coordinates are reduced to one row. voxel_coors (torch.Tensor): Voxel coordinates. """ reduce = 'mean' if self.average_points else 'max' return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce) def forward(self, points, coors): """Scatters points/features into voxels. Args: points (torch.Tensor): Points to be reduced into voxels. coors (torch.Tensor): Corresponding voxel coordinates (specifically multi-dim voxel index) of each points. Returns: voxel_feats (torch.Tensor): Reduced features, input features that shares the same voxel coordinates are reduced to one row. voxel_coors (torch.Tensor): Voxel coordinates. """ if coors.size(-1) == 3: return self.forward_single(points, coors) else: batch_size = coors[-1, 0] + 1 voxels, voxel_coors = [], [] for i in range(batch_size): inds = torch.where(coors[:, 0] == i) voxel, voxel_coor = self.forward_single( points[inds], coors[inds][:, 1:]) coor_pad = nn.functional.pad( voxel_coor, (1, 0), mode='constant', value=i) voxel_coors.append(coor_pad) voxels.append(voxel) features = torch.cat(voxels, dim=0) feature_coors = torch.cat(voxel_coors, dim=0) return features, feature_coors def __repr__(self): s = self.__class__.__name__ + '(' s += 'voxel_size=' + str(self.voxel_size) s += ', point_cloud_range=' + str(self.point_cloud_range) s += ', average_points=' + str(self.average_points) s += ')' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/sync_bn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.distributed as dist import torch.nn.functional as F from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.module import Module from torch.nn.parameter import Parameter from annotator.mmpkg.mmcv.cnn import NORM_LAYERS from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ 'sync_bn_forward_mean', 'sync_bn_forward_var', 'sync_bn_forward_output', 'sync_bn_backward_param', 'sync_bn_backward_data' ]) class SyncBatchNormFunction(Function): @staticmethod def symbolic(g, input, running_mean, running_var, weight, bias, momentum, eps, group, group_size, stats_mode): return g.op( 'mmcv::MMCVSyncBatchNorm', input, running_mean, running_var, weight, bias, momentum_f=momentum, eps_f=eps, group_i=group, group_size_i=group_size, stats_mode=stats_mode) @staticmethod def forward(self, input, running_mean, running_var, weight, bias, momentum, eps, group, group_size, stats_mode): self.momentum = momentum self.eps = eps self.group = group self.group_size = group_size self.stats_mode = stats_mode assert isinstance( input, (torch.HalfTensor, torch.FloatTensor, torch.cuda.HalfTensor, torch.cuda.FloatTensor)), \ f'only support Half or Float Tensor, but {input.type()}' output = torch.zeros_like(input) input3d = input.flatten(start_dim=2) output3d = output.view_as(input3d) num_channels = input3d.size(1) # ensure mean/var/norm/std are initialized as zeros # ``torch.empty()`` does not guarantee that mean = torch.zeros( num_channels, dtype=torch.float, device=input3d.device) var = torch.zeros( num_channels, dtype=torch.float, device=input3d.device) norm = torch.zeros_like( input3d, dtype=torch.float, device=input3d.device) std = torch.zeros( num_channels, dtype=torch.float, device=input3d.device) batch_size = input3d.size(0) if batch_size > 0: ext_module.sync_bn_forward_mean(input3d, mean) batch_flag = torch.ones([1], device=mean.device, dtype=mean.dtype) else: # skip updating mean and leave it as zeros when the input is empty batch_flag = torch.zeros([1], device=mean.device, dtype=mean.dtype) # synchronize mean and the batch flag vec = torch.cat([mean, batch_flag]) if self.stats_mode == 'N': vec *= batch_size if self.group_size > 1: dist.all_reduce(vec, group=self.group) total_batch = vec[-1].detach() mean = vec[:num_channels] if self.stats_mode == 'default': mean = mean / self.group_size elif self.stats_mode == 'N': mean = mean / total_batch.clamp(min=1) else: raise NotImplementedError # leave var as zeros when the input is empty if batch_size > 0: ext_module.sync_bn_forward_var(input3d, mean, var) if self.stats_mode == 'N': var *= batch_size if self.group_size > 1: dist.all_reduce(var, group=self.group) if self.stats_mode == 'default': var /= self.group_size elif self.stats_mode == 'N': var /= total_batch.clamp(min=1) else: raise NotImplementedError # if the total batch size over all the ranks is zero, # we should not update the statistics in the current batch update_flag = total_batch.clamp(max=1) momentum = update_flag * self.momentum ext_module.sync_bn_forward_output( input3d, mean, var, weight, bias, running_mean, running_var, norm, std, output3d, eps=self.eps, momentum=momentum, group_size=self.group_size) self.save_for_backward(norm, std, weight) return output @staticmethod @once_differentiable def backward(self, grad_output): norm, std, weight = self.saved_tensors grad_weight = torch.zeros_like(weight) grad_bias = torch.zeros_like(weight) grad_input = torch.zeros_like(grad_output) grad_output3d = grad_output.flatten(start_dim=2) grad_input3d = grad_input.view_as(grad_output3d) batch_size = grad_input3d.size(0) if batch_size > 0: ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight, grad_bias) # all reduce if self.group_size > 1: dist.all_reduce(grad_weight, group=self.group) dist.all_reduce(grad_bias, group=self.group) grad_weight /= self.group_size grad_bias /= self.group_size if batch_size > 0: ext_module.sync_bn_backward_data(grad_output3d, weight, grad_weight, grad_bias, norm, std, grad_input3d) return grad_input, None, None, grad_weight, grad_bias, \ None, None, None, None, None @NORM_LAYERS.register_module(name='MMSyncBN') class SyncBatchNorm(Module): """Synchronized Batch Normalization. Args: num_features (int): number of features/chennels in input tensor eps (float, optional): a value added to the denominator for numerical stability. Defaults to 1e-5. momentum (float, optional): the value used for the running_mean and running_var computation. Defaults to 0.1. affine (bool, optional): whether to use learnable affine parameters. Defaults to True. track_running_stats (bool, optional): whether to track the running mean and variance during training. When set to False, this module does not track such statistics, and initializes statistics buffers ``running_mean`` and ``running_var`` as ``None``. When these buffers are ``None``, this module always uses batch statistics in both training and eval modes. Defaults to True. group (int, optional): synchronization of stats happen within each process group individually. By default it is synchronization across the whole world. Defaults to None. stats_mode (str, optional): The statistical mode. Available options includes ``'default'`` and ``'N'``. Defaults to 'default'. When ``stats_mode=='default'``, it computes the overall statistics using those from each worker with equal weight, i.e., the statistics are synchronized and simply divied by ``group``. This mode will produce inaccurate statistics when empty tensors occur. When ``stats_mode=='N'``, it compute the overall statistics using the total number of batches in each worker ignoring the number of group, i.e., the statistics are synchronized and then divied by the total batch ``N``. This mode is beneficial when empty tensors occur during training, as it average the total mean by the real number of batch. """ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, group=None, stats_mode='default'): super(SyncBatchNorm, self).__init__() self.num_features = num_features self.eps = eps self.momentum = momentum self.affine = affine self.track_running_stats = track_running_stats group = dist.group.WORLD if group is None else group self.group = group self.group_size = dist.get_world_size(group) assert stats_mode in ['default', 'N'], \ f'"stats_mode" only accepts "default" and "N", got "{stats_mode}"' self.stats_mode = stats_mode if self.affine: self.weight = Parameter(torch.Tensor(num_features)) self.bias = Parameter(torch.Tensor(num_features)) else: self.register_parameter('weight', None) self.register_parameter('bias', None) if self.track_running_stats: self.register_buffer('running_mean', torch.zeros(num_features)) self.register_buffer('running_var', torch.ones(num_features)) self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long)) else: self.register_buffer('running_mean', None) self.register_buffer('running_var', None) self.register_buffer('num_batches_tracked', None) self.reset_parameters() def reset_running_stats(self): if self.track_running_stats: self.running_mean.zero_() self.running_var.fill_(1) self.num_batches_tracked.zero_() def reset_parameters(self): self.reset_running_stats() if self.affine: self.weight.data.uniform_() # pytorch use ones_() self.bias.data.zero_() def forward(self, input): if input.dim() < 2: raise ValueError( f'expected at least 2D input, got {input.dim()}D input') if self.momentum is None: exponential_average_factor = 0.0 else: exponential_average_factor = self.momentum if self.training and self.track_running_stats: if self.num_batches_tracked is not None: self.num_batches_tracked += 1 if self.momentum is None: # use cumulative moving average exponential_average_factor = 1.0 / float( self.num_batches_tracked) else: # use exponential moving average exponential_average_factor = self.momentum if self.training or not self.track_running_stats: return SyncBatchNormFunction.apply( input, self.running_mean, self.running_var, self.weight, self.bias, exponential_average_factor, self.eps, self.group, self.group_size, self.stats_mode) else: return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, exponential_average_factor, self.eps) def __repr__(self): s = self.__class__.__name__ s += f'({self.num_features}, ' s += f'eps={self.eps}, ' s += f'momentum={self.momentum}, ' s += f'affine={self.affine}, ' s += f'track_running_stats={self.track_running_stats}, ' s += f'group_size={self.group_size},' s += f'stats_mode={self.stats_mode})' return s ================================================ FILE: annotator/mmpkg/mmcv/ops/three_interpolate.py ================================================ from typing import Tuple import torch from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['three_interpolate_forward', 'three_interpolate_backward']) class ThreeInterpolate(Function): """Performs weighted linear interpolation on 3 features. Please refer to `Paper of PointNet++ `_ for more details. """ @staticmethod def forward(ctx, features: torch.Tensor, indices: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: """ Args: features (Tensor): (B, C, M) Features descriptors to be interpolated indices (Tensor): (B, n, 3) index three nearest neighbors of the target features in features weight (Tensor): (B, n, 3) weights of interpolation Returns: Tensor: (B, C, N) tensor of the interpolated features """ assert features.is_contiguous() assert indices.is_contiguous() assert weight.is_contiguous() B, c, m = features.size() n = indices.size(1) ctx.three_interpolate_for_backward = (indices, weight, m) output = torch.cuda.FloatTensor(B, c, n) ext_module.three_interpolate_forward( features, indices, weight, output, b=B, c=c, m=m, n=n) return output @staticmethod def backward( ctx, grad_out: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Args: grad_out (Tensor): (B, C, N) tensor with gradients of outputs Returns: Tensor: (B, C, M) tensor with gradients of features """ idx, weight, m = ctx.three_interpolate_for_backward B, c, n = grad_out.size() grad_features = torch.cuda.FloatTensor(B, c, m).zero_() grad_out_data = grad_out.data.contiguous() ext_module.three_interpolate_backward( grad_out_data, idx, weight, grad_features.data, b=B, c=c, n=n, m=m) return grad_features, None, None three_interpolate = ThreeInterpolate.apply ================================================ FILE: annotator/mmpkg/mmcv/ops/three_nn.py ================================================ from typing import Tuple import torch from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['three_nn_forward']) class ThreeNN(Function): """Find the top-3 nearest neighbors of the target set from the source set. Please refer to `Paper of PointNet++ `_ for more details. """ @staticmethod def forward(ctx, target: torch.Tensor, source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Args: target (Tensor): shape (B, N, 3), points set that needs to find the nearest neighbors. source (Tensor): shape (B, M, 3), points set that is used to find the nearest neighbors of points in target set. Returns: Tensor: shape (B, N, 3), L2 distance of each point in target set to their corresponding nearest neighbors. """ target = target.contiguous() source = source.contiguous() B, N, _ = target.size() m = source.size(1) dist2 = torch.cuda.FloatTensor(B, N, 3) idx = torch.cuda.IntTensor(B, N, 3) ext_module.three_nn_forward(target, source, dist2, idx, b=B, n=N, m=m) if torch.__version__ != 'parrots': ctx.mark_non_differentiable(idx) return torch.sqrt(dist2), idx @staticmethod def backward(ctx, a=None, b=None): return None, None three_nn = ThreeNN.apply ================================================ FILE: annotator/mmpkg/mmcv/ops/tin_shift.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # Code reference from "Temporal Interlacing Network" # https://github.com/deepcs233/TIN/blob/master/cuda_shift/rtc_wrap.py # Hao Shao, Shengju Qian, Yu Liu # shaoh19@mails.tsinghua.edu.cn, sjqian@cse.cuhk.edu.hk, yuliu@ee.cuhk.edu.hk import torch import torch.nn as nn from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['tin_shift_forward', 'tin_shift_backward']) class TINShiftFunction(Function): @staticmethod def forward(ctx, input, shift): C = input.size(2) num_segments = shift.size(1) if C // num_segments <= 0 or C % num_segments != 0: raise ValueError('C should be a multiple of num_segments, ' f'but got C={C} and num_segments={num_segments}.') ctx.save_for_backward(shift) out = torch.zeros_like(input) ext_module.tin_shift_forward(input, shift, out) return out @staticmethod def backward(ctx, grad_output): shift = ctx.saved_tensors[0] data_grad_input = grad_output.new(*grad_output.size()).zero_() shift_grad_input = shift.new(*shift.size()).zero_() ext_module.tin_shift_backward(grad_output, shift, data_grad_input) return data_grad_input, shift_grad_input tin_shift = TINShiftFunction.apply class TINShift(nn.Module): """Temporal Interlace Shift. Temporal Interlace shift is a differentiable temporal-wise frame shifting which is proposed in "Temporal Interlacing Network" Please refer to https://arxiv.org/abs/2001.06499 for more details. Code is modified from https://github.com/mit-han-lab/temporal-shift-module """ def forward(self, input, shift): """Perform temporal interlace shift. Args: input (Tensor): Feature map with shape [N, num_segments, C, H * W]. shift (Tensor): Shift tensor with shape [N, num_segments]. Returns: Feature map after temporal interlace shift. """ return tin_shift(input, shift) ================================================ FILE: annotator/mmpkg/mmcv/ops/upfirdn2d.py ================================================ # modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py # noqa:E501 # Copyright (c) 2021, NVIDIA Corporation. All rights reserved. # NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator # Augmentation (ADA) # ======================================================================= # 1. Definitions # "Licensor" means any person or entity that distributes its Work. # "Software" means the original work of authorship made available under # this License. # "Work" means the Software and any additions to or derivative works of # the Software that are made available under this License. # The terms "reproduce," "reproduction," "derivative works," and # "distribution" have the meaning as provided under U.S. copyright law; # provided, however, that for the purposes of this License, derivative # works shall not include works that remain separable from, or merely # link (or bind by name) to the interfaces of, the Work. # Works, including the Software, are "made available" under this License # by including in or with the Work either (a) a copyright notice # referencing the applicability of this License to the Work, or (b) a # copy of this License. # 2. License Grants # 2.1 Copyright Grant. Subject to the terms and conditions of this # License, each Licensor grants to you a perpetual, worldwide, # non-exclusive, royalty-free, copyright license to reproduce, # prepare derivative works of, publicly display, publicly perform, # sublicense and distribute its Work and any resulting derivative # works in any form. # 3. Limitations # 3.1 Redistribution. You may reproduce or distribute the Work only # if (a) you do so under this License, (b) you include a complete # copy of this License with your distribution, and (c) you retain # without modification any copyright, patent, trademark, or # attribution notices that are present in the Work. # 3.2 Derivative Works. You may specify that additional or different # terms apply to the use, reproduction, and distribution of your # derivative works of the Work ("Your Terms") only if (a) Your Terms # provide that the use limitation in Section 3.3 applies to your # derivative works, and (b) you identify the specific derivative # works that are subject to Your Terms. Notwithstanding Your Terms, # this License (including the redistribution requirements in Section # 3.1) will continue to apply to the Work itself. # 3.3 Use Limitation. The Work and any derivative works thereof only # may be used or intended for use non-commercially. Notwithstanding # the foregoing, NVIDIA and its affiliates may use the Work and any # derivative works commercially. As used herein, "non-commercially" # means for research or evaluation purposes only. # 3.4 Patent Claims. If you bring or threaten to bring a patent claim # against any Licensor (including any claim, cross-claim or # counterclaim in a lawsuit) to enforce any patents that you allege # are infringed by any Work, then your rights under this License from # such Licensor (including the grant in Section 2.1) will terminate # immediately. # 3.5 Trademarks. This License does not grant any rights to use any # Licensor’s or its affiliates’ names, logos, or trademarks, except # as necessary to reproduce the notices described in this License. # 3.6 Termination. If you violate any term of this License, then your # rights under this License (including the grant in Section 2.1) will # terminate immediately. # 4. Disclaimer of Warranty. # THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR # NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER # THIS LICENSE. # 5. Limitation of Liability. # EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL # THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE # SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, # INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF # OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK # (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, # LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER # COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF # THE POSSIBILITY OF SUCH DAMAGES. # ======================================================================= import torch from torch.autograd import Function from torch.nn import functional as F from annotator.mmpkg.mmcv.utils import to_2tuple from ..utils import ext_loader upfirdn2d_ext = ext_loader.load_ext('_ext', ['upfirdn2d']) class UpFirDn2dBackward(Function): @staticmethod def forward(ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, in_size, out_size): up_x, up_y = up down_x, down_y = down g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1) grad_input = upfirdn2d_ext.upfirdn2d( grad_output, grad_kernel, up_x=down_x, up_y=down_y, down_x=up_x, down_y=up_y, pad_x0=g_pad_x0, pad_x1=g_pad_x1, pad_y0=g_pad_y0, pad_y1=g_pad_y1) grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], in_size[3]) ctx.save_for_backward(kernel) pad_x0, pad_x1, pad_y0, pad_y1 = pad ctx.up_x = up_x ctx.up_y = up_y ctx.down_x = down_x ctx.down_y = down_y ctx.pad_x0 = pad_x0 ctx.pad_x1 = pad_x1 ctx.pad_y0 = pad_y0 ctx.pad_y1 = pad_y1 ctx.in_size = in_size ctx.out_size = out_size return grad_input @staticmethod def backward(ctx, gradgrad_input): kernel, = ctx.saved_tensors gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], ctx.in_size[3], 1) gradgrad_out = upfirdn2d_ext.upfirdn2d( gradgrad_input, kernel, up_x=ctx.up_x, up_y=ctx.up_y, down_x=ctx.down_x, down_y=ctx.down_y, pad_x0=ctx.pad_x0, pad_x1=ctx.pad_x1, pad_y0=ctx.pad_y0, pad_y1=ctx.pad_y1) # gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0], # ctx.out_size[1], ctx.in_size[3]) gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.in_size[1], ctx.out_size[0], ctx.out_size[1]) return gradgrad_out, None, None, None, None, None, None, None, None class UpFirDn2d(Function): @staticmethod def forward(ctx, input, kernel, up, down, pad): up_x, up_y = up down_x, down_y = down pad_x0, pad_x1, pad_y0, pad_y1 = pad kernel_h, kernel_w = kernel.shape batch, channel, in_h, in_w = input.shape ctx.in_size = input.shape input = input.reshape(-1, in_h, in_w, 1) ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1])) out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 ctx.out_size = (out_h, out_w) ctx.up = (up_x, up_y) ctx.down = (down_x, down_y) ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1) g_pad_x0 = kernel_w - pad_x0 - 1 g_pad_y0 = kernel_h - pad_y0 - 1 g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1 g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1 ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1) out = upfirdn2d_ext.upfirdn2d( input, kernel, up_x=up_x, up_y=up_y, down_x=down_x, down_y=down_y, pad_x0=pad_x0, pad_x1=pad_x1, pad_y0=pad_y0, pad_y1=pad_y1) # out = out.view(major, out_h, out_w, minor) out = out.view(-1, channel, out_h, out_w) return out @staticmethod def backward(ctx, grad_output): kernel, grad_kernel = ctx.saved_tensors grad_input = UpFirDn2dBackward.apply( grad_output, kernel, grad_kernel, ctx.up, ctx.down, ctx.pad, ctx.g_pad, ctx.in_size, ctx.out_size, ) return grad_input, None, None, None, None def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): """UpFRIDn for 2d features. UpFIRDn is short for upsample, apply FIR filter and downsample. More details can be found in: https://www.mathworks.com/help/signal/ref/upfirdn.html Args: input (Tensor): Tensor with shape of (n, c, h, w). kernel (Tensor): Filter kernel. up (int | tuple[int], optional): Upsampling factor. If given a number, we will use this factor for the both height and width side. Defaults to 1. down (int | tuple[int], optional): Downsampling factor. If given a number, we will use this factor for the both height and width side. Defaults to 1. pad (tuple[int], optional): Padding for tensors, (x_pad, y_pad) or (x_pad_0, x_pad_1, y_pad_0, y_pad_1). Defaults to (0, 0). Returns: Tensor: Tensor after UpFIRDn. """ if input.device.type == 'cpu': if len(pad) == 2: pad = (pad[0], pad[1], pad[0], pad[1]) up = to_2tuple(up) down = to_2tuple(down) out = upfirdn2d_native(input, kernel, up[0], up[1], down[0], down[1], pad[0], pad[1], pad[2], pad[3]) else: _up = to_2tuple(up) _down = to_2tuple(down) if len(pad) == 4: _pad = pad elif len(pad) == 2: _pad = (pad[0], pad[1], pad[0], pad[1]) out = UpFirDn2d.apply(input, kernel, _up, _down, _pad) return out def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1): _, channel, in_h, in_w = input.shape input = input.reshape(-1, in_h, in_w, 1) _, in_h, in_w, minor = input.shape kernel_h, kernel_w = kernel.shape out = input.view(-1, in_h, 1, in_w, 1, minor) out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1]) out = out.view(-1, in_h * up_y, in_w * up_x, minor) out = F.pad( out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]) out = out[:, max(-pad_y0, 0):out.shape[1] - max(-pad_y1, 0), max(-pad_x0, 0):out.shape[2] - max(-pad_x1, 0), :, ] out = out.permute(0, 3, 1, 2) out = out.reshape( [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) out = F.conv2d(out, w) out = out.reshape( -1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, ) out = out.permute(0, 2, 3, 1) out = out[:, ::down_y, ::down_x, :] out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 return out.view(-1, channel, out_h, out_w) ================================================ FILE: annotator/mmpkg/mmcv/ops/voxelize.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn from torch.autograd import Function from torch.nn.modules.utils import _pair from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['dynamic_voxelize_forward', 'hard_voxelize_forward']) class _Voxelization(Function): @staticmethod def forward(ctx, points, voxel_size, coors_range, max_points=35, max_voxels=20000): """Convert kitti points(N, >=3) to voxels. Args: points (torch.Tensor): [N, ndim]. Points[:, :3] contain xyz points and points[:, 3:] contain other information like reflectivity. voxel_size (tuple or float): The size of voxel with the shape of [3]. coors_range (tuple or float): The coordinate range of voxel with the shape of [6]. max_points (int, optional): maximum points contained in a voxel. if max_points=-1, it means using dynamic_voxelize. Default: 35. max_voxels (int, optional): maximum voxels this function create. for second, 20000 is a good choice. Users should shuffle points before call this function because max_voxels may drop points. Default: 20000. Returns: voxels_out (torch.Tensor): Output voxels with the shape of [M, max_points, ndim]. Only contain points and returned when max_points != -1. coors_out (torch.Tensor): Output coordinates with the shape of [M, 3]. num_points_per_voxel_out (torch.Tensor): Num points per voxel with the shape of [M]. Only returned when max_points != -1. """ if max_points == -1 or max_voxels == -1: coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int) ext_module.dynamic_voxelize_forward(points, coors, voxel_size, coors_range, 3) return coors else: voxels = points.new_zeros( size=(max_voxels, max_points, points.size(1))) coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int) num_points_per_voxel = points.new_zeros( size=(max_voxels, ), dtype=torch.int) voxel_num = ext_module.hard_voxelize_forward( points, voxels, coors, num_points_per_voxel, voxel_size, coors_range, max_points, max_voxels, 3) # select the valid voxels voxels_out = voxels[:voxel_num] coors_out = coors[:voxel_num] num_points_per_voxel_out = num_points_per_voxel[:voxel_num] return voxels_out, coors_out, num_points_per_voxel_out voxelization = _Voxelization.apply class Voxelization(nn.Module): """Convert kitti points(N, >=3) to voxels. Please refer to `PVCNN `_ for more details. Args: voxel_size (tuple or float): The size of voxel with the shape of [3]. point_cloud_range (tuple or float): The coordinate range of voxel with the shape of [6]. max_num_points (int): maximum points contained in a voxel. if max_points=-1, it means using dynamic_voxelize. max_voxels (int, optional): maximum voxels this function create. for second, 20000 is a good choice. Users should shuffle points before call this function because max_voxels may drop points. Default: 20000. """ def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000): super().__init__() self.voxel_size = voxel_size self.point_cloud_range = point_cloud_range self.max_num_points = max_num_points if isinstance(max_voxels, tuple): self.max_voxels = max_voxels else: self.max_voxels = _pair(max_voxels) point_cloud_range = torch.tensor( point_cloud_range, dtype=torch.float32) voxel_size = torch.tensor(voxel_size, dtype=torch.float32) grid_size = (point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size grid_size = torch.round(grid_size).long() input_feat_shape = grid_size[:2] self.grid_size = grid_size # the origin shape is as [x-len, y-len, z-len] # [w, h, d] -> [d, h, w] self.pcd_shape = [*input_feat_shape, 1][::-1] def forward(self, input): if self.training: max_voxels = self.max_voxels[0] else: max_voxels = self.max_voxels[1] return voxelization(input, self.voxel_size, self.point_cloud_range, self.max_num_points, max_voxels) def __repr__(self): s = self.__class__.__name__ + '(' s += 'voxel_size=' + str(self.voxel_size) s += ', point_cloud_range=' + str(self.point_cloud_range) s += ', max_num_points=' + str(self.max_num_points) s += ', max_voxels=' + str(self.max_voxels) s += ')' return s ================================================ FILE: annotator/mmpkg/mmcv/parallel/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .collate import collate from .data_container import DataContainer from .data_parallel import MMDataParallel from .distributed import MMDistributedDataParallel from .registry import MODULE_WRAPPERS from .scatter_gather import scatter, scatter_kwargs from .utils import is_module_wrapper __all__ = [ 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel', 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS' ] ================================================ FILE: annotator/mmpkg/mmcv/parallel/_functions.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch.nn.parallel._functions import _get_stream def scatter(input, devices, streams=None): """Scatters tensor across multiple GPUs.""" if streams is None: streams = [None] * len(devices) if isinstance(input, list): chunk_size = (len(input) - 1) // len(devices) + 1 outputs = [ scatter(input[i], [devices[i // chunk_size]], [streams[i // chunk_size]]) for i in range(len(input)) ] return outputs elif isinstance(input, torch.Tensor): output = input.contiguous() # TODO: copy to a pinned buffer first (if copying from CPU) stream = streams[0] if output.numel() > 0 else None if devices != [-1]: with torch.cuda.device(devices[0]), torch.cuda.stream(stream): output = output.cuda(devices[0], non_blocking=True) else: # unsqueeze the first dimension thus the tensor's shape is the # same as those scattered with GPU. output = output.unsqueeze(0) return output else: raise Exception(f'Unknown type {type(input)}.') def synchronize_stream(output, devices, streams): if isinstance(output, list): chunk_size = len(output) // len(devices) for i in range(len(devices)): for j in range(chunk_size): synchronize_stream(output[i * chunk_size + j], [devices[i]], [streams[i]]) elif isinstance(output, torch.Tensor): if output.numel() != 0: with torch.cuda.device(devices[0]): main_stream = torch.cuda.current_stream() main_stream.wait_stream(streams[0]) output.record_stream(main_stream) else: raise Exception(f'Unknown type {type(output)}.') def get_input_device(input): if isinstance(input, list): for item in input: input_device = get_input_device(item) if input_device != -1: return input_device return -1 elif isinstance(input, torch.Tensor): return input.get_device() if input.is_cuda else -1 else: raise Exception(f'Unknown type {type(input)}.') class Scatter: @staticmethod def forward(target_gpus, input): input_device = get_input_device(input) streams = None if input_device == -1 and target_gpus != [-1]: # Perform CPU to GPU copies in a background stream streams = [_get_stream(device) for device in target_gpus] outputs = scatter(input, target_gpus, streams) # Synchronize with the copy stream if streams is not None: synchronize_stream(outputs, target_gpus, streams) return tuple(outputs) ================================================ FILE: annotator/mmpkg/mmcv/parallel/collate.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from collections.abc import Mapping, Sequence import torch import torch.nn.functional as F from torch.utils.data.dataloader import default_collate from .data_container import DataContainer def collate(batch, samples_per_gpu=1): """Puts each data field into a tensor/DataContainer with outer dimension batch size. Extend default_collate to add support for :type:`~mmcv.parallel.DataContainer`. There are 3 cases. 1. cpu_only = True, e.g., meta data 2. cpu_only = False, stack = True, e.g., images tensors 3. cpu_only = False, stack = False, e.g., gt bboxes """ if not isinstance(batch, Sequence): raise TypeError(f'{batch.dtype} is not supported.') if isinstance(batch[0], DataContainer): stacked = [] if batch[0].cpu_only: for i in range(0, len(batch), samples_per_gpu): stacked.append( [sample.data for sample in batch[i:i + samples_per_gpu]]) return DataContainer( stacked, batch[0].stack, batch[0].padding_value, cpu_only=True) elif batch[0].stack: for i in range(0, len(batch), samples_per_gpu): assert isinstance(batch[i].data, torch.Tensor) if batch[i].pad_dims is not None: ndim = batch[i].dim() assert ndim > batch[i].pad_dims max_shape = [0 for _ in range(batch[i].pad_dims)] for dim in range(1, batch[i].pad_dims + 1): max_shape[dim - 1] = batch[i].size(-dim) for sample in batch[i:i + samples_per_gpu]: for dim in range(0, ndim - batch[i].pad_dims): assert batch[i].size(dim) == sample.size(dim) for dim in range(1, batch[i].pad_dims + 1): max_shape[dim - 1] = max(max_shape[dim - 1], sample.size(-dim)) padded_samples = [] for sample in batch[i:i + samples_per_gpu]: pad = [0 for _ in range(batch[i].pad_dims * 2)] for dim in range(1, batch[i].pad_dims + 1): pad[2 * dim - 1] = max_shape[dim - 1] - sample.size(-dim) padded_samples.append( F.pad( sample.data, pad, value=sample.padding_value)) stacked.append(default_collate(padded_samples)) elif batch[i].pad_dims is None: stacked.append( default_collate([ sample.data for sample in batch[i:i + samples_per_gpu] ])) else: raise ValueError( 'pad_dims should be either None or integers (1-3)') else: for i in range(0, len(batch), samples_per_gpu): stacked.append( [sample.data for sample in batch[i:i + samples_per_gpu]]) return DataContainer(stacked, batch[0].stack, batch[0].padding_value) elif isinstance(batch[0], Sequence): transposed = zip(*batch) return [collate(samples, samples_per_gpu) for samples in transposed] elif isinstance(batch[0], Mapping): return { key: collate([d[key] for d in batch], samples_per_gpu) for key in batch[0] } else: return default_collate(batch) ================================================ FILE: annotator/mmpkg/mmcv/parallel/data_container.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import functools import torch def assert_tensor_type(func): @functools.wraps(func) def wrapper(*args, **kwargs): if not isinstance(args[0].data, torch.Tensor): raise AttributeError( f'{args[0].__class__.__name__} has no attribute ' f'{func.__name__} for type {args[0].datatype}') return func(*args, **kwargs) return wrapper class DataContainer: """A container for any type of objects. Typically tensors will be stacked in the collate function and sliced along some dimension in the scatter function. This behavior has some limitations. 1. All tensors have to be the same size. 2. Types are limited (numpy array or Tensor). We design `DataContainer` and `MMDataParallel` to overcome these limitations. The behavior can be either of the following. - copy to GPU, pad all tensors to the same size and stack them - copy to GPU without stacking - leave the objects as is and pass it to the model - pad_dims specifies the number of last few dimensions to do padding """ def __init__(self, data, stack=False, padding_value=0, cpu_only=False, pad_dims=2): self._data = data self._cpu_only = cpu_only self._stack = stack self._padding_value = padding_value assert pad_dims in [None, 1, 2, 3] self._pad_dims = pad_dims def __repr__(self): return f'{self.__class__.__name__}({repr(self.data)})' def __len__(self): return len(self._data) @property def data(self): return self._data @property def datatype(self): if isinstance(self.data, torch.Tensor): return self.data.type() else: return type(self.data) @property def cpu_only(self): return self._cpu_only @property def stack(self): return self._stack @property def padding_value(self): return self._padding_value @property def pad_dims(self): return self._pad_dims @assert_tensor_type def size(self, *args, **kwargs): return self.data.size(*args, **kwargs) @assert_tensor_type def dim(self): return self.data.dim() ================================================ FILE: annotator/mmpkg/mmcv/parallel/data_parallel.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from itertools import chain from torch.nn.parallel import DataParallel from .scatter_gather import scatter_kwargs class MMDataParallel(DataParallel): """The DataParallel module that supports DataContainer. MMDataParallel has two main differences with PyTorch DataParallel: - It supports a custom type :class:`DataContainer` which allows more flexible control of input data during both GPU and CPU inference. - It implement two more APIs ``train_step()`` and ``val_step()``. Args: module (:class:`nn.Module`): Module to be encapsulated. device_ids (list[int]): Device IDS of modules to be scattered to. Defaults to None when GPU is not available. output_device (str | int): Device ID for output. Defaults to None. dim (int): Dimension used to scatter the data. Defaults to 0. """ def __init__(self, *args, dim=0, **kwargs): super(MMDataParallel, self).__init__(*args, dim=dim, **kwargs) self.dim = dim def forward(self, *inputs, **kwargs): """Override the original forward function. The main difference lies in the CPU inference where the data in :class:`DataContainers` will still be gathered. """ if not self.device_ids: # We add the following line thus the module could gather and # convert data containers as those in GPU inference inputs, kwargs = self.scatter(inputs, kwargs, [-1]) return self.module(*inputs[0], **kwargs[0]) else: return super().forward(*inputs, **kwargs) def scatter(self, inputs, kwargs, device_ids): return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) def train_step(self, *inputs, **kwargs): if not self.device_ids: # We add the following line thus the module could gather and # convert data containers as those in GPU inference inputs, kwargs = self.scatter(inputs, kwargs, [-1]) return self.module.train_step(*inputs[0], **kwargs[0]) assert len(self.device_ids) == 1, \ ('MMDataParallel only supports single GPU training, if you need to' ' train with multiple GPUs, please use MMDistributedDataParallel' 'instead.') for t in chain(self.module.parameters(), self.module.buffers()): if t.device != self.src_device_obj: raise RuntimeError( 'module must have its parameters and buffers ' f'on device {self.src_device_obj} (device_ids[0]) but ' f'found one of them on device: {t.device}') inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) return self.module.train_step(*inputs[0], **kwargs[0]) def val_step(self, *inputs, **kwargs): if not self.device_ids: # We add the following line thus the module could gather and # convert data containers as those in GPU inference inputs, kwargs = self.scatter(inputs, kwargs, [-1]) return self.module.val_step(*inputs[0], **kwargs[0]) assert len(self.device_ids) == 1, \ ('MMDataParallel only supports single GPU training, if you need to' ' train with multiple GPUs, please use MMDistributedDataParallel' ' instead.') for t in chain(self.module.parameters(), self.module.buffers()): if t.device != self.src_device_obj: raise RuntimeError( 'module must have its parameters and buffers ' f'on device {self.src_device_obj} (device_ids[0]) but ' f'found one of them on device: {t.device}') inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) return self.module.val_step(*inputs[0], **kwargs[0]) ================================================ FILE: annotator/mmpkg/mmcv/parallel/distributed.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch.nn.parallel.distributed import (DistributedDataParallel, _find_tensors) from annotator.mmpkg.mmcv import print_log from annotator.mmpkg.mmcv.utils import TORCH_VERSION, digit_version from .scatter_gather import scatter_kwargs class MMDistributedDataParallel(DistributedDataParallel): """The DDP module that supports DataContainer. MMDDP has two main differences with PyTorch DDP: - It supports a custom type :class:`DataContainer` which allows more flexible control of input data. - It implement two APIs ``train_step()`` and ``val_step()``. """ def to_kwargs(self, inputs, kwargs, device_id): # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8 # to move all tensors to device_id return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim) def scatter(self, inputs, kwargs, device_ids): return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) def train_step(self, *inputs, **kwargs): """train_step() API for module wrapped by DistributedDataParallel. This method is basically the same as ``DistributedDataParallel.forward()``, while replacing ``self.module.forward()`` with ``self.module.train_step()``. It is compatible with PyTorch 1.1 - 1.5. """ # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the # end of backward to the beginning of forward. if ('parrots' not in TORCH_VERSION and digit_version(TORCH_VERSION) >= digit_version('1.7') and self.reducer._rebuild_buckets()): print_log( 'Reducer buckets have been rebuilt in this iteration.', logger='mmcv') if getattr(self, 'require_forward_param_sync', True): self._sync_params() if self.device_ids: inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) if len(self.device_ids) == 1: output = self.module.train_step(*inputs[0], **kwargs[0]) else: outputs = self.parallel_apply( self._module_copies[:len(inputs)], inputs, kwargs) output = self.gather(outputs, self.output_device) else: output = self.module.train_step(*inputs, **kwargs) if torch.is_grad_enabled() and getattr( self, 'require_backward_grad_sync', True): if self.find_unused_parameters: self.reducer.prepare_for_backward(list(_find_tensors(output))) else: self.reducer.prepare_for_backward([]) else: if ('parrots' not in TORCH_VERSION and digit_version(TORCH_VERSION) > digit_version('1.2')): self.require_forward_param_sync = False return output def val_step(self, *inputs, **kwargs): """val_step() API for module wrapped by DistributedDataParallel. This method is basically the same as ``DistributedDataParallel.forward()``, while replacing ``self.module.forward()`` with ``self.module.val_step()``. It is compatible with PyTorch 1.1 - 1.5. """ # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the # end of backward to the beginning of forward. if ('parrots' not in TORCH_VERSION and digit_version(TORCH_VERSION) >= digit_version('1.7') and self.reducer._rebuild_buckets()): print_log( 'Reducer buckets have been rebuilt in this iteration.', logger='mmcv') if getattr(self, 'require_forward_param_sync', True): self._sync_params() if self.device_ids: inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) if len(self.device_ids) == 1: output = self.module.val_step(*inputs[0], **kwargs[0]) else: outputs = self.parallel_apply( self._module_copies[:len(inputs)], inputs, kwargs) output = self.gather(outputs, self.output_device) else: output = self.module.val_step(*inputs, **kwargs) if torch.is_grad_enabled() and getattr( self, 'require_backward_grad_sync', True): if self.find_unused_parameters: self.reducer.prepare_for_backward(list(_find_tensors(output))) else: self.reducer.prepare_for_backward([]) else: if ('parrots' not in TORCH_VERSION and digit_version(TORCH_VERSION) > digit_version('1.2')): self.require_forward_param_sync = False return output ================================================ FILE: annotator/mmpkg/mmcv/parallel/distributed_deprecated.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.distributed as dist import torch.nn as nn from torch._utils import (_flatten_dense_tensors, _take_tensors, _unflatten_dense_tensors) from annotator.mmpkg.mmcv.utils import TORCH_VERSION, digit_version from .registry import MODULE_WRAPPERS from .scatter_gather import scatter_kwargs @MODULE_WRAPPERS.register_module() class MMDistributedDataParallel(nn.Module): def __init__(self, module, dim=0, broadcast_buffers=True, bucket_cap_mb=25): super(MMDistributedDataParallel, self).__init__() self.module = module self.dim = dim self.broadcast_buffers = broadcast_buffers self.broadcast_bucket_size = bucket_cap_mb * 1024 * 1024 self._sync_params() def _dist_broadcast_coalesced(self, tensors, buffer_size): for tensors in _take_tensors(tensors, buffer_size): flat_tensors = _flatten_dense_tensors(tensors) dist.broadcast(flat_tensors, 0) for tensor, synced in zip( tensors, _unflatten_dense_tensors(flat_tensors, tensors)): tensor.copy_(synced) def _sync_params(self): module_states = list(self.module.state_dict().values()) if len(module_states) > 0: self._dist_broadcast_coalesced(module_states, self.broadcast_bucket_size) if self.broadcast_buffers: if (TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) < digit_version('1.0')): buffers = [b.data for b in self.module._all_buffers()] else: buffers = [b.data for b in self.module.buffers()] if len(buffers) > 0: self._dist_broadcast_coalesced(buffers, self.broadcast_bucket_size) def scatter(self, inputs, kwargs, device_ids): return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) def forward(self, *inputs, **kwargs): inputs, kwargs = self.scatter(inputs, kwargs, [torch.cuda.current_device()]) return self.module(*inputs[0], **kwargs[0]) def train_step(self, *inputs, **kwargs): inputs, kwargs = self.scatter(inputs, kwargs, [torch.cuda.current_device()]) output = self.module.train_step(*inputs[0], **kwargs[0]) return output def val_step(self, *inputs, **kwargs): inputs, kwargs = self.scatter(inputs, kwargs, [torch.cuda.current_device()]) output = self.module.val_step(*inputs[0], **kwargs[0]) return output ================================================ FILE: annotator/mmpkg/mmcv/parallel/registry.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from torch.nn.parallel import DataParallel, DistributedDataParallel from annotator.mmpkg.mmcv.utils import Registry MODULE_WRAPPERS = Registry('module wrapper') MODULE_WRAPPERS.register_module(module=DataParallel) MODULE_WRAPPERS.register_module(module=DistributedDataParallel) ================================================ FILE: annotator/mmpkg/mmcv/parallel/scatter_gather.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch.nn.parallel._functions import Scatter as OrigScatter from ._functions import Scatter from .data_container import DataContainer def scatter(inputs, target_gpus, dim=0): """Scatter inputs to target gpus. The only difference from original :func:`scatter` is to add support for :type:`~mmcv.parallel.DataContainer`. """ def scatter_map(obj): if isinstance(obj, torch.Tensor): if target_gpus != [-1]: return OrigScatter.apply(target_gpus, None, dim, obj) else: # for CPU inference we use self-implemented scatter return Scatter.forward(target_gpus, obj) if isinstance(obj, DataContainer): if obj.cpu_only: return obj.data else: return Scatter.forward(target_gpus, obj.data) if isinstance(obj, tuple) and len(obj) > 0: return list(zip(*map(scatter_map, obj))) if isinstance(obj, list) and len(obj) > 0: out = list(map(list, zip(*map(scatter_map, obj)))) return out if isinstance(obj, dict) and len(obj) > 0: out = list(map(type(obj), zip(*map(scatter_map, obj.items())))) return out return [obj for targets in target_gpus] # After scatter_map is called, a scatter_map cell will exist. This cell # has a reference to the actual function scatter_map, which has references # to a closure that has a reference to the scatter_map cell (because the # fn is recursive). To avoid this reference cycle, we set the function to # None, clearing the cell try: return scatter_map(inputs) finally: scatter_map = None def scatter_kwargs(inputs, kwargs, target_gpus, dim=0): """Scatter with support for kwargs dictionary.""" inputs = scatter(inputs, target_gpus, dim) if inputs else [] kwargs = scatter(kwargs, target_gpus, dim) if kwargs else [] if len(inputs) < len(kwargs): inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) elif len(kwargs) < len(inputs): kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) inputs = tuple(inputs) kwargs = tuple(kwargs) return inputs, kwargs ================================================ FILE: annotator/mmpkg/mmcv/parallel/utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .registry import MODULE_WRAPPERS def is_module_wrapper(module): """Check if a module is a module wrapper. The following 3 modules in MMCV (and their subclasses) are regarded as module wrappers: DataParallel, DistributedDataParallel, MMDistributedDataParallel (the deprecated version). You may add you own module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS. Args: module (nn.Module): The module to be checked. Returns: bool: True if the input module is a module wrapper. """ module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values()) return isinstance(module, module_wrappers) ================================================ FILE: annotator/mmpkg/mmcv/runner/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base_module import BaseModule, ModuleList, Sequential from .base_runner import BaseRunner from .builder import RUNNERS, build_runner from .checkpoint import (CheckpointLoader, _load_checkpoint, _load_checkpoint_with_prefix, load_checkpoint, load_state_dict, save_checkpoint, weights_to_cpu) from .default_constructor import DefaultRunnerConstructor from .dist_utils import (allreduce_grads, allreduce_params, get_dist_info, init_dist, master_only) from .epoch_based_runner import EpochBasedRunner, Runner from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model from .hooks import (HOOKS, CheckpointHook, ClosureHook, DistEvalHook, DistSamplerSeedHook, DvcliveLoggerHook, EMAHook, EvalHook, Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook, GradientCumulativeOptimizerHook, Hook, IterTimerHook, LoggerHook, LrUpdaterHook, MlflowLoggerHook, NeptuneLoggerHook, OptimizerHook, PaviLoggerHook, SyncBuffersHook, TensorboardLoggerHook, TextLoggerHook, WandbLoggerHook) from .iter_based_runner import IterBasedRunner, IterLoader from .log_buffer import LogBuffer from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS, DefaultOptimizerConstructor, build_optimizer, build_optimizer_constructor) from .priority import Priority, get_priority from .utils import get_host_info, get_time_str, obj_from_dict, set_random_seed __all__ = [ 'BaseRunner', 'Runner', 'EpochBasedRunner', 'IterBasedRunner', 'LogBuffer', 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook', 'OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', 'LoggerHook', 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook', 'NeptuneLoggerHook', 'WandbLoggerHook', 'MlflowLoggerHook', 'DvcliveLoggerHook', '_load_checkpoint', 'load_state_dict', 'load_checkpoint', 'weights_to_cpu', 'save_checkpoint', 'Priority', 'get_priority', 'get_host_info', 'get_time_str', 'obj_from_dict', 'init_dist', 'get_dist_info', 'master_only', 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', 'build_optimizer', 'build_optimizer_constructor', 'IterLoader', 'set_random_seed', 'auto_fp16', 'force_fp32', 'wrap_fp16_model', 'Fp16OptimizerHook', 'SyncBuffersHook', 'EMAHook', 'build_runner', 'RUNNERS', 'allreduce_grads', 'allreduce_params', 'LossScaler', 'CheckpointLoader', 'BaseModule', '_load_checkpoint_with_prefix', 'EvalHook', 'DistEvalHook', 'Sequential', 'ModuleList', 'GradientCumulativeOptimizerHook', 'GradientCumulativeFp16OptimizerHook', 'DefaultRunnerConstructor' ] ================================================ FILE: annotator/mmpkg/mmcv/runner/base_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import warnings from abc import ABCMeta from collections import defaultdict from logging import FileHandler import torch.nn as nn from annotator.mmpkg.mmcv.runner.dist_utils import master_only from annotator.mmpkg.mmcv.utils.logging import get_logger, logger_initialized, print_log class BaseModule(nn.Module, metaclass=ABCMeta): """Base module for all modules in openmmlab. ``BaseModule`` is a wrapper of ``torch.nn.Module`` with additional functionality of parameter initialization. Compared with ``torch.nn.Module``, ``BaseModule`` mainly adds three attributes. - ``init_cfg``: the config to control the initialization. - ``init_weights``: The function of parameter initialization and recording initialization information. - ``_params_init_info``: Used to track the parameter initialization information. This attribute only exists during executing the ``init_weights``. Args: init_cfg (dict, optional): Initialization config dict. """ def __init__(self, init_cfg=None): """Initialize BaseModule, inherited from `torch.nn.Module`""" # NOTE init_cfg can be defined in different levels, but init_cfg # in low levels has a higher priority. super(BaseModule, self).__init__() # define default value of init_cfg instead of hard code # in init_weights() function self._is_init = False self.init_cfg = copy.deepcopy(init_cfg) # Backward compatibility in derived classes # if pretrained is not None: # warnings.warn('DeprecationWarning: pretrained is a deprecated \ # key, please consider using init_cfg') # self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) @property def is_init(self): return self._is_init def init_weights(self): """Initialize the weights.""" is_top_level_module = False # check if it is top-level module if not hasattr(self, '_params_init_info'): # The `_params_init_info` is used to record the initialization # information of the parameters # the key should be the obj:`nn.Parameter` of model and the value # should be a dict containing # - init_info (str): The string that describes the initialization. # - tmp_mean_value (FloatTensor): The mean of the parameter, # which indicates whether the parameter has been modified. # this attribute would be deleted after all parameters # is initialized. self._params_init_info = defaultdict(dict) is_top_level_module = True # Initialize the `_params_init_info`, # When detecting the `tmp_mean_value` of # the corresponding parameter is changed, update related # initialization information for name, param in self.named_parameters(): self._params_init_info[param][ 'init_info'] = f'The value is the same before and ' \ f'after calling `init_weights` ' \ f'of {self.__class__.__name__} ' self._params_init_info[param][ 'tmp_mean_value'] = param.data.mean() # pass `params_init_info` to all submodules # All submodules share the same `params_init_info`, # so it will be updated when parameters are # modified at any level of the model. for sub_module in self.modules(): sub_module._params_init_info = self._params_init_info # Get the initialized logger, if not exist, # create a logger named `mmcv` logger_names = list(logger_initialized.keys()) logger_name = logger_names[0] if logger_names else 'mmcv' from ..cnn import initialize from ..cnn.utils.weight_init import update_init_info module_name = self.__class__.__name__ if not self._is_init: if self.init_cfg: print_log( f'initialize {module_name} with init_cfg {self.init_cfg}', logger=logger_name) initialize(self, self.init_cfg) if isinstance(self.init_cfg, dict): # prevent the parameters of # the pre-trained model # from being overwritten by # the `init_weights` if self.init_cfg['type'] == 'Pretrained': return for m in self.children(): if hasattr(m, 'init_weights'): m.init_weights() # users may overload the `init_weights` update_init_info( m, init_info=f'Initialized by ' f'user-defined `init_weights`' f' in {m.__class__.__name__} ') self._is_init = True else: warnings.warn(f'init_weights of {self.__class__.__name__} has ' f'been called more than once.') if is_top_level_module: self._dump_init_info(logger_name) for sub_module in self.modules(): del sub_module._params_init_info @master_only def _dump_init_info(self, logger_name): """Dump the initialization information to a file named `initialization.log.json` in workdir. Args: logger_name (str): The name of logger. """ logger = get_logger(logger_name) with_file_handler = False # dump the information to the logger file if there is a `FileHandler` for handler in logger.handlers: if isinstance(handler, FileHandler): handler.stream.write( 'Name of parameter - Initialization information\n') for name, param in self.named_parameters(): handler.stream.write( f'\n{name} - {param.shape}: ' f"\n{self._params_init_info[param]['init_info']} \n") handler.stream.flush() with_file_handler = True if not with_file_handler: for name, param in self.named_parameters(): print_log( f'\n{name} - {param.shape}: ' f"\n{self._params_init_info[param]['init_info']} \n ", logger=logger_name) def __repr__(self): s = super().__repr__() if self.init_cfg: s += f'\ninit_cfg={self.init_cfg}' return s class Sequential(BaseModule, nn.Sequential): """Sequential module in openmmlab. Args: init_cfg (dict, optional): Initialization config dict. """ def __init__(self, *args, init_cfg=None): BaseModule.__init__(self, init_cfg) nn.Sequential.__init__(self, *args) class ModuleList(BaseModule, nn.ModuleList): """ModuleList in openmmlab. Args: modules (iterable, optional): an iterable of modules to add. init_cfg (dict, optional): Initialization config dict. """ def __init__(self, modules=None, init_cfg=None): BaseModule.__init__(self, init_cfg) nn.ModuleList.__init__(self, modules) ================================================ FILE: annotator/mmpkg/mmcv/runner/base_runner.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import logging import os.path as osp import warnings from abc import ABCMeta, abstractmethod import torch from torch.optim import Optimizer import annotator.mmpkg.mmcv as mmcv from ..parallel import is_module_wrapper from .checkpoint import load_checkpoint from .dist_utils import get_dist_info from .hooks import HOOKS, Hook from .log_buffer import LogBuffer from .priority import Priority, get_priority from .utils import get_time_str class BaseRunner(metaclass=ABCMeta): """The base class of Runner, a training helper for PyTorch. All subclasses should implement the following APIs: - ``run()`` - ``train()`` - ``val()`` - ``save_checkpoint()`` Args: model (:obj:`torch.nn.Module`): The model to be run. batch_processor (callable): A callable method that process a data batch. The interface of this method should be `batch_processor(model, data, train_mode) -> dict` optimizer (dict or :obj:`torch.optim.Optimizer`): It can be either an optimizer (in most cases) or a dict of optimizers (in models that requires more than one optimizer, e.g., GAN). work_dir (str, optional): The working directory to save checkpoints and logs. Defaults to None. logger (:obj:`logging.Logger`): Logger used during training. Defaults to None. (The default value is just for backward compatibility) meta (dict | None): A dict records some import information such as environment info and seed, which will be logged in logger hook. Defaults to None. max_epochs (int, optional): Total training epochs. max_iters (int, optional): Total training iterations. """ def __init__(self, model, batch_processor=None, optimizer=None, work_dir=None, logger=None, meta=None, max_iters=None, max_epochs=None): if batch_processor is not None: if not callable(batch_processor): raise TypeError('batch_processor must be callable, ' f'but got {type(batch_processor)}') warnings.warn('batch_processor is deprecated, please implement ' 'train_step() and val_step() in the model instead.') # raise an error is `batch_processor` is not None and # `model.train_step()` exists. if is_module_wrapper(model): _model = model.module else: _model = model if hasattr(_model, 'train_step') or hasattr(_model, 'val_step'): raise RuntimeError( 'batch_processor and model.train_step()/model.val_step() ' 'cannot be both available.') else: assert hasattr(model, 'train_step') # check the type of `optimizer` if isinstance(optimizer, dict): for name, optim in optimizer.items(): if not isinstance(optim, Optimizer): raise TypeError( f'optimizer must be a dict of torch.optim.Optimizers, ' f'but optimizer["{name}"] is a {type(optim)}') elif not isinstance(optimizer, Optimizer) and optimizer is not None: raise TypeError( f'optimizer must be a torch.optim.Optimizer object ' f'or dict or None, but got {type(optimizer)}') # check the type of `logger` if not isinstance(logger, logging.Logger): raise TypeError(f'logger must be a logging.Logger object, ' f'but got {type(logger)}') # check the type of `meta` if meta is not None and not isinstance(meta, dict): raise TypeError( f'meta must be a dict or None, but got {type(meta)}') self.model = model self.batch_processor = batch_processor self.optimizer = optimizer self.logger = logger self.meta = meta # create work_dir if mmcv.is_str(work_dir): self.work_dir = osp.abspath(work_dir) mmcv.mkdir_or_exist(self.work_dir) elif work_dir is None: self.work_dir = None else: raise TypeError('"work_dir" must be a str or None') # get model name from the model class if hasattr(self.model, 'module'): self._model_name = self.model.module.__class__.__name__ else: self._model_name = self.model.__class__.__name__ self._rank, self._world_size = get_dist_info() self.timestamp = get_time_str() self.mode = None self._hooks = [] self._epoch = 0 self._iter = 0 self._inner_iter = 0 if max_epochs is not None and max_iters is not None: raise ValueError( 'Only one of `max_epochs` or `max_iters` can be set.') self._max_epochs = max_epochs self._max_iters = max_iters # TODO: Redesign LogBuffer, it is not flexible and elegant enough self.log_buffer = LogBuffer() @property def model_name(self): """str: Name of the model, usually the module class name.""" return self._model_name @property def rank(self): """int: Rank of current process. (distributed training)""" return self._rank @property def world_size(self): """int: Number of processes participating in the job. (distributed training)""" return self._world_size @property def hooks(self): """list[:obj:`Hook`]: A list of registered hooks.""" return self._hooks @property def epoch(self): """int: Current epoch.""" return self._epoch @property def iter(self): """int: Current iteration.""" return self._iter @property def inner_iter(self): """int: Iteration in an epoch.""" return self._inner_iter @property def max_epochs(self): """int: Maximum training epochs.""" return self._max_epochs @property def max_iters(self): """int: Maximum training iterations.""" return self._max_iters @abstractmethod def train(self): pass @abstractmethod def val(self): pass @abstractmethod def run(self, data_loaders, workflow, **kwargs): pass @abstractmethod def save_checkpoint(self, out_dir, filename_tmpl, save_optimizer=True, meta=None, create_symlink=True): pass def current_lr(self): """Get current learning rates. Returns: list[float] | dict[str, list[float]]: Current learning rates of all param groups. If the runner has a dict of optimizers, this method will return a dict. """ if isinstance(self.optimizer, torch.optim.Optimizer): lr = [group['lr'] for group in self.optimizer.param_groups] elif isinstance(self.optimizer, dict): lr = dict() for name, optim in self.optimizer.items(): lr[name] = [group['lr'] for group in optim.param_groups] else: raise RuntimeError( 'lr is not applicable because optimizer does not exist.') return lr def current_momentum(self): """Get current momentums. Returns: list[float] | dict[str, list[float]]: Current momentums of all param groups. If the runner has a dict of optimizers, this method will return a dict. """ def _get_momentum(optimizer): momentums = [] for group in optimizer.param_groups: if 'momentum' in group.keys(): momentums.append(group['momentum']) elif 'betas' in group.keys(): momentums.append(group['betas'][0]) else: momentums.append(0) return momentums if self.optimizer is None: raise RuntimeError( 'momentum is not applicable because optimizer does not exist.') elif isinstance(self.optimizer, torch.optim.Optimizer): momentums = _get_momentum(self.optimizer) elif isinstance(self.optimizer, dict): momentums = dict() for name, optim in self.optimizer.items(): momentums[name] = _get_momentum(optim) return momentums def register_hook(self, hook, priority='NORMAL'): """Register a hook into the hook list. The hook will be inserted into a priority queue, with the specified priority (See :class:`Priority` for details of priorities). For hooks with the same priority, they will be triggered in the same order as they are registered. Args: hook (:obj:`Hook`): The hook to be registered. priority (int or str or :obj:`Priority`): Hook priority. Lower value means higher priority. """ assert isinstance(hook, Hook) if hasattr(hook, 'priority'): raise ValueError('"priority" is a reserved attribute for hooks') priority = get_priority(priority) hook.priority = priority # insert the hook to a sorted list inserted = False for i in range(len(self._hooks) - 1, -1, -1): if priority >= self._hooks[i].priority: self._hooks.insert(i + 1, hook) inserted = True break if not inserted: self._hooks.insert(0, hook) def register_hook_from_cfg(self, hook_cfg): """Register a hook from its cfg. Args: hook_cfg (dict): Hook config. It should have at least keys 'type' and 'priority' indicating its type and priority. Notes: The specific hook class to register should not use 'type' and 'priority' arguments during initialization. """ hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = mmcv.build_from_cfg(hook_cfg, HOOKS) self.register_hook(hook, priority=priority) def call_hook(self, fn_name): """Call all hooks. Args: fn_name (str): The function name in each hook to be called, such as "before_train_epoch". """ for hook in self._hooks: getattr(hook, fn_name)(self) def get_hook_info(self): # Get hooks info in each stage stage_hook_map = {stage: [] for stage in Hook.stages} for hook in self.hooks: try: priority = Priority(hook.priority).name except ValueError: priority = hook.priority classname = hook.__class__.__name__ hook_info = f'({priority:<12}) {classname:<35}' for trigger_stage in hook.get_triggered_stages(): stage_hook_map[trigger_stage].append(hook_info) stage_hook_infos = [] for stage in Hook.stages: hook_infos = stage_hook_map[stage] if len(hook_infos) > 0: info = f'{stage}:\n' info += '\n'.join(hook_infos) info += '\n -------------------- ' stage_hook_infos.append(info) return '\n'.join(stage_hook_infos) def load_checkpoint(self, filename, map_location='cpu', strict=False, revise_keys=[(r'^module.', '')]): return load_checkpoint( self.model, filename, map_location, strict, self.logger, revise_keys=revise_keys) def resume(self, checkpoint, resume_optimizer=True, map_location='default'): if map_location == 'default': if torch.cuda.is_available(): device_id = torch.cuda.current_device() checkpoint = self.load_checkpoint( checkpoint, map_location=lambda storage, loc: storage.cuda(device_id)) else: checkpoint = self.load_checkpoint(checkpoint) else: checkpoint = self.load_checkpoint( checkpoint, map_location=map_location) self._epoch = checkpoint['meta']['epoch'] self._iter = checkpoint['meta']['iter'] if self.meta is None: self.meta = {} self.meta.setdefault('hook_msgs', {}) # load `last_ckpt`, `best_score`, `best_ckpt`, etc. for hook messages self.meta['hook_msgs'].update(checkpoint['meta'].get('hook_msgs', {})) # Re-calculate the number of iterations when resuming # models with different number of GPUs if 'config' in checkpoint['meta']: config = mmcv.Config.fromstring( checkpoint['meta']['config'], file_format='.py') previous_gpu_ids = config.get('gpu_ids', None) if previous_gpu_ids and len(previous_gpu_ids) > 0 and len( previous_gpu_ids) != self.world_size: self._iter = int(self._iter * len(previous_gpu_ids) / self.world_size) self.logger.info('the iteration number is changed due to ' 'change of GPU number') # resume meta information meta self.meta = checkpoint['meta'] if 'optimizer' in checkpoint and resume_optimizer: if isinstance(self.optimizer, Optimizer): self.optimizer.load_state_dict(checkpoint['optimizer']) elif isinstance(self.optimizer, dict): for k in self.optimizer.keys(): self.optimizer[k].load_state_dict( checkpoint['optimizer'][k]) else: raise TypeError( 'Optimizer should be dict or torch.optim.Optimizer ' f'but got {type(self.optimizer)}') self.logger.info('resumed epoch %d, iter %d', self.epoch, self.iter) def register_lr_hook(self, lr_config): if lr_config is None: return elif isinstance(lr_config, dict): assert 'policy' in lr_config policy_type = lr_config.pop('policy') # If the type of policy is all in lower case, e.g., 'cyclic', # then its first letter will be capitalized, e.g., to be 'Cyclic'. # This is for the convenient usage of Lr updater. # Since this is not applicable for ` # CosineAnnealingLrUpdater`, # the string will not be changed if it contains capital letters. if policy_type == policy_type.lower(): policy_type = policy_type.title() hook_type = policy_type + 'LrUpdaterHook' lr_config['type'] = hook_type hook = mmcv.build_from_cfg(lr_config, HOOKS) else: hook = lr_config self.register_hook(hook, priority='VERY_HIGH') def register_momentum_hook(self, momentum_config): if momentum_config is None: return if isinstance(momentum_config, dict): assert 'policy' in momentum_config policy_type = momentum_config.pop('policy') # If the type of policy is all in lower case, e.g., 'cyclic', # then its first letter will be capitalized, e.g., to be 'Cyclic'. # This is for the convenient usage of momentum updater. # Since this is not applicable for # `CosineAnnealingMomentumUpdater`, # the string will not be changed if it contains capital letters. if policy_type == policy_type.lower(): policy_type = policy_type.title() hook_type = policy_type + 'MomentumUpdaterHook' momentum_config['type'] = hook_type hook = mmcv.build_from_cfg(momentum_config, HOOKS) else: hook = momentum_config self.register_hook(hook, priority='HIGH') def register_optimizer_hook(self, optimizer_config): if optimizer_config is None: return if isinstance(optimizer_config, dict): optimizer_config.setdefault('type', 'OptimizerHook') hook = mmcv.build_from_cfg(optimizer_config, HOOKS) else: hook = optimizer_config self.register_hook(hook, priority='ABOVE_NORMAL') def register_checkpoint_hook(self, checkpoint_config): if checkpoint_config is None: return if isinstance(checkpoint_config, dict): checkpoint_config.setdefault('type', 'CheckpointHook') hook = mmcv.build_from_cfg(checkpoint_config, HOOKS) else: hook = checkpoint_config self.register_hook(hook, priority='NORMAL') def register_logger_hooks(self, log_config): if log_config is None: return log_interval = log_config['interval'] for info in log_config['hooks']: logger_hook = mmcv.build_from_cfg( info, HOOKS, default_args=dict(interval=log_interval)) self.register_hook(logger_hook, priority='VERY_LOW') def register_timer_hook(self, timer_config): if timer_config is None: return if isinstance(timer_config, dict): timer_config_ = copy.deepcopy(timer_config) hook = mmcv.build_from_cfg(timer_config_, HOOKS) else: hook = timer_config self.register_hook(hook, priority='LOW') def register_custom_hooks(self, custom_config): if custom_config is None: return if not isinstance(custom_config, list): custom_config = [custom_config] for item in custom_config: if isinstance(item, dict): self.register_hook_from_cfg(item) else: self.register_hook(item, priority='NORMAL') def register_profiler_hook(self, profiler_config): if profiler_config is None: return if isinstance(profiler_config, dict): profiler_config.setdefault('type', 'ProfilerHook') hook = mmcv.build_from_cfg(profiler_config, HOOKS) else: hook = profiler_config self.register_hook(hook) def register_training_hooks(self, lr_config, optimizer_config=None, checkpoint_config=None, log_config=None, momentum_config=None, timer_config=dict(type='IterTimerHook'), custom_hooks_config=None): """Register default and custom hooks for training. Default and custom hooks include: +----------------------+-------------------------+ | Hooks | Priority | +======================+=========================+ | LrUpdaterHook | VERY_HIGH (10) | +----------------------+-------------------------+ | MomentumUpdaterHook | HIGH (30) | +----------------------+-------------------------+ | OptimizerStepperHook | ABOVE_NORMAL (40) | +----------------------+-------------------------+ | CheckpointSaverHook | NORMAL (50) | +----------------------+-------------------------+ | IterTimerHook | LOW (70) | +----------------------+-------------------------+ | LoggerHook(s) | VERY_LOW (90) | +----------------------+-------------------------+ | CustomHook(s) | defaults to NORMAL (50) | +----------------------+-------------------------+ If custom hooks have same priority with default hooks, custom hooks will be triggered after default hooks. """ self.register_lr_hook(lr_config) self.register_momentum_hook(momentum_config) self.register_optimizer_hook(optimizer_config) self.register_checkpoint_hook(checkpoint_config) self.register_timer_hook(timer_config) self.register_logger_hooks(log_config) self.register_custom_hooks(custom_hooks_config) ================================================ FILE: annotator/mmpkg/mmcv/runner/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy from ..utils import Registry RUNNERS = Registry('runner') RUNNER_BUILDERS = Registry('runner builder') def build_runner_constructor(cfg): return RUNNER_BUILDERS.build(cfg) def build_runner(cfg, default_args=None): runner_cfg = copy.deepcopy(cfg) constructor_type = runner_cfg.pop('constructor', 'DefaultRunnerConstructor') runner_constructor = build_runner_constructor( dict( type=constructor_type, runner_cfg=runner_cfg, default_args=default_args)) runner = runner_constructor() return runner ================================================ FILE: annotator/mmpkg/mmcv/runner/checkpoint.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import io import os import os.path as osp import pkgutil import re import time import warnings from collections import OrderedDict from importlib import import_module from tempfile import TemporaryDirectory import torch import torchvision from torch.optim import Optimizer from torch.utils import model_zoo import annotator.mmpkg.mmcv as mmcv from ..fileio import FileClient from ..fileio import load as load_file from ..parallel import is_module_wrapper from ..utils import mkdir_or_exist from .dist_utils import get_dist_info ENV_MMCV_HOME = 'MMCV_HOME' ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' DEFAULT_CACHE_DIR = '~/.cache' def _get_mmcv_home(): mmcv_home = os.path.expanduser( os.getenv( ENV_MMCV_HOME, os.path.join( os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) mkdir_or_exist(mmcv_home) return mmcv_home def load_state_dict(module, state_dict, strict=False, logger=None): """Load state_dict to a module. This method is modified from :meth:`torch.nn.Module.load_state_dict`. Default value for ``strict`` is set to ``False`` and the message for param mismatch will be shown even if strict is False. Args: module (Module): Module that receives the state_dict. state_dict (OrderedDict): Weights. strict (bool): whether to strictly enforce that the keys in :attr:`state_dict` match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. logger (:obj:`logging.Logger`, optional): Logger to log the error message. If not specified, print function will be used. """ unexpected_keys = [] all_missing_keys = [] err_msg = [] metadata = getattr(state_dict, '_metadata', None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata # use _load_from_state_dict to enable checkpoint version control def load(module, prefix=''): # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) if is_module_wrapper(module): module = module.module local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, all_missing_keys, unexpected_keys, err_msg) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') load(module) load = None # break load->load reference cycle # ignore "num_batches_tracked" of BN layers missing_keys = [ key for key in all_missing_keys if 'num_batches_tracked' not in key ] if unexpected_keys: err_msg.append('unexpected key in source ' f'state_dict: {", ".join(unexpected_keys)}\n') if missing_keys: err_msg.append( f'missing keys in source state_dict: {", ".join(missing_keys)}\n') rank, _ = get_dist_info() if len(err_msg) > 0 and rank == 0: err_msg.insert( 0, 'The model and loaded state dict do not match exactly\n') err_msg = '\n'.join(err_msg) if strict: raise RuntimeError(err_msg) elif logger is not None: logger.warning(err_msg) else: print(err_msg) def get_torchvision_models(): model_urls = dict() for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): if ispkg: continue _zoo = import_module(f'torchvision.models.{name}') if hasattr(_zoo, 'model_urls'): _urls = getattr(_zoo, 'model_urls') model_urls.update(_urls) return model_urls def get_external_models(): mmcv_home = _get_mmcv_home() default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') default_urls = load_file(default_json_path) assert isinstance(default_urls, dict) external_json_path = osp.join(mmcv_home, 'open_mmlab.json') if osp.exists(external_json_path): external_urls = load_file(external_json_path) assert isinstance(external_urls, dict) default_urls.update(external_urls) return default_urls def get_mmcls_models(): mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') mmcls_urls = load_file(mmcls_json_path) return mmcls_urls def get_deprecated_model_names(): deprecate_json_path = osp.join(mmcv.__path__[0], 'model_zoo/deprecated.json') deprecate_urls = load_file(deprecate_json_path) assert isinstance(deprecate_urls, dict) return deprecate_urls def _process_mmcls_checkpoint(checkpoint): state_dict = checkpoint['state_dict'] new_state_dict = OrderedDict() for k, v in state_dict.items(): if k.startswith('backbone.'): new_state_dict[k[9:]] = v new_checkpoint = dict(state_dict=new_state_dict) return new_checkpoint class CheckpointLoader: """A general checkpoint loader to manage all schemes.""" _schemes = {} @classmethod def _register_scheme(cls, prefixes, loader, force=False): if isinstance(prefixes, str): prefixes = [prefixes] else: assert isinstance(prefixes, (list, tuple)) for prefix in prefixes: if (prefix not in cls._schemes) or force: cls._schemes[prefix] = loader else: raise KeyError( f'{prefix} is already registered as a loader backend, ' 'add "force=True" if you want to override it') # sort, longer prefixes take priority cls._schemes = OrderedDict( sorted(cls._schemes.items(), key=lambda t: t[0], reverse=True)) @classmethod def register_scheme(cls, prefixes, loader=None, force=False): """Register a loader to CheckpointLoader. This method can be used as a normal class method or a decorator. Args: prefixes (str or list[str] or tuple[str]): The prefix of the registered loader. loader (function, optional): The loader function to be registered. When this method is used as a decorator, loader is None. Defaults to None. force (bool, optional): Whether to override the loader if the prefix has already been registered. Defaults to False. """ if loader is not None: cls._register_scheme(prefixes, loader, force=force) return def _register(loader_cls): cls._register_scheme(prefixes, loader_cls, force=force) return loader_cls return _register @classmethod def _get_checkpoint_loader(cls, path): """Finds a loader that supports the given path. Falls back to the local loader if no other loader is found. Args: path (str): checkpoint path Returns: loader (function): checkpoint loader """ for p in cls._schemes: if path.startswith(p): return cls._schemes[p] @classmethod def load_checkpoint(cls, filename, map_location=None, logger=None): """load checkpoint through URL scheme path. Args: filename (str): checkpoint file name with given prefix map_location (str, optional): Same as :func:`torch.load`. Default: None logger (:mod:`logging.Logger`, optional): The logger for message. Default: None Returns: dict or OrderedDict: The loaded checkpoint. """ checkpoint_loader = cls._get_checkpoint_loader(filename) class_name = checkpoint_loader.__name__ mmcv.print_log( f'load checkpoint from {class_name[10:]} path: {filename}', logger) return checkpoint_loader(filename, map_location) @CheckpointLoader.register_scheme(prefixes='') def load_from_local(filename, map_location): """load checkpoint by local file path. Args: filename (str): local checkpoint file path map_location (str, optional): Same as :func:`torch.load`. Returns: dict or OrderedDict: The loaded checkpoint. """ if not osp.isfile(filename): raise IOError(f'{filename} is not a checkpoint file') checkpoint = torch.load(filename, map_location=map_location) return checkpoint @CheckpointLoader.register_scheme(prefixes=('http://', 'https://')) def load_from_http(filename, map_location=None, model_dir=None): """load checkpoint through HTTP or HTTPS scheme path. In distributed setting, this function only download checkpoint at local rank 0. Args: filename (str): checkpoint file path with modelzoo or torchvision prefix map_location (str, optional): Same as :func:`torch.load`. model_dir (string, optional): directory in which to save the object, Default: None Returns: dict or OrderedDict: The loaded checkpoint. """ rank, world_size = get_dist_info() rank = int(os.environ.get('LOCAL_RANK', rank)) if rank == 0: checkpoint = model_zoo.load_url( filename, model_dir=model_dir, map_location=map_location) if world_size > 1: torch.distributed.barrier() if rank > 0: checkpoint = model_zoo.load_url( filename, model_dir=model_dir, map_location=map_location) return checkpoint @CheckpointLoader.register_scheme(prefixes='pavi://') def load_from_pavi(filename, map_location=None): """load checkpoint through the file path prefixed with pavi. In distributed setting, this function download ckpt at all ranks to different temporary directories. Args: filename (str): checkpoint file path with pavi prefix map_location (str, optional): Same as :func:`torch.load`. Default: None Returns: dict or OrderedDict: The loaded checkpoint. """ assert filename.startswith('pavi://'), \ f'Expected filename startswith `pavi://`, but get {filename}' model_path = filename[7:] try: from pavi import modelcloud except ImportError: raise ImportError( 'Please install pavi to load checkpoint from modelcloud.') model = modelcloud.get(model_path) with TemporaryDirectory() as tmp_dir: downloaded_file = osp.join(tmp_dir, model.name) model.download(downloaded_file) checkpoint = torch.load(downloaded_file, map_location=map_location) return checkpoint @CheckpointLoader.register_scheme(prefixes='s3://') def load_from_ceph(filename, map_location=None, backend='petrel'): """load checkpoint through the file path prefixed with s3. In distributed setting, this function download ckpt at all ranks to different temporary directories. Args: filename (str): checkpoint file path with s3 prefix map_location (str, optional): Same as :func:`torch.load`. backend (str, optional): The storage backend type. Options are 'ceph', 'petrel'. Default: 'petrel'. .. warning:: :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. Returns: dict or OrderedDict: The loaded checkpoint. """ allowed_backends = ['ceph', 'petrel'] if backend not in allowed_backends: raise ValueError(f'Load from Backend {backend} is not supported.') if backend == 'ceph': warnings.warn( 'CephBackend will be deprecated, please use PetrelBackend instead') # CephClient and PetrelBackend have the same prefix 's3://' and the latter # will be chosen as default. If PetrelBackend can not be instantiated # successfully, the CephClient will be chosen. try: file_client = FileClient(backend=backend) except ImportError: allowed_backends.remove(backend) file_client = FileClient(backend=allowed_backends[0]) with io.BytesIO(file_client.get(filename)) as buffer: checkpoint = torch.load(buffer, map_location=map_location) return checkpoint @CheckpointLoader.register_scheme(prefixes=('modelzoo://', 'torchvision://')) def load_from_torchvision(filename, map_location=None): """load checkpoint through the file path prefixed with modelzoo or torchvision. Args: filename (str): checkpoint file path with modelzoo or torchvision prefix map_location (str, optional): Same as :func:`torch.load`. Returns: dict or OrderedDict: The loaded checkpoint. """ model_urls = get_torchvision_models() if filename.startswith('modelzoo://'): warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' 'use "torchvision://" instead') model_name = filename[11:] else: model_name = filename[14:] return load_from_http(model_urls[model_name], map_location=map_location) @CheckpointLoader.register_scheme(prefixes=('open-mmlab://', 'openmmlab://')) def load_from_openmmlab(filename, map_location=None): """load checkpoint through the file path prefixed with open-mmlab or openmmlab. Args: filename (str): checkpoint file path with open-mmlab or openmmlab prefix map_location (str, optional): Same as :func:`torch.load`. Default: None Returns: dict or OrderedDict: The loaded checkpoint. """ model_urls = get_external_models() prefix_str = 'open-mmlab://' if filename.startswith(prefix_str): model_name = filename[13:] else: model_name = filename[12:] prefix_str = 'openmmlab://' deprecated_urls = get_deprecated_model_names() if model_name in deprecated_urls: warnings.warn(f'{prefix_str}{model_name} is deprecated in favor ' f'of {prefix_str}{deprecated_urls[model_name]}') model_name = deprecated_urls[model_name] model_url = model_urls[model_name] # check if is url if model_url.startswith(('http://', 'https://')): checkpoint = load_from_http(model_url, map_location=map_location) else: filename = osp.join(_get_mmcv_home(), model_url) if not osp.isfile(filename): raise IOError(f'{filename} is not a checkpoint file') checkpoint = torch.load(filename, map_location=map_location) return checkpoint @CheckpointLoader.register_scheme(prefixes='mmcls://') def load_from_mmcls(filename, map_location=None): """load checkpoint through the file path prefixed with mmcls. Args: filename (str): checkpoint file path with mmcls prefix map_location (str, optional): Same as :func:`torch.load`. Returns: dict or OrderedDict: The loaded checkpoint. """ model_urls = get_mmcls_models() model_name = filename[8:] checkpoint = load_from_http( model_urls[model_name], map_location=map_location) checkpoint = _process_mmcls_checkpoint(checkpoint) return checkpoint def _load_checkpoint(filename, map_location=None, logger=None): """Load checkpoint from somewhere (modelzoo, file, url). Args: filename (str): Accept local filepath, URL, ``torchvision://xxx``, ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for details. map_location (str, optional): Same as :func:`torch.load`. Default: None. logger (:mod:`logging.Logger`, optional): The logger for error message. Default: None Returns: dict or OrderedDict: The loaded checkpoint. It can be either an OrderedDict storing model weights or a dict containing other information, which depends on the checkpoint. """ return CheckpointLoader.load_checkpoint(filename, map_location, logger) def _load_checkpoint_with_prefix(prefix, filename, map_location=None): """Load partial pretrained model with specific prefix. Args: prefix (str): The prefix of sub-module. filename (str): Accept local filepath, URL, ``torchvision://xxx``, ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for details. map_location (str | None): Same as :func:`torch.load`. Default: None. Returns: dict or OrderedDict: The loaded checkpoint. """ checkpoint = _load_checkpoint(filename, map_location=map_location) if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] else: state_dict = checkpoint if not prefix.endswith('.'): prefix += '.' prefix_len = len(prefix) state_dict = { k[prefix_len:]: v for k, v in state_dict.items() if k.startswith(prefix) } assert state_dict, f'{prefix} is not in the pretrained model' return state_dict def load_checkpoint(model, filename, map_location=None, strict=False, logger=None, revise_keys=[(r'^module\.', '')]): """Load checkpoint from a file or URI. Args: model (Module): Module to load checkpoint. filename (str): Accept local filepath, URL, ``torchvision://xxx``, ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for details. map_location (str): Same as :func:`torch.load`. strict (bool): Whether to allow different params for the model and checkpoint. logger (:mod:`logging.Logger` or None): The logger for error message. revise_keys (list): A list of customized keywords to modify the state_dict in checkpoint. Each item is a (pattern, replacement) pair of the regular expression operations. Default: strip the prefix 'module.' by [(r'^module\\.', '')]. Returns: dict or OrderedDict: The loaded checkpoint. """ checkpoint = _load_checkpoint(filename, map_location, logger) # OrderedDict is a subclass of dict if not isinstance(checkpoint, dict): raise RuntimeError( f'No state_dict found in checkpoint file {filename}') # get state_dict from checkpoint if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] else: state_dict = checkpoint # strip prefix of state_dict metadata = getattr(state_dict, '_metadata', OrderedDict()) for p, r in revise_keys: state_dict = OrderedDict( {re.sub(p, r, k): v for k, v in state_dict.items()}) # Keep metadata in state_dict state_dict._metadata = metadata # load state_dict load_state_dict(model, state_dict, strict, logger) return checkpoint def weights_to_cpu(state_dict): """Copy a model state_dict to cpu. Args: state_dict (OrderedDict): Model weights on GPU. Returns: OrderedDict: Model weights on GPU. """ state_dict_cpu = OrderedDict() for key, val in state_dict.items(): state_dict_cpu[key] = val.cpu() # Keep metadata in state_dict state_dict_cpu._metadata = getattr(state_dict, '_metadata', OrderedDict()) return state_dict_cpu def _save_to_state_dict(module, destination, prefix, keep_vars): """Saves module state to `destination` dictionary. This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. Args: module (nn.Module): The module to generate state_dict. destination (dict): A dict where state will be stored. prefix (str): The prefix for parameters and buffers used in this module. """ for name, param in module._parameters.items(): if param is not None: destination[prefix + name] = param if keep_vars else param.detach() for name, buf in module._buffers.items(): # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d if buf is not None: destination[prefix + name] = buf if keep_vars else buf.detach() def get_state_dict(module, destination=None, prefix='', keep_vars=False): """Returns a dictionary containing a whole state of the module. Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names. This method is modified from :meth:`torch.nn.Module.state_dict` to recursively check parallel module in case that the model has a complicated structure, e.g., nn.Module(nn.Module(DDP)). Args: module (nn.Module): The module to generate state_dict. destination (OrderedDict): Returned dict for the state of the module. prefix (str): Prefix of the key. keep_vars (bool): Whether to keep the variable property of the parameters. Default: False. Returns: dict: A dictionary containing a whole state of the module. """ # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) if is_module_wrapper(module): module = module.module # below is the same as torch.nn.Module.state_dict() if destination is None: destination = OrderedDict() destination._metadata = OrderedDict() destination._metadata[prefix[:-1]] = local_metadata = dict( version=module._version) _save_to_state_dict(module, destination, prefix, keep_vars) for name, child in module._modules.items(): if child is not None: get_state_dict( child, destination, prefix + name + '.', keep_vars=keep_vars) for hook in module._state_dict_hooks.values(): hook_result = hook(module, destination, prefix, local_metadata) if hook_result is not None: destination = hook_result return destination def save_checkpoint(model, filename, optimizer=None, meta=None, file_client_args=None): """Save checkpoint to file. The checkpoint will have 3 fields: ``meta``, ``state_dict`` and ``optimizer``. By default ``meta`` will contain version and time info. Args: model (Module): Module whose params are to be saved. filename (str): Checkpoint filename. optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. meta (dict, optional): Metadata to be saved in checkpoint. file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. `New in version 1.3.16.` """ if meta is None: meta = {} elif not isinstance(meta, dict): raise TypeError(f'meta must be a dict or None, but got {type(meta)}') meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) if is_module_wrapper(model): model = model.module if hasattr(model, 'CLASSES') and model.CLASSES is not None: # save class name to the meta meta.update(CLASSES=model.CLASSES) checkpoint = { 'meta': meta, 'state_dict': weights_to_cpu(get_state_dict(model)) } # save optimizer state dict in the checkpoint if isinstance(optimizer, Optimizer): checkpoint['optimizer'] = optimizer.state_dict() elif isinstance(optimizer, dict): checkpoint['optimizer'] = {} for name, optim in optimizer.items(): checkpoint['optimizer'][name] = optim.state_dict() if filename.startswith('pavi://'): if file_client_args is not None: raise ValueError( 'file_client_args should be "None" if filename starts with' f'"pavi://", but got {file_client_args}') try: from pavi import modelcloud from pavi import exception except ImportError: raise ImportError( 'Please install pavi to load checkpoint from modelcloud.') model_path = filename[7:] root = modelcloud.Folder() model_dir, model_name = osp.split(model_path) try: model = modelcloud.get(model_dir) except exception.NodeNotFoundError: model = root.create_training_model(model_dir) with TemporaryDirectory() as tmp_dir: checkpoint_file = osp.join(tmp_dir, model_name) with open(checkpoint_file, 'wb') as f: torch.save(checkpoint, f) f.flush() model.create_file(checkpoint_file, name=model_name) else: file_client = FileClient.infer_client(file_client_args, filename) with io.BytesIO() as f: torch.save(checkpoint, f) file_client.put(f.getvalue(), filename) ================================================ FILE: annotator/mmpkg/mmcv/runner/default_constructor.py ================================================ from .builder import RUNNER_BUILDERS, RUNNERS @RUNNER_BUILDERS.register_module() class DefaultRunnerConstructor: """Default constructor for runners. Custom existing `Runner` like `EpocBasedRunner` though `RunnerConstructor`. For example, We can inject some new properties and functions for `Runner`. Example: >>> from annotator.mmpkg.mmcv.runner import RUNNER_BUILDERS, build_runner >>> # Define a new RunnerReconstructor >>> @RUNNER_BUILDERS.register_module() >>> class MyRunnerConstructor: ... def __init__(self, runner_cfg, default_args=None): ... if not isinstance(runner_cfg, dict): ... raise TypeError('runner_cfg should be a dict', ... f'but got {type(runner_cfg)}') ... self.runner_cfg = runner_cfg ... self.default_args = default_args ... ... def __call__(self): ... runner = RUNNERS.build(self.runner_cfg, ... default_args=self.default_args) ... # Add new properties for existing runner ... runner.my_name = 'my_runner' ... runner.my_function = lambda self: print(self.my_name) ... ... >>> # build your runner >>> runner_cfg = dict(type='EpochBasedRunner', max_epochs=40, ... constructor='MyRunnerConstructor') >>> runner = build_runner(runner_cfg) """ def __init__(self, runner_cfg, default_args=None): if not isinstance(runner_cfg, dict): raise TypeError('runner_cfg should be a dict', f'but got {type(runner_cfg)}') self.runner_cfg = runner_cfg self.default_args = default_args def __call__(self): return RUNNERS.build(self.runner_cfg, default_args=self.default_args) ================================================ FILE: annotator/mmpkg/mmcv/runner/dist_utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import functools import os import subprocess from collections import OrderedDict import torch import torch.multiprocessing as mp from torch import distributed as dist from torch._utils import (_flatten_dense_tensors, _take_tensors, _unflatten_dense_tensors) def init_dist(launcher, backend='nccl', **kwargs): if mp.get_start_method(allow_none=True) is None: mp.set_start_method('spawn') if launcher == 'pytorch': _init_dist_pytorch(backend, **kwargs) elif launcher == 'mpi': _init_dist_mpi(backend, **kwargs) elif launcher == 'slurm': _init_dist_slurm(backend, **kwargs) else: raise ValueError(f'Invalid launcher type: {launcher}') def _init_dist_pytorch(backend, **kwargs): # TODO: use local_rank instead of rank % num_gpus rank = int(os.environ['RANK']) num_gpus = torch.cuda.device_count() torch.cuda.set_device(rank % num_gpus) dist.init_process_group(backend=backend, **kwargs) def _init_dist_mpi(backend, **kwargs): # TODO: use local_rank instead of rank % num_gpus rank = int(os.environ['OMPI_COMM_WORLD_RANK']) num_gpus = torch.cuda.device_count() torch.cuda.set_device(rank % num_gpus) dist.init_process_group(backend=backend, **kwargs) def _init_dist_slurm(backend, port=None): """Initialize slurm distributed training environment. If argument ``port`` is not specified, then the master port will be system environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system environment variable, then a default port ``29500`` will be used. Args: backend (str): Backend of torch.distributed. port (int, optional): Master port. Defaults to None. """ proc_id = int(os.environ['SLURM_PROCID']) ntasks = int(os.environ['SLURM_NTASKS']) node_list = os.environ['SLURM_NODELIST'] num_gpus = torch.cuda.device_count() torch.cuda.set_device(proc_id % num_gpus) addr = subprocess.getoutput( f'scontrol show hostname {node_list} | head -n1') # specify master port if port is not None: os.environ['MASTER_PORT'] = str(port) elif 'MASTER_PORT' in os.environ: pass # use MASTER_PORT in the environment variable else: # 29500 is torch.distributed default port os.environ['MASTER_PORT'] = '29500' # use MASTER_ADDR in the environment variable if it already exists if 'MASTER_ADDR' not in os.environ: os.environ['MASTER_ADDR'] = addr os.environ['WORLD_SIZE'] = str(ntasks) os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) os.environ['RANK'] = str(proc_id) dist.init_process_group(backend=backend) def get_dist_info(): if dist.is_available() and dist.is_initialized(): rank = dist.get_rank() world_size = dist.get_world_size() else: rank = 0 world_size = 1 return rank, world_size def master_only(func): @functools.wraps(func) def wrapper(*args, **kwargs): rank, _ = get_dist_info() if rank == 0: return func(*args, **kwargs) return wrapper def allreduce_params(params, coalesce=True, bucket_size_mb=-1): """Allreduce parameters. Args: params (list[torch.Parameters]): List of parameters or buffers of a model. coalesce (bool, optional): Whether allreduce parameters as a whole. Defaults to True. bucket_size_mb (int, optional): Size of bucket, the unit is MB. Defaults to -1. """ _, world_size = get_dist_info() if world_size == 1: return params = [param.data for param in params] if coalesce: _allreduce_coalesced(params, world_size, bucket_size_mb) else: for tensor in params: dist.all_reduce(tensor.div_(world_size)) def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): """Allreduce gradients. Args: params (list[torch.Parameters]): List of parameters of a model coalesce (bool, optional): Whether allreduce parameters as a whole. Defaults to True. bucket_size_mb (int, optional): Size of bucket, the unit is MB. Defaults to -1. """ grads = [ param.grad.data for param in params if param.requires_grad and param.grad is not None ] _, world_size = get_dist_info() if world_size == 1: return if coalesce: _allreduce_coalesced(grads, world_size, bucket_size_mb) else: for tensor in grads: dist.all_reduce(tensor.div_(world_size)) def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): if bucket_size_mb > 0: bucket_size_bytes = bucket_size_mb * 1024 * 1024 buckets = _take_tensors(tensors, bucket_size_bytes) else: buckets = OrderedDict() for tensor in tensors: tp = tensor.type() if tp not in buckets: buckets[tp] = [] buckets[tp].append(tensor) buckets = buckets.values() for bucket in buckets: flat_tensors = _flatten_dense_tensors(bucket) dist.all_reduce(flat_tensors) flat_tensors.div_(world_size) for tensor, synced in zip( bucket, _unflatten_dense_tensors(flat_tensors, bucket)): tensor.copy_(synced) ================================================ FILE: annotator/mmpkg/mmcv/runner/epoch_based_runner.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp import platform import shutil import time import warnings import torch import annotator.mmpkg.mmcv as mmcv from .base_runner import BaseRunner from .builder import RUNNERS from .checkpoint import save_checkpoint from .utils import get_host_info @RUNNERS.register_module() class EpochBasedRunner(BaseRunner): """Epoch-based Runner. This runner train models epoch by epoch. """ def run_iter(self, data_batch, train_mode, **kwargs): if self.batch_processor is not None: outputs = self.batch_processor( self.model, data_batch, train_mode=train_mode, **kwargs) elif train_mode: outputs = self.model.train_step(data_batch, self.optimizer, **kwargs) else: outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) if not isinstance(outputs, dict): raise TypeError('"batch_processor()" or "model.train_step()"' 'and "model.val_step()" must return a dict') if 'log_vars' in outputs: self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) self.outputs = outputs def train(self, data_loader, **kwargs): self.model.train() self.mode = 'train' self.data_loader = data_loader self._max_iters = self._max_epochs * len(self.data_loader) self.call_hook('before_train_epoch') time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(self.data_loader): self._inner_iter = i self.call_hook('before_train_iter') self.run_iter(data_batch, train_mode=True, **kwargs) self.call_hook('after_train_iter') self._iter += 1 self.call_hook('after_train_epoch') self._epoch += 1 @torch.no_grad() def val(self, data_loader, **kwargs): self.model.eval() self.mode = 'val' self.data_loader = data_loader self.call_hook('before_val_epoch') time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(self.data_loader): self._inner_iter = i self.call_hook('before_val_iter') self.run_iter(data_batch, train_mode=False) self.call_hook('after_val_iter') self.call_hook('after_val_epoch') def run(self, data_loaders, workflow, max_epochs=None, **kwargs): """Start running. Args: data_loaders (list[:obj:`DataLoader`]): Dataloaders for training and validation. workflow (list[tuple]): A list of (phase, epochs) to specify the running order and epochs. E.g, [('train', 2), ('val', 1)] means running 2 epochs for training and 1 epoch for validation, iteratively. """ assert isinstance(data_loaders, list) assert mmcv.is_list_of(workflow, tuple) assert len(data_loaders) == len(workflow) if max_epochs is not None: warnings.warn( 'setting max_epochs in run is deprecated, ' 'please set max_epochs in runner_config', DeprecationWarning) self._max_epochs = max_epochs assert self._max_epochs is not None, ( 'max_epochs must be specified during instantiation') for i, flow in enumerate(workflow): mode, epochs = flow if mode == 'train': self._max_iters = self._max_epochs * len(data_loaders[i]) break work_dir = self.work_dir if self.work_dir is not None else 'NONE' self.logger.info('Start running, host: %s, work_dir: %s', get_host_info(), work_dir) self.logger.info('Hooks will be executed in the following order:\n%s', self.get_hook_info()) self.logger.info('workflow: %s, max: %d epochs', workflow, self._max_epochs) self.call_hook('before_run') while self.epoch < self._max_epochs: for i, flow in enumerate(workflow): mode, epochs = flow if isinstance(mode, str): # self.train() if not hasattr(self, mode): raise ValueError( f'runner has no method named "{mode}" to run an ' 'epoch') epoch_runner = getattr(self, mode) else: raise TypeError( 'mode in workflow must be a str, but got {}'.format( type(mode))) for _ in range(epochs): if mode == 'train' and self.epoch >= self._max_epochs: break epoch_runner(data_loaders[i], **kwargs) time.sleep(1) # wait for some hooks like loggers to finish self.call_hook('after_run') def save_checkpoint(self, out_dir, filename_tmpl='epoch_{}.pth', save_optimizer=True, meta=None, create_symlink=True): """Save the checkpoint. Args: out_dir (str): The directory that checkpoints are saved. filename_tmpl (str, optional): The checkpoint filename template, which contains a placeholder for the epoch number. Defaults to 'epoch_{}.pth'. save_optimizer (bool, optional): Whether to save the optimizer to the checkpoint. Defaults to True. meta (dict, optional): The meta information to be saved in the checkpoint. Defaults to None. create_symlink (bool, optional): Whether to create a symlink "latest.pth" to point to the latest checkpoint. Defaults to True. """ if meta is None: meta = {} elif not isinstance(meta, dict): raise TypeError( f'meta should be a dict or None, but got {type(meta)}') if self.meta is not None: meta.update(self.meta) # Note: meta.update(self.meta) should be done before # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise # there will be problems with resumed checkpoints. # More details in https://github.com/open-mmlab/mmcv/pull/1108 meta.update(epoch=self.epoch + 1, iter=self.iter) filename = filename_tmpl.format(self.epoch + 1) filepath = osp.join(out_dir, filename) optimizer = self.optimizer if save_optimizer else None save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) # in some environments, `os.symlink` is not supported, you may need to # set `create_symlink` to False if create_symlink: dst_file = osp.join(out_dir, 'latest.pth') if platform.system() != 'Windows': mmcv.symlink(filename, dst_file) else: shutil.copy(filepath, dst_file) @RUNNERS.register_module() class Runner(EpochBasedRunner): """Deprecated name of EpochBasedRunner.""" def __init__(self, *args, **kwargs): warnings.warn( 'Runner was deprecated, please use EpochBasedRunner instead') super().__init__(*args, **kwargs) ================================================ FILE: annotator/mmpkg/mmcv/runner/fp16_utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import functools import warnings from collections import abc from inspect import getfullargspec import numpy as np import torch import torch.nn as nn from annotator.mmpkg.mmcv.utils import TORCH_VERSION, digit_version from .dist_utils import allreduce_grads as _allreduce_grads try: # If PyTorch version >= 1.6.0, torch.cuda.amp.autocast would be imported # and used; otherwise, auto fp16 will adopt mmcv's implementation. # Note that when PyTorch >= 1.6.0, we still cast tensor types to fp16 # manually, so the behavior may not be consistent with real amp. from torch.cuda.amp import autocast except ImportError: pass def cast_tensor_type(inputs, src_type, dst_type): """Recursively convert Tensor in inputs from src_type to dst_type. Args: inputs: Inputs that to be casted. src_type (torch.dtype): Source type.. dst_type (torch.dtype): Destination type. Returns: The same type with inputs, but all contained Tensors have been cast. """ if isinstance(inputs, nn.Module): return inputs elif isinstance(inputs, torch.Tensor): return inputs.to(dst_type) elif isinstance(inputs, str): return inputs elif isinstance(inputs, np.ndarray): return inputs elif isinstance(inputs, abc.Mapping): return type(inputs)({ k: cast_tensor_type(v, src_type, dst_type) for k, v in inputs.items() }) elif isinstance(inputs, abc.Iterable): return type(inputs)( cast_tensor_type(item, src_type, dst_type) for item in inputs) else: return inputs def auto_fp16(apply_to=None, out_fp32=False): """Decorator to enable fp16 training automatically. This decorator is useful when you write custom modules and want to support mixed precision training. If inputs arguments are fp32 tensors, they will be converted to fp16 automatically. Arguments other than fp32 tensors are ignored. If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, otherwise, original mmcv implementation will be adopted. Args: apply_to (Iterable, optional): The argument names to be converted. `None` indicates all arguments. out_fp32 (bool): Whether to convert the output back to fp32. Example: >>> import torch.nn as nn >>> class MyModule1(nn.Module): >>> >>> # Convert x and y to fp16 >>> @auto_fp16() >>> def forward(self, x, y): >>> pass >>> import torch.nn as nn >>> class MyModule2(nn.Module): >>> >>> # convert pred to fp16 >>> @auto_fp16(apply_to=('pred', )) >>> def do_something(self, pred, others): >>> pass """ def auto_fp16_wrapper(old_func): @functools.wraps(old_func) def new_func(*args, **kwargs): # check if the module has set the attribute `fp16_enabled`, if not, # just fallback to the original method. if not isinstance(args[0], torch.nn.Module): raise TypeError('@auto_fp16 can only be used to decorate the ' 'method of nn.Module') if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): return old_func(*args, **kwargs) # get the arg spec of the decorated method args_info = getfullargspec(old_func) # get the argument names to be casted args_to_cast = args_info.args if apply_to is None else apply_to # convert the args that need to be processed new_args = [] # NOTE: default args are not taken into consideration if args: arg_names = args_info.args[:len(args)] for i, arg_name in enumerate(arg_names): if arg_name in args_to_cast: new_args.append( cast_tensor_type(args[i], torch.float, torch.half)) else: new_args.append(args[i]) # convert the kwargs that need to be processed new_kwargs = {} if kwargs: for arg_name, arg_value in kwargs.items(): if arg_name in args_to_cast: new_kwargs[arg_name] = cast_tensor_type( arg_value, torch.float, torch.half) else: new_kwargs[arg_name] = arg_value # apply converted arguments to the decorated method if (TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): with autocast(enabled=True): output = old_func(*new_args, **new_kwargs) else: output = old_func(*new_args, **new_kwargs) # cast the results back to fp32 if necessary if out_fp32: output = cast_tensor_type(output, torch.half, torch.float) return output return new_func return auto_fp16_wrapper def force_fp32(apply_to=None, out_fp16=False): """Decorator to convert input arguments to fp32 in force. This decorator is useful when you write custom modules and want to support mixed precision training. If there are some inputs that must be processed in fp32 mode, then this decorator can handle it. If inputs arguments are fp16 tensors, they will be converted to fp32 automatically. Arguments other than fp16 tensors are ignored. If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, otherwise, original mmcv implementation will be adopted. Args: apply_to (Iterable, optional): The argument names to be converted. `None` indicates all arguments. out_fp16 (bool): Whether to convert the output back to fp16. Example: >>> import torch.nn as nn >>> class MyModule1(nn.Module): >>> >>> # Convert x and y to fp32 >>> @force_fp32() >>> def loss(self, x, y): >>> pass >>> import torch.nn as nn >>> class MyModule2(nn.Module): >>> >>> # convert pred to fp32 >>> @force_fp32(apply_to=('pred', )) >>> def post_process(self, pred, others): >>> pass """ def force_fp32_wrapper(old_func): @functools.wraps(old_func) def new_func(*args, **kwargs): # check if the module has set the attribute `fp16_enabled`, if not, # just fallback to the original method. if not isinstance(args[0], torch.nn.Module): raise TypeError('@force_fp32 can only be used to decorate the ' 'method of nn.Module') if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): return old_func(*args, **kwargs) # get the arg spec of the decorated method args_info = getfullargspec(old_func) # get the argument names to be casted args_to_cast = args_info.args if apply_to is None else apply_to # convert the args that need to be processed new_args = [] if args: arg_names = args_info.args[:len(args)] for i, arg_name in enumerate(arg_names): if arg_name in args_to_cast: new_args.append( cast_tensor_type(args[i], torch.half, torch.float)) else: new_args.append(args[i]) # convert the kwargs that need to be processed new_kwargs = dict() if kwargs: for arg_name, arg_value in kwargs.items(): if arg_name in args_to_cast: new_kwargs[arg_name] = cast_tensor_type( arg_value, torch.half, torch.float) else: new_kwargs[arg_name] = arg_value # apply converted arguments to the decorated method if (TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): with autocast(enabled=False): output = old_func(*new_args, **new_kwargs) else: output = old_func(*new_args, **new_kwargs) # cast the results back to fp32 if necessary if out_fp16: output = cast_tensor_type(output, torch.float, torch.half) return output return new_func return force_fp32_wrapper def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): warnings.warning( '"mmcv.runner.fp16_utils.allreduce_grads" is deprecated, and will be ' 'removed in v2.8. Please switch to "mmcv.runner.allreduce_grads') _allreduce_grads(params, coalesce=coalesce, bucket_size_mb=bucket_size_mb) def wrap_fp16_model(model): """Wrap the FP32 model to FP16. If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, otherwise, original mmcv implementation will be adopted. For PyTorch >= 1.6, this function will 1. Set fp16 flag inside the model to True. Otherwise: 1. Convert FP32 model to FP16. 2. Remain some necessary layers to be FP32, e.g., normalization layers. 3. Set `fp16_enabled` flag inside the model to True. Args: model (nn.Module): Model in FP32. """ if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.6.0')): # convert model to fp16 model.half() # patch the normalization layers to make it work in fp32 mode patch_norm_fp32(model) # set `fp16_enabled` flag for m in model.modules(): if hasattr(m, 'fp16_enabled'): m.fp16_enabled = True def patch_norm_fp32(module): """Recursively convert normalization layers from FP16 to FP32. Args: module (nn.Module): The modules to be converted in FP16. Returns: nn.Module: The converted module, the normalization layers have been converted to FP32. """ if isinstance(module, (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)): module.float() if isinstance(module, nn.GroupNorm) or torch.__version__ < '1.3': module.forward = patch_forward_method(module.forward, torch.half, torch.float) for child in module.children(): patch_norm_fp32(child) return module def patch_forward_method(func, src_type, dst_type, convert_output=True): """Patch the forward method of a module. Args: func (callable): The original forward method. src_type (torch.dtype): Type of input arguments to be converted from. dst_type (torch.dtype): Type of input arguments to be converted to. convert_output (bool): Whether to convert the output back to src_type. Returns: callable: The patched forward method. """ def new_forward(*args, **kwargs): output = func(*cast_tensor_type(args, src_type, dst_type), **cast_tensor_type(kwargs, src_type, dst_type)) if convert_output: output = cast_tensor_type(output, dst_type, src_type) return output return new_forward class LossScaler: """Class that manages loss scaling in mixed precision training which supports both dynamic or static mode. The implementation refers to https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py. Indirectly, by supplying ``mode='dynamic'`` for dynamic loss scaling. It's important to understand how :class:`LossScaler` operates. Loss scaling is designed to combat the problem of underflowing gradients encountered at long times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are encountered, :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, and :class:`LossScaler` adjusts the loss scale to a lower value. If a certain number of iterations occur without overflowing gradients detected,:class:`LossScaler` increases the loss scale once more. In this way :class:`LossScaler` attempts to "ride the edge" of always using the highest loss scale possible without incurring overflow. Args: init_scale (float): Initial loss scale value, default: 2**32. scale_factor (float): Factor used when adjusting the loss scale. Default: 2. mode (str): Loss scaling mode. 'dynamic' or 'static' scale_window (int): Number of consecutive iterations without an overflow to wait before increasing the loss scale. Default: 1000. """ def __init__(self, init_scale=2**32, mode='dynamic', scale_factor=2., scale_window=1000): self.cur_scale = init_scale self.cur_iter = 0 assert mode in ('dynamic', 'static'), 'mode can only be dynamic or static' self.mode = mode self.last_overflow_iter = -1 self.scale_factor = scale_factor self.scale_window = scale_window def has_overflow(self, params): """Check if params contain overflow.""" if self.mode != 'dynamic': return False for p in params: if p.grad is not None and LossScaler._has_inf_or_nan(p.grad.data): return True return False def _has_inf_or_nan(x): """Check if params contain NaN.""" try: cpu_sum = float(x.float().sum()) except RuntimeError as instance: if 'value cannot be converted' not in instance.args[0]: raise return True else: if cpu_sum == float('inf') or cpu_sum == -float('inf') \ or cpu_sum != cpu_sum: return True return False def update_scale(self, overflow): """update the current loss scale value when overflow happens.""" if self.mode != 'dynamic': return if overflow: self.cur_scale = max(self.cur_scale / self.scale_factor, 1) self.last_overflow_iter = self.cur_iter else: if (self.cur_iter - self.last_overflow_iter) % \ self.scale_window == 0: self.cur_scale *= self.scale_factor self.cur_iter += 1 def state_dict(self): """Returns the state of the scaler as a :class:`dict`.""" return dict( cur_scale=self.cur_scale, cur_iter=self.cur_iter, mode=self.mode, last_overflow_iter=self.last_overflow_iter, scale_factor=self.scale_factor, scale_window=self.scale_window) def load_state_dict(self, state_dict): """Loads the loss_scaler state dict. Args: state_dict (dict): scaler state. """ self.cur_scale = state_dict['cur_scale'] self.cur_iter = state_dict['cur_iter'] self.mode = state_dict['mode'] self.last_overflow_iter = state_dict['last_overflow_iter'] self.scale_factor = state_dict['scale_factor'] self.scale_window = state_dict['scale_window'] @property def loss_scale(self): return self.cur_scale ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .checkpoint import CheckpointHook from .closure import ClosureHook from .ema import EMAHook from .evaluation import DistEvalHook, EvalHook from .hook import HOOKS, Hook from .iter_timer import IterTimerHook from .logger import (DvcliveLoggerHook, LoggerHook, MlflowLoggerHook, NeptuneLoggerHook, PaviLoggerHook, TensorboardLoggerHook, TextLoggerHook, WandbLoggerHook) from .lr_updater import LrUpdaterHook from .memory import EmptyCacheHook from .momentum_updater import MomentumUpdaterHook from .optimizer import (Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook, GradientCumulativeOptimizerHook, OptimizerHook) from .profiler import ProfilerHook from .sampler_seed import DistSamplerSeedHook from .sync_buffer import SyncBuffersHook __all__ = [ 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook', 'OptimizerHook', 'Fp16OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', 'EmptyCacheHook', 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook', 'NeptuneLoggerHook', 'WandbLoggerHook', 'DvcliveLoggerHook', 'MomentumUpdaterHook', 'SyncBuffersHook', 'EMAHook', 'EvalHook', 'DistEvalHook', 'ProfilerHook', 'GradientCumulativeOptimizerHook', 'GradientCumulativeFp16OptimizerHook' ] ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/checkpoint.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp import warnings from annotator.mmpkg.mmcv.fileio import FileClient from ..dist_utils import allreduce_params, master_only from .hook import HOOKS, Hook @HOOKS.register_module() class CheckpointHook(Hook): """Save checkpoints periodically. Args: interval (int): The saving period. If ``by_epoch=True``, interval indicates epochs, otherwise it indicates iterations. Default: -1, which means "never". by_epoch (bool): Saving checkpoints by epoch or by iteration. Default: True. save_optimizer (bool): Whether to save optimizer state_dict in the checkpoint. It is usually used for resuming experiments. Default: True. out_dir (str, optional): The root directory to save checkpoints. If not specified, ``runner.work_dir`` will be used by default. If specified, the ``out_dir`` will be the concatenation of ``out_dir`` and the last level directory of ``runner.work_dir``. `Changed in version 1.3.16.` max_keep_ckpts (int, optional): The maximum checkpoints to keep. In some cases we want only the latest few checkpoints and would like to delete old ones to save the disk space. Default: -1, which means unlimited. save_last (bool, optional): Whether to force the last checkpoint to be saved regardless of interval. Default: True. sync_buffer (bool, optional): Whether to synchronize buffers in different gpus. Default: False. file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. `New in version 1.3.16.` .. warning:: Before v1.3.16, the ``out_dir`` argument indicates the path where the checkpoint is stored. However, since v1.3.16, ``out_dir`` indicates the root directory and the final path to save checkpoint is the concatenation of ``out_dir`` and the last level directory of ``runner.work_dir``. Suppose the value of ``out_dir`` is "/path/of/A" and the value of ``runner.work_dir`` is "/path/of/B", then the final path will be "/path/of/A/B". """ def __init__(self, interval=-1, by_epoch=True, save_optimizer=True, out_dir=None, max_keep_ckpts=-1, save_last=True, sync_buffer=False, file_client_args=None, **kwargs): self.interval = interval self.by_epoch = by_epoch self.save_optimizer = save_optimizer self.out_dir = out_dir self.max_keep_ckpts = max_keep_ckpts self.save_last = save_last self.args = kwargs self.sync_buffer = sync_buffer self.file_client_args = file_client_args def before_run(self, runner): if not self.out_dir: self.out_dir = runner.work_dir self.file_client = FileClient.infer_client(self.file_client_args, self.out_dir) # if `self.out_dir` is not equal to `runner.work_dir`, it means that # `self.out_dir` is set so the final `self.out_dir` is the # concatenation of `self.out_dir` and the last level directory of # `runner.work_dir` if self.out_dir != runner.work_dir: basename = osp.basename(runner.work_dir.rstrip(osp.sep)) self.out_dir = self.file_client.join_path(self.out_dir, basename) runner.logger.info((f'Checkpoints will be saved to {self.out_dir} by ' f'{self.file_client.name}.')) # disable the create_symlink option because some file backends do not # allow to create a symlink if 'create_symlink' in self.args: if self.args[ 'create_symlink'] and not self.file_client.allow_symlink: self.args['create_symlink'] = False warnings.warn( ('create_symlink is set as True by the user but is changed' 'to be False because creating symbolic link is not ' f'allowed in {self.file_client.name}')) else: self.args['create_symlink'] = self.file_client.allow_symlink def after_train_epoch(self, runner): if not self.by_epoch: return # save checkpoint for following cases: # 1. every ``self.interval`` epochs # 2. reach the last epoch of training if self.every_n_epochs( runner, self.interval) or (self.save_last and self.is_last_epoch(runner)): runner.logger.info( f'Saving checkpoint at {runner.epoch + 1} epochs') if self.sync_buffer: allreduce_params(runner.model.buffers()) self._save_checkpoint(runner) @master_only def _save_checkpoint(self, runner): """Save the current checkpoint and delete unwanted checkpoint.""" runner.save_checkpoint( self.out_dir, save_optimizer=self.save_optimizer, **self.args) if runner.meta is not None: if self.by_epoch: cur_ckpt_filename = self.args.get( 'filename_tmpl', 'epoch_{}.pth').format(runner.epoch + 1) else: cur_ckpt_filename = self.args.get( 'filename_tmpl', 'iter_{}.pth').format(runner.iter + 1) runner.meta.setdefault('hook_msgs', dict()) runner.meta['hook_msgs']['last_ckpt'] = self.file_client.join_path( self.out_dir, cur_ckpt_filename) # remove other checkpoints if self.max_keep_ckpts > 0: if self.by_epoch: name = 'epoch_{}.pth' current_ckpt = runner.epoch + 1 else: name = 'iter_{}.pth' current_ckpt = runner.iter + 1 redundant_ckpts = range( current_ckpt - self.max_keep_ckpts * self.interval, 0, -self.interval) filename_tmpl = self.args.get('filename_tmpl', name) for _step in redundant_ckpts: ckpt_path = self.file_client.join_path( self.out_dir, filename_tmpl.format(_step)) if self.file_client.isfile(ckpt_path): self.file_client.remove(ckpt_path) else: break def after_train_iter(self, runner): if self.by_epoch: return # save checkpoint for following cases: # 1. every ``self.interval`` iterations # 2. reach the last iteration of training if self.every_n_iters( runner, self.interval) or (self.save_last and self.is_last_iter(runner)): runner.logger.info( f'Saving checkpoint at {runner.iter + 1} iterations') if self.sync_buffer: allreduce_params(runner.model.buffers()) self._save_checkpoint(runner) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/closure.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .hook import HOOKS, Hook @HOOKS.register_module() class ClosureHook(Hook): def __init__(self, fn_name, fn): assert hasattr(self, fn_name) assert callable(fn) setattr(self, fn_name, fn) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/ema.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ...parallel import is_module_wrapper from ..hooks.hook import HOOKS, Hook @HOOKS.register_module() class EMAHook(Hook): r"""Exponential Moving Average Hook. Use Exponential Moving Average on all parameters of model in training process. All parameters have a ema backup, which update by the formula as below. EMAHook takes priority over EvalHook and CheckpointSaverHook. .. math:: \text{Xema\_{t+1}} = (1 - \text{momentum}) \times \text{Xema\_{t}} + \text{momentum} \times X_t Args: momentum (float): The momentum used for updating ema parameter. Defaults to 0.0002. interval (int): Update ema parameter every interval iteration. Defaults to 1. warm_up (int): During first warm_up steps, we may use smaller momentum to update ema parameters more slowly. Defaults to 100. resume_from (str): The checkpoint path. Defaults to None. """ def __init__(self, momentum=0.0002, interval=1, warm_up=100, resume_from=None): assert isinstance(interval, int) and interval > 0 self.warm_up = warm_up self.interval = interval assert momentum > 0 and momentum < 1 self.momentum = momentum**interval self.checkpoint = resume_from def before_run(self, runner): """To resume model with it's ema parameters more friendly. Register ema parameter as ``named_buffer`` to model """ model = runner.model if is_module_wrapper(model): model = model.module self.param_ema_buffer = {} self.model_parameters = dict(model.named_parameters(recurse=True)) for name, value in self.model_parameters.items(): # "." is not allowed in module's buffer name buffer_name = f"ema_{name.replace('.', '_')}" self.param_ema_buffer[name] = buffer_name model.register_buffer(buffer_name, value.data.clone()) self.model_buffers = dict(model.named_buffers(recurse=True)) if self.checkpoint is not None: runner.resume(self.checkpoint) def after_train_iter(self, runner): """Update ema parameter every self.interval iterations.""" curr_step = runner.iter # We warm up the momentum considering the instability at beginning momentum = min(self.momentum, (1 + curr_step) / (self.warm_up + curr_step)) if curr_step % self.interval != 0: return for name, parameter in self.model_parameters.items(): buffer_name = self.param_ema_buffer[name] buffer_parameter = self.model_buffers[buffer_name] buffer_parameter.mul_(1 - momentum).add_(momentum, parameter.data) def after_train_epoch(self, runner): """We load parameter values from ema backup to model before the EvalHook.""" self._swap_ema_parameters() def before_train_epoch(self, runner): """We recover model's parameter from ema backup after last epoch's EvalHook.""" self._swap_ema_parameters() def _swap_ema_parameters(self): """Swap the parameter of model with parameter in ema_buffer.""" for name, value in self.model_parameters.items(): temp = value.data.clone() ema_buffer = self.model_buffers[self.param_ema_buffer[name]] value.data.copy_(ema_buffer.data) ema_buffer.data.copy_(temp) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/evaluation.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp import warnings from math import inf import torch.distributed as dist from torch.nn.modules.batchnorm import _BatchNorm from torch.utils.data import DataLoader from annotator.mmpkg.mmcv.fileio import FileClient from annotator.mmpkg.mmcv.utils import is_seq_of from .hook import Hook from .logger import LoggerHook class EvalHook(Hook): """Non-Distributed evaluation hook. This hook will regularly perform evaluation in a given interval when performing in non-distributed environment. Args: dataloader (DataLoader): A PyTorch dataloader, whose dataset has implemented ``evaluate`` function. start (int | None, optional): Evaluation starting epoch. It enables evaluation before the training starts if ``start`` <= the resuming epoch. If None, whether to evaluate is merely decided by ``interval``. Default: None. interval (int): Evaluation interval. Default: 1. by_epoch (bool): Determine perform evaluation by epoch or by iteration. If set to True, it will perform by epoch. Otherwise, by iteration. Default: True. save_best (str, optional): If a metric is specified, it would measure the best checkpoint during evaluation. The information about best checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep best score value and best checkpoint path, which will be also loaded when resume checkpoint. Options are the evaluation metrics on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox detection and instance segmentation. ``AR@100`` for proposal recall. If ``save_best`` is ``auto``, the first key of the returned ``OrderedDict`` result will be used. Default: None. rule (str | None, optional): Comparison rule for best score. If set to None, it will infer a reasonable rule. Keys such as 'acc', 'top' .etc will be inferred by 'greater' rule. Keys contain 'loss' will be inferred by 'less' rule. Options are 'greater', 'less', None. Default: None. test_fn (callable, optional): test a model with samples from a dataloader, and return the test results. If ``None``, the default test function ``mmcv.engine.single_gpu_test`` will be used. (default: ``None``) greater_keys (List[str] | None, optional): Metric keys that will be inferred by 'greater' comparison rule. If ``None``, _default_greater_keys will be used. (default: ``None``) less_keys (List[str] | None, optional): Metric keys that will be inferred by 'less' comparison rule. If ``None``, _default_less_keys will be used. (default: ``None``) out_dir (str, optional): The root directory to save checkpoints. If not specified, `runner.work_dir` will be used by default. If specified, the `out_dir` will be the concatenation of `out_dir` and the last level directory of `runner.work_dir`. `New in version 1.3.16.` file_client_args (dict): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. `New in version 1.3.16.` **eval_kwargs: Evaluation arguments fed into the evaluate function of the dataset. Notes: If new arguments are added for EvalHook, tools/test.py, tools/eval_metric.py may be affected. """ # Since the key for determine greater or less is related to the downstream # tasks, downstream repos may need to overwrite the following inner # variable accordingly. rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} init_value_map = {'greater': -inf, 'less': inf} _default_greater_keys = [ 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', 'mAcc', 'aAcc' ] _default_less_keys = ['loss'] def __init__(self, dataloader, start=None, interval=1, by_epoch=True, save_best=None, rule=None, test_fn=None, greater_keys=None, less_keys=None, out_dir=None, file_client_args=None, **eval_kwargs): if not isinstance(dataloader, DataLoader): raise TypeError(f'dataloader must be a pytorch DataLoader, ' f'but got {type(dataloader)}') if interval <= 0: raise ValueError(f'interval must be a positive number, ' f'but got {interval}') assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean' if start is not None and start < 0: raise ValueError(f'The evaluation start epoch {start} is smaller ' f'than 0') self.dataloader = dataloader self.interval = interval self.start = start self.by_epoch = by_epoch assert isinstance(save_best, str) or save_best is None, \ '""save_best"" should be a str or None ' \ f'rather than {type(save_best)}' self.save_best = save_best self.eval_kwargs = eval_kwargs self.initial_flag = True if test_fn is None: from annotator.mmpkg.mmcv.engine import single_gpu_test self.test_fn = single_gpu_test else: self.test_fn = test_fn if greater_keys is None: self.greater_keys = self._default_greater_keys else: if not isinstance(greater_keys, (list, tuple)): greater_keys = (greater_keys, ) assert is_seq_of(greater_keys, str) self.greater_keys = greater_keys if less_keys is None: self.less_keys = self._default_less_keys else: if not isinstance(less_keys, (list, tuple)): less_keys = (less_keys, ) assert is_seq_of(less_keys, str) self.less_keys = less_keys if self.save_best is not None: self.best_ckpt_path = None self._init_rule(rule, self.save_best) self.out_dir = out_dir self.file_client_args = file_client_args def _init_rule(self, rule, key_indicator): """Initialize rule, key_indicator, comparison_func, and best score. Here is the rule to determine which rule is used for key indicator when the rule is not specific (note that the key indicator matching is case-insensitive): 1. If the key indicator is in ``self.greater_keys``, the rule will be specified as 'greater'. 2. Or if the key indicator is in ``self.less_keys``, the rule will be specified as 'less'. 3. Or if the key indicator is equal to the substring in any one item in ``self.greater_keys``, the rule will be specified as 'greater'. 4. Or if the key indicator is equal to the substring in any one item in ``self.less_keys``, the rule will be specified as 'less'. Args: rule (str | None): Comparison rule for best score. key_indicator (str | None): Key indicator to determine the comparison rule. """ if rule not in self.rule_map and rule is not None: raise KeyError(f'rule must be greater, less or None, ' f'but got {rule}.') if rule is None: if key_indicator != 'auto': # `_lc` here means we use the lower case of keys for # case-insensitive matching key_indicator_lc = key_indicator.lower() greater_keys = [key.lower() for key in self.greater_keys] less_keys = [key.lower() for key in self.less_keys] if key_indicator_lc in greater_keys: rule = 'greater' elif key_indicator_lc in less_keys: rule = 'less' elif any(key in key_indicator_lc for key in greater_keys): rule = 'greater' elif any(key in key_indicator_lc for key in less_keys): rule = 'less' else: raise ValueError(f'Cannot infer the rule for key ' f'{key_indicator}, thus a specific rule ' f'must be specified.') self.rule = rule self.key_indicator = key_indicator if self.rule is not None: self.compare_func = self.rule_map[self.rule] def before_run(self, runner): if not self.out_dir: self.out_dir = runner.work_dir self.file_client = FileClient.infer_client(self.file_client_args, self.out_dir) # if `self.out_dir` is not equal to `runner.work_dir`, it means that # `self.out_dir` is set so the final `self.out_dir` is the # concatenation of `self.out_dir` and the last level directory of # `runner.work_dir` if self.out_dir != runner.work_dir: basename = osp.basename(runner.work_dir.rstrip(osp.sep)) self.out_dir = self.file_client.join_path(self.out_dir, basename) runner.logger.info( (f'The best checkpoint will be saved to {self.out_dir} by ' f'{self.file_client.name}')) if self.save_best is not None: if runner.meta is None: warnings.warn('runner.meta is None. Creating an empty one.') runner.meta = dict() runner.meta.setdefault('hook_msgs', dict()) self.best_ckpt_path = runner.meta['hook_msgs'].get( 'best_ckpt', None) def before_train_iter(self, runner): """Evaluate the model only at the start of training by iteration.""" if self.by_epoch or not self.initial_flag: return if self.start is not None and runner.iter >= self.start: self.after_train_iter(runner) self.initial_flag = False def before_train_epoch(self, runner): """Evaluate the model only at the start of training by epoch.""" if not (self.by_epoch and self.initial_flag): return if self.start is not None and runner.epoch >= self.start: self.after_train_epoch(runner) self.initial_flag = False def after_train_iter(self, runner): """Called after every training iter to evaluate the results.""" if not self.by_epoch and self._should_evaluate(runner): # Because the priority of EvalHook is higher than LoggerHook, the # training log and the evaluating log are mixed. Therefore, # we need to dump the training log and clear it before evaluating # log is generated. In addition, this problem will only appear in # `IterBasedRunner` whose `self.by_epoch` is False, because # `EpochBasedRunner` whose `self.by_epoch` is True calls # `_do_evaluate` in `after_train_epoch` stage, and at this stage # the training log has been printed, so it will not cause any # problem. more details at # https://github.com/open-mmlab/mmsegmentation/issues/694 for hook in runner._hooks: if isinstance(hook, LoggerHook): hook.after_train_iter(runner) runner.log_buffer.clear() self._do_evaluate(runner) def after_train_epoch(self, runner): """Called after every training epoch to evaluate the results.""" if self.by_epoch and self._should_evaluate(runner): self._do_evaluate(runner) def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" results = self.test_fn(runner.model, self.dataloader) runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) # the key_score may be `None` so it needs to skip the action to save # the best checkpoint if self.save_best and key_score: self._save_ckpt(runner, key_score) def _should_evaluate(self, runner): """Judge whether to perform evaluation. Here is the rule to judge whether to perform evaluation: 1. It will not perform evaluation during the epoch/iteration interval, which is determined by ``self.interval``. 2. It will not perform evaluation if the start time is larger than current time. 3. It will not perform evaluation when current time is larger than the start time but during epoch/iteration interval. Returns: bool: The flag indicating whether to perform evaluation. """ if self.by_epoch: current = runner.epoch check_time = self.every_n_epochs else: current = runner.iter check_time = self.every_n_iters if self.start is None: if not check_time(runner, self.interval): # No evaluation during the interval. return False elif (current + 1) < self.start: # No evaluation if start is larger than the current time. return False else: # Evaluation only at epochs/iters 3, 5, 7... # if start==3 and interval==2 if (current + 1 - self.start) % self.interval: return False return True def _save_ckpt(self, runner, key_score): """Save the best checkpoint. It will compare the score according to the compare function, write related information (best score, best checkpoint path) and save the best checkpoint into ``work_dir``. """ if self.by_epoch: current = f'epoch_{runner.epoch + 1}' cur_type, cur_time = 'epoch', runner.epoch + 1 else: current = f'iter_{runner.iter + 1}' cur_type, cur_time = 'iter', runner.iter + 1 best_score = runner.meta['hook_msgs'].get( 'best_score', self.init_value_map[self.rule]) if self.compare_func(key_score, best_score): best_score = key_score runner.meta['hook_msgs']['best_score'] = best_score if self.best_ckpt_path and self.file_client.isfile( self.best_ckpt_path): self.file_client.remove(self.best_ckpt_path) runner.logger.info( (f'The previous best checkpoint {self.best_ckpt_path} was ' 'removed')) best_ckpt_name = f'best_{self.key_indicator}_{current}.pth' self.best_ckpt_path = self.file_client.join_path( self.out_dir, best_ckpt_name) runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path runner.save_checkpoint( self.out_dir, best_ckpt_name, create_symlink=False) runner.logger.info( f'Now best checkpoint is saved as {best_ckpt_name}.') runner.logger.info( f'Best {self.key_indicator} is {best_score:0.4f} ' f'at {cur_time} {cur_type}.') def evaluate(self, runner, results): """Evaluate the results. Args: runner (:obj:`mmcv.Runner`): The underlined training runner. results (list): Output results. """ eval_res = self.dataloader.dataset.evaluate( results, logger=runner.logger, **self.eval_kwargs) for name, val in eval_res.items(): runner.log_buffer.output[name] = val runner.log_buffer.ready = True if self.save_best is not None: # If the performance of model is pool, the `eval_res` may be an # empty dict and it will raise exception when `self.save_best` is # not None. More details at # https://github.com/open-mmlab/mmdetection/issues/6265. if not eval_res: warnings.warn( 'Since `eval_res` is an empty dict, the behavior to save ' 'the best checkpoint will be skipped in this evaluation.') return None if self.key_indicator == 'auto': # infer from eval_results self._init_rule(self.rule, list(eval_res.keys())[0]) return eval_res[self.key_indicator] return None class DistEvalHook(EvalHook): """Distributed evaluation hook. This hook will regularly perform evaluation in a given interval when performing in distributed environment. Args: dataloader (DataLoader): A PyTorch dataloader, whose dataset has implemented ``evaluate`` function. start (int | None, optional): Evaluation starting epoch. It enables evaluation before the training starts if ``start`` <= the resuming epoch. If None, whether to evaluate is merely decided by ``interval``. Default: None. interval (int): Evaluation interval. Default: 1. by_epoch (bool): Determine perform evaluation by epoch or by iteration. If set to True, it will perform by epoch. Otherwise, by iteration. default: True. save_best (str, optional): If a metric is specified, it would measure the best checkpoint during evaluation. The information about best checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep best score value and best checkpoint path, which will be also loaded when resume checkpoint. Options are the evaluation metrics on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox detection and instance segmentation. ``AR@100`` for proposal recall. If ``save_best`` is ``auto``, the first key of the returned ``OrderedDict`` result will be used. Default: None. rule (str | None, optional): Comparison rule for best score. If set to None, it will infer a reasonable rule. Keys such as 'acc', 'top' .etc will be inferred by 'greater' rule. Keys contain 'loss' will be inferred by 'less' rule. Options are 'greater', 'less', None. Default: None. test_fn (callable, optional): test a model with samples from a dataloader in a multi-gpu manner, and return the test results. If ``None``, the default test function ``mmcv.engine.multi_gpu_test`` will be used. (default: ``None``) tmpdir (str | None): Temporary directory to save the results of all processes. Default: None. gpu_collect (bool): Whether to use gpu or cpu to collect results. Default: False. broadcast_bn_buffer (bool): Whether to broadcast the buffer(running_mean and running_var) of rank 0 to other rank before evaluation. Default: True. out_dir (str, optional): The root directory to save checkpoints. If not specified, `runner.work_dir` will be used by default. If specified, the `out_dir` will be the concatenation of `out_dir` and the last level directory of `runner.work_dir`. file_client_args (dict): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. **eval_kwargs: Evaluation arguments fed into the evaluate function of the dataset. """ def __init__(self, dataloader, start=None, interval=1, by_epoch=True, save_best=None, rule=None, test_fn=None, greater_keys=None, less_keys=None, broadcast_bn_buffer=True, tmpdir=None, gpu_collect=False, out_dir=None, file_client_args=None, **eval_kwargs): if test_fn is None: from annotator.mmpkg.mmcv.engine import multi_gpu_test test_fn = multi_gpu_test super().__init__( dataloader, start=start, interval=interval, by_epoch=by_epoch, save_best=save_best, rule=rule, test_fn=test_fn, greater_keys=greater_keys, less_keys=less_keys, out_dir=out_dir, file_client_args=file_client_args, **eval_kwargs) self.broadcast_bn_buffer = broadcast_bn_buffer self.tmpdir = tmpdir self.gpu_collect = gpu_collect def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" # Synchronization of BatchNorm's buffer (running_mean # and running_var) is not supported in the DDP of pytorch, # which may cause the inconsistent performance of models in # different ranks, so we broadcast BatchNorm's buffers # of rank 0 to other ranks to avoid this. if self.broadcast_bn_buffer: model = runner.model for name, module in model.named_modules(): if isinstance(module, _BatchNorm) and module.track_running_stats: dist.broadcast(module.running_var, 0) dist.broadcast(module.running_mean, 0) tmpdir = self.tmpdir if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') results = self.test_fn( runner.model, self.dataloader, tmpdir=tmpdir, gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) # the key_score may be `None` so it needs to skip the action to # save the best checkpoint if self.save_best and key_score: self._save_ckpt(runner, key_score) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/hook.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from annotator.mmpkg.mmcv.utils import Registry, is_method_overridden HOOKS = Registry('hook') class Hook: stages = ('before_run', 'before_train_epoch', 'before_train_iter', 'after_train_iter', 'after_train_epoch', 'before_val_epoch', 'before_val_iter', 'after_val_iter', 'after_val_epoch', 'after_run') def before_run(self, runner): pass def after_run(self, runner): pass def before_epoch(self, runner): pass def after_epoch(self, runner): pass def before_iter(self, runner): pass def after_iter(self, runner): pass def before_train_epoch(self, runner): self.before_epoch(runner) def before_val_epoch(self, runner): self.before_epoch(runner) def after_train_epoch(self, runner): self.after_epoch(runner) def after_val_epoch(self, runner): self.after_epoch(runner) def before_train_iter(self, runner): self.before_iter(runner) def before_val_iter(self, runner): self.before_iter(runner) def after_train_iter(self, runner): self.after_iter(runner) def after_val_iter(self, runner): self.after_iter(runner) def every_n_epochs(self, runner, n): return (runner.epoch + 1) % n == 0 if n > 0 else False def every_n_inner_iters(self, runner, n): return (runner.inner_iter + 1) % n == 0 if n > 0 else False def every_n_iters(self, runner, n): return (runner.iter + 1) % n == 0 if n > 0 else False def end_of_epoch(self, runner): return runner.inner_iter + 1 == len(runner.data_loader) def is_last_epoch(self, runner): return runner.epoch + 1 == runner._max_epochs def is_last_iter(self, runner): return runner.iter + 1 == runner._max_iters def get_triggered_stages(self): trigger_stages = set() for stage in Hook.stages: if is_method_overridden(stage, Hook, self): trigger_stages.add(stage) # some methods will be triggered in multi stages # use this dict to map method to stages. method_stages_map = { 'before_epoch': ['before_train_epoch', 'before_val_epoch'], 'after_epoch': ['after_train_epoch', 'after_val_epoch'], 'before_iter': ['before_train_iter', 'before_val_iter'], 'after_iter': ['after_train_iter', 'after_val_iter'], } for method, map_stages in method_stages_map.items(): if is_method_overridden(method, Hook, self): trigger_stages.update(map_stages) return [stage for stage in Hook.stages if stage in trigger_stages] ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/iter_timer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import time from .hook import HOOKS, Hook @HOOKS.register_module() class IterTimerHook(Hook): def before_epoch(self, runner): self.t = time.time() def before_iter(self, runner): runner.log_buffer.update({'data_time': time.time() - self.t}) def after_iter(self, runner): runner.log_buffer.update({'time': time.time() - self.t}) self.t = time.time() ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base import LoggerHook from .dvclive import DvcliveLoggerHook from .mlflow import MlflowLoggerHook from .neptune import NeptuneLoggerHook from .pavi import PaviLoggerHook from .tensorboard import TensorboardLoggerHook from .text import TextLoggerHook from .wandb import WandbLoggerHook __all__ = [ 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook', 'NeptuneLoggerHook', 'DvcliveLoggerHook' ] ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/base.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numbers from abc import ABCMeta, abstractmethod import numpy as np import torch from ..hook import Hook class LoggerHook(Hook): """Base class for logger hooks. Args: interval (int): Logging interval (every k iterations). ignore_last (bool): Ignore the log of last iterations in each epoch if less than `interval`. reset_flag (bool): Whether to clear the output buffer after logging. by_epoch (bool): Whether EpochBasedRunner is used. """ __metaclass__ = ABCMeta def __init__(self, interval=10, ignore_last=True, reset_flag=False, by_epoch=True): self.interval = interval self.ignore_last = ignore_last self.reset_flag = reset_flag self.by_epoch = by_epoch @abstractmethod def log(self, runner): pass @staticmethod def is_scalar(val, include_np=True, include_torch=True): """Tell the input variable is a scalar or not. Args: val: Input variable. include_np (bool): Whether include 0-d np.ndarray as a scalar. include_torch (bool): Whether include 0-d torch.Tensor as a scalar. Returns: bool: True or False. """ if isinstance(val, numbers.Number): return True elif include_np and isinstance(val, np.ndarray) and val.ndim == 0: return True elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1: return True else: return False def get_mode(self, runner): if runner.mode == 'train': if 'time' in runner.log_buffer.output: mode = 'train' else: mode = 'val' elif runner.mode == 'val': mode = 'val' else: raise ValueError(f"runner mode should be 'train' or 'val', " f'but got {runner.mode}') return mode def get_epoch(self, runner): if runner.mode == 'train': epoch = runner.epoch + 1 elif runner.mode == 'val': # normal val mode # runner.epoch += 1 has been done before val workflow epoch = runner.epoch else: raise ValueError(f"runner mode should be 'train' or 'val', " f'but got {runner.mode}') return epoch def get_iter(self, runner, inner_iter=False): """Get the current training iteration step.""" if self.by_epoch and inner_iter: current_iter = runner.inner_iter + 1 else: current_iter = runner.iter + 1 return current_iter def get_lr_tags(self, runner): tags = {} lrs = runner.current_lr() if isinstance(lrs, dict): for name, value in lrs.items(): tags[f'learning_rate/{name}'] = value[0] else: tags['learning_rate'] = lrs[0] return tags def get_momentum_tags(self, runner): tags = {} momentums = runner.current_momentum() if isinstance(momentums, dict): for name, value in momentums.items(): tags[f'momentum/{name}'] = value[0] else: tags['momentum'] = momentums[0] return tags def get_loggable_tags(self, runner, allow_scalar=True, allow_text=False, add_mode=True, tags_to_skip=('time', 'data_time')): tags = {} for var, val in runner.log_buffer.output.items(): if var in tags_to_skip: continue if self.is_scalar(val) and not allow_scalar: continue if isinstance(val, str) and not allow_text: continue if add_mode: var = f'{self.get_mode(runner)}/{var}' tags[var] = val tags.update(self.get_lr_tags(runner)) tags.update(self.get_momentum_tags(runner)) return tags def before_run(self, runner): for hook in runner.hooks[::-1]: if isinstance(hook, LoggerHook): hook.reset_flag = True break def before_epoch(self, runner): runner.log_buffer.clear() # clear logs of last epoch def after_train_iter(self, runner): if self.by_epoch and self.every_n_inner_iters(runner, self.interval): runner.log_buffer.average(self.interval) elif not self.by_epoch and self.every_n_iters(runner, self.interval): runner.log_buffer.average(self.interval) elif self.end_of_epoch(runner) and not self.ignore_last: # not precise but more stable runner.log_buffer.average(self.interval) if runner.log_buffer.ready: self.log(runner) if self.reset_flag: runner.log_buffer.clear_output() def after_train_epoch(self, runner): if runner.log_buffer.ready: self.log(runner) if self.reset_flag: runner.log_buffer.clear_output() def after_val_epoch(self, runner): runner.log_buffer.average() self.log(runner) if self.reset_flag: runner.log_buffer.clear_output() ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/dvclive.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ...dist_utils import master_only from ..hook import HOOKS from .base import LoggerHook @HOOKS.register_module() class DvcliveLoggerHook(LoggerHook): """Class to log metrics with dvclive. It requires `dvclive`_ to be installed. Args: path (str): Directory where dvclive will write TSV log files. interval (int): Logging interval (every k iterations). Default 10. ignore_last (bool): Ignore the log of last iterations in each epoch if less than `interval`. Default: True. reset_flag (bool): Whether to clear the output buffer after logging. Default: True. by_epoch (bool): Whether EpochBasedRunner is used. Default: True. .. _dvclive: https://dvc.org/doc/dvclive """ def __init__(self, path, interval=10, ignore_last=True, reset_flag=True, by_epoch=True): super(DvcliveLoggerHook, self).__init__(interval, ignore_last, reset_flag, by_epoch) self.path = path self.import_dvclive() def import_dvclive(self): try: import dvclive except ImportError: raise ImportError( 'Please run "pip install dvclive" to install dvclive') self.dvclive = dvclive @master_only def before_run(self, runner): self.dvclive.init(self.path) @master_only def log(self, runner): tags = self.get_loggable_tags(runner) if tags: for k, v in tags.items(): self.dvclive.log(k, v, step=self.get_iter(runner)) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/mlflow.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ...dist_utils import master_only from ..hook import HOOKS from .base import LoggerHook @HOOKS.register_module() class MlflowLoggerHook(LoggerHook): def __init__(self, exp_name=None, tags=None, log_model=True, interval=10, ignore_last=True, reset_flag=False, by_epoch=True): """Class to log metrics and (optionally) a trained model to MLflow. It requires `MLflow`_ to be installed. Args: exp_name (str, optional): Name of the experiment to be used. Default None. If not None, set the active experiment. If experiment does not exist, an experiment with provided name will be created. tags (dict of str: str, optional): Tags for the current run. Default None. If not None, set tags for the current run. log_model (bool, optional): Whether to log an MLflow artifact. Default True. If True, log runner.model as an MLflow artifact for the current run. interval (int): Logging interval (every k iterations). ignore_last (bool): Ignore the log of last iterations in each epoch if less than `interval`. reset_flag (bool): Whether to clear the output buffer after logging by_epoch (bool): Whether EpochBasedRunner is used. .. _MLflow: https://www.mlflow.org/docs/latest/index.html """ super(MlflowLoggerHook, self).__init__(interval, ignore_last, reset_flag, by_epoch) self.import_mlflow() self.exp_name = exp_name self.tags = tags self.log_model = log_model def import_mlflow(self): try: import mlflow import mlflow.pytorch as mlflow_pytorch except ImportError: raise ImportError( 'Please run "pip install mlflow" to install mlflow') self.mlflow = mlflow self.mlflow_pytorch = mlflow_pytorch @master_only def before_run(self, runner): super(MlflowLoggerHook, self).before_run(runner) if self.exp_name is not None: self.mlflow.set_experiment(self.exp_name) if self.tags is not None: self.mlflow.set_tags(self.tags) @master_only def log(self, runner): tags = self.get_loggable_tags(runner) if tags: self.mlflow.log_metrics(tags, step=self.get_iter(runner)) @master_only def after_run(self, runner): if self.log_model: self.mlflow_pytorch.log_model(runner.model, 'models') ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/neptune.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ...dist_utils import master_only from ..hook import HOOKS from .base import LoggerHook @HOOKS.register_module() class NeptuneLoggerHook(LoggerHook): """Class to log metrics to NeptuneAI. It requires `neptune-client` to be installed. Args: init_kwargs (dict): a dict contains the initialization keys as below: - project (str): Name of a project in a form of namespace/project_name. If None, the value of NEPTUNE_PROJECT environment variable will be taken. - api_token (str): User’s API token. If None, the value of NEPTUNE_API_TOKEN environment variable will be taken. Note: It is strongly recommended to use NEPTUNE_API_TOKEN environment variable rather than placing your API token in plain text in your source code. - name (str, optional, default is 'Untitled'): Editable name of the run. Name is displayed in the run's Details and in Runs table as a column. Check https://docs.neptune.ai/api-reference/neptune#init for more init arguments. interval (int): Logging interval (every k iterations). ignore_last (bool): Ignore the log of last iterations in each epoch if less than `interval`. reset_flag (bool): Whether to clear the output buffer after logging by_epoch (bool): Whether EpochBasedRunner is used. .. _NeptuneAI: https://docs.neptune.ai/you-should-know/logging-metadata """ def __init__(self, init_kwargs=None, interval=10, ignore_last=True, reset_flag=True, with_step=True, by_epoch=True): super(NeptuneLoggerHook, self).__init__(interval, ignore_last, reset_flag, by_epoch) self.import_neptune() self.init_kwargs = init_kwargs self.with_step = with_step def import_neptune(self): try: import neptune.new as neptune except ImportError: raise ImportError( 'Please run "pip install neptune-client" to install neptune') self.neptune = neptune self.run = None @master_only def before_run(self, runner): if self.init_kwargs: self.run = self.neptune.init(**self.init_kwargs) else: self.run = self.neptune.init() @master_only def log(self, runner): tags = self.get_loggable_tags(runner) if tags: for tag_name, tag_value in tags.items(): if self.with_step: self.run[tag_name].log( tag_value, step=self.get_iter(runner)) else: tags['global_step'] = self.get_iter(runner) self.run[tag_name].log(tags) @master_only def after_run(self, runner): self.run.stop() ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/pavi.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import json import os import os.path as osp import torch import yaml import annotator.mmpkg.mmcv as mmcv from ....parallel.utils import is_module_wrapper from ...dist_utils import master_only from ..hook import HOOKS from .base import LoggerHook @HOOKS.register_module() class PaviLoggerHook(LoggerHook): def __init__(self, init_kwargs=None, add_graph=False, add_last_ckpt=False, interval=10, ignore_last=True, reset_flag=False, by_epoch=True, img_key='img_info'): super(PaviLoggerHook, self).__init__(interval, ignore_last, reset_flag, by_epoch) self.init_kwargs = init_kwargs self.add_graph = add_graph self.add_last_ckpt = add_last_ckpt self.img_key = img_key @master_only def before_run(self, runner): super(PaviLoggerHook, self).before_run(runner) try: from pavi import SummaryWriter except ImportError: raise ImportError('Please run "pip install pavi" to install pavi.') self.run_name = runner.work_dir.split('/')[-1] if not self.init_kwargs: self.init_kwargs = dict() self.init_kwargs['name'] = self.run_name self.init_kwargs['model'] = runner._model_name if runner.meta is not None: if 'config_dict' in runner.meta: config_dict = runner.meta['config_dict'] assert isinstance( config_dict, dict), ('meta["config_dict"] has to be of a dict, ' f'but got {type(config_dict)}') elif 'config_file' in runner.meta: config_file = runner.meta['config_file'] config_dict = dict(mmcv.Config.fromfile(config_file)) else: config_dict = None if config_dict is not None: # 'max_.*iter' is parsed in pavi sdk as the maximum iterations # to properly set up the progress bar. config_dict = config_dict.copy() config_dict.setdefault('max_iter', runner.max_iters) # non-serializable values are first converted in # mmcv.dump to json config_dict = json.loads( mmcv.dump(config_dict, file_format='json')) session_text = yaml.dump(config_dict) self.init_kwargs['session_text'] = session_text self.writer = SummaryWriter(**self.init_kwargs) def get_step(self, runner): """Get the total training step/epoch.""" if self.get_mode(runner) == 'val' and self.by_epoch: return self.get_epoch(runner) else: return self.get_iter(runner) @master_only def log(self, runner): tags = self.get_loggable_tags(runner, add_mode=False) if tags: self.writer.add_scalars( self.get_mode(runner), tags, self.get_step(runner)) @master_only def after_run(self, runner): if self.add_last_ckpt: ckpt_path = osp.join(runner.work_dir, 'latest.pth') if osp.islink(ckpt_path): ckpt_path = osp.join(runner.work_dir, os.readlink(ckpt_path)) if osp.isfile(ckpt_path): # runner.epoch += 1 has been done before `after_run`. iteration = runner.epoch if self.by_epoch else runner.iter return self.writer.add_snapshot_file( tag=self.run_name, snapshot_file_path=ckpt_path, iteration=iteration) # flush the buffer and send a task ending signal to Pavi self.writer.close() @master_only def before_epoch(self, runner): if runner.epoch == 0 and self.add_graph: if is_module_wrapper(runner.model): _model = runner.model.module else: _model = runner.model device = next(_model.parameters()).device data = next(iter(runner.data_loader)) image = data[self.img_key][0:1].to(device) with torch.no_grad(): self.writer.add_graph(_model, image) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/tensorboard.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp from annotator.mmpkg.mmcv.utils import TORCH_VERSION, digit_version from ...dist_utils import master_only from ..hook import HOOKS from .base import LoggerHook @HOOKS.register_module() class TensorboardLoggerHook(LoggerHook): def __init__(self, log_dir=None, interval=10, ignore_last=True, reset_flag=False, by_epoch=True): super(TensorboardLoggerHook, self).__init__(interval, ignore_last, reset_flag, by_epoch) self.log_dir = log_dir @master_only def before_run(self, runner): super(TensorboardLoggerHook, self).before_run(runner) if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.1')): try: from tensorboardX import SummaryWriter except ImportError: raise ImportError('Please install tensorboardX to use ' 'TensorboardLoggerHook.') else: try: from torch.utils.tensorboard import SummaryWriter except ImportError: raise ImportError( 'Please run "pip install future tensorboard" to install ' 'the dependencies to use torch.utils.tensorboard ' '(applicable to PyTorch 1.1 or higher)') if self.log_dir is None: self.log_dir = osp.join(runner.work_dir, 'tf_logs') self.writer = SummaryWriter(self.log_dir) @master_only def log(self, runner): tags = self.get_loggable_tags(runner, allow_text=True) for tag, val in tags.items(): if isinstance(val, str): self.writer.add_text(tag, val, self.get_iter(runner)) else: self.writer.add_scalar(tag, val, self.get_iter(runner)) @master_only def after_run(self, runner): self.writer.close() ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/text.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import datetime import os import os.path as osp from collections import OrderedDict import torch import torch.distributed as dist import annotator.mmpkg.mmcv as mmcv from annotator.mmpkg.mmcv.fileio.file_client import FileClient from annotator.mmpkg.mmcv.utils import is_tuple_of, scandir from ..hook import HOOKS from .base import LoggerHook @HOOKS.register_module() class TextLoggerHook(LoggerHook): """Logger hook in text. In this logger hook, the information will be printed on terminal and saved in json file. Args: by_epoch (bool, optional): Whether EpochBasedRunner is used. Default: True. interval (int, optional): Logging interval (every k iterations). Default: 10. ignore_last (bool, optional): Ignore the log of last iterations in each epoch if less than :attr:`interval`. Default: True. reset_flag (bool, optional): Whether to clear the output buffer after logging. Default: False. interval_exp_name (int, optional): Logging interval for experiment name. This feature is to help users conveniently get the experiment information from screen or log file. Default: 1000. out_dir (str, optional): Logs are saved in ``runner.work_dir`` default. If ``out_dir`` is specified, logs will be copied to a new directory which is the concatenation of ``out_dir`` and the last level directory of ``runner.work_dir``. Default: None. `New in version 1.3.16.` out_suffix (str or tuple[str], optional): Those filenames ending with ``out_suffix`` will be copied to ``out_dir``. Default: ('.log.json', '.log', '.py'). `New in version 1.3.16.` keep_local (bool, optional): Whether to keep local log when :attr:`out_dir` is specified. If False, the local log will be removed. Default: True. `New in version 1.3.16.` file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Default: None. `New in version 1.3.16.` """ def __init__(self, by_epoch=True, interval=10, ignore_last=True, reset_flag=False, interval_exp_name=1000, out_dir=None, out_suffix=('.log.json', '.log', '.py'), keep_local=True, file_client_args=None): super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag, by_epoch) self.by_epoch = by_epoch self.time_sec_tot = 0 self.interval_exp_name = interval_exp_name if out_dir is None and file_client_args is not None: raise ValueError( 'file_client_args should be "None" when `out_dir` is not' 'specified.') self.out_dir = out_dir if not (out_dir is None or isinstance(out_dir, str) or is_tuple_of(out_dir, str)): raise TypeError('out_dir should be "None" or string or tuple of ' 'string, but got {out_dir}') self.out_suffix = out_suffix self.keep_local = keep_local self.file_client_args = file_client_args if self.out_dir is not None: self.file_client = FileClient.infer_client(file_client_args, self.out_dir) def before_run(self, runner): super(TextLoggerHook, self).before_run(runner) if self.out_dir is not None: self.file_client = FileClient.infer_client(self.file_client_args, self.out_dir) # The final `self.out_dir` is the concatenation of `self.out_dir` # and the last level directory of `runner.work_dir` basename = osp.basename(runner.work_dir.rstrip(osp.sep)) self.out_dir = self.file_client.join_path(self.out_dir, basename) runner.logger.info( (f'Text logs will be saved to {self.out_dir} by ' f'{self.file_client.name} after the training process.')) self.start_iter = runner.iter self.json_log_path = osp.join(runner.work_dir, f'{runner.timestamp}.log.json') if runner.meta is not None: self._dump_log(runner.meta, runner) def _get_max_memory(self, runner): device = getattr(runner.model, 'output_device', None) mem = torch.cuda.max_memory_allocated(device=device) mem_mb = torch.tensor([mem / (1024 * 1024)], dtype=torch.int, device=device) if runner.world_size > 1: dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX) return mem_mb.item() def _log_info(self, log_dict, runner): # print exp name for users to distinguish experiments # at every ``interval_exp_name`` iterations and the end of each epoch if runner.meta is not None and 'exp_name' in runner.meta: if (self.every_n_iters(runner, self.interval_exp_name)) or ( self.by_epoch and self.end_of_epoch(runner)): exp_info = f'Exp name: {runner.meta["exp_name"]}' runner.logger.info(exp_info) if log_dict['mode'] == 'train': if isinstance(log_dict['lr'], dict): lr_str = [] for k, val in log_dict['lr'].items(): lr_str.append(f'lr_{k}: {val:.3e}') lr_str = ' '.join(lr_str) else: lr_str = f'lr: {log_dict["lr"]:.3e}' # by epoch: Epoch [4][100/1000] # by iter: Iter [100/100000] if self.by_epoch: log_str = f'Epoch [{log_dict["epoch"]}]' \ f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t' else: log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t' log_str += f'{lr_str}, ' if 'time' in log_dict.keys(): self.time_sec_tot += (log_dict['time'] * self.interval) time_sec_avg = self.time_sec_tot / ( runner.iter - self.start_iter + 1) eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1) eta_str = str(datetime.timedelta(seconds=int(eta_sec))) log_str += f'eta: {eta_str}, ' log_str += f'time: {log_dict["time"]:.3f}, ' \ f'data_time: {log_dict["data_time"]:.3f}, ' # statistic memory if torch.cuda.is_available(): log_str += f'memory: {log_dict["memory"]}, ' else: # val/test time # here 1000 is the length of the val dataloader # by epoch: Epoch[val] [4][1000] # by iter: Iter[val] [1000] if self.by_epoch: log_str = f'Epoch({log_dict["mode"]}) ' \ f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t' else: log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' log_items = [] for name, val in log_dict.items(): # TODO: resolve this hack # these items have been in log_str if name in [ 'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time', 'memory', 'epoch' ]: continue if isinstance(val, float): val = f'{val:.4f}' log_items.append(f'{name}: {val}') log_str += ', '.join(log_items) runner.logger.info(log_str) def _dump_log(self, log_dict, runner): # dump log in json format json_log = OrderedDict() for k, v in log_dict.items(): json_log[k] = self._round_float(v) # only append log at last line if runner.rank == 0: with open(self.json_log_path, 'a+') as f: mmcv.dump(json_log, f, file_format='json') f.write('\n') def _round_float(self, items): if isinstance(items, list): return [self._round_float(item) for item in items] elif isinstance(items, float): return round(items, 5) else: return items def log(self, runner): if 'eval_iter_num' in runner.log_buffer.output: # this doesn't modify runner.iter and is regardless of by_epoch cur_iter = runner.log_buffer.output.pop('eval_iter_num') else: cur_iter = self.get_iter(runner, inner_iter=True) log_dict = OrderedDict( mode=self.get_mode(runner), epoch=self.get_epoch(runner), iter=cur_iter) # only record lr of the first param group cur_lr = runner.current_lr() if isinstance(cur_lr, list): log_dict['lr'] = cur_lr[0] else: assert isinstance(cur_lr, dict) log_dict['lr'] = {} for k, lr_ in cur_lr.items(): assert isinstance(lr_, list) log_dict['lr'].update({k: lr_[0]}) if 'time' in runner.log_buffer.output: # statistic memory if torch.cuda.is_available(): log_dict['memory'] = self._get_max_memory(runner) log_dict = dict(log_dict, **runner.log_buffer.output) self._log_info(log_dict, runner) self._dump_log(log_dict, runner) return log_dict def after_run(self, runner): # copy or upload logs to self.out_dir if self.out_dir is not None: for filename in scandir(runner.work_dir, self.out_suffix, True): local_filepath = osp.join(runner.work_dir, filename) out_filepath = self.file_client.join_path( self.out_dir, filename) with open(local_filepath, 'r') as f: self.file_client.put_text(f.read(), out_filepath) runner.logger.info( (f'The file {local_filepath} has been uploaded to ' f'{out_filepath}.')) if not self.keep_local: os.remove(local_filepath) runner.logger.info( (f'{local_filepath} was removed due to the ' '`self.keep_local=False`')) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/logger/wandb.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ...dist_utils import master_only from ..hook import HOOKS from .base import LoggerHook @HOOKS.register_module() class WandbLoggerHook(LoggerHook): def __init__(self, init_kwargs=None, interval=10, ignore_last=True, reset_flag=False, commit=True, by_epoch=True, with_step=True): super(WandbLoggerHook, self).__init__(interval, ignore_last, reset_flag, by_epoch) self.import_wandb() self.init_kwargs = init_kwargs self.commit = commit self.with_step = with_step def import_wandb(self): try: import wandb except ImportError: raise ImportError( 'Please run "pip install wandb" to install wandb') self.wandb = wandb @master_only def before_run(self, runner): super(WandbLoggerHook, self).before_run(runner) if self.wandb is None: self.import_wandb() if self.init_kwargs: self.wandb.init(**self.init_kwargs) else: self.wandb.init() @master_only def log(self, runner): tags = self.get_loggable_tags(runner) if tags: if self.with_step: self.wandb.log( tags, step=self.get_iter(runner), commit=self.commit) else: tags['global_step'] = self.get_iter(runner) self.wandb.log(tags, commit=self.commit) @master_only def after_run(self, runner): self.wandb.join() ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/lr_updater.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numbers from math import cos, pi import annotator.mmpkg.mmcv as mmcv from .hook import HOOKS, Hook class LrUpdaterHook(Hook): """LR Scheduler in MMCV. Args: by_epoch (bool): LR changes epoch by epoch warmup (string): Type of warmup used. It can be None(use no warmup), 'constant', 'linear' or 'exp' warmup_iters (int): The number of iterations or epochs that warmup lasts warmup_ratio (float): LR used at the beginning of warmup equals to warmup_ratio * initial_lr warmup_by_epoch (bool): When warmup_by_epoch == True, warmup_iters means the number of epochs that warmup lasts, otherwise means the number of iteration that warmup lasts """ def __init__(self, by_epoch=True, warmup=None, warmup_iters=0, warmup_ratio=0.1, warmup_by_epoch=False): # validate the "warmup" argument if warmup is not None: if warmup not in ['constant', 'linear', 'exp']: raise ValueError( f'"{warmup}" is not a supported type for warming up, valid' ' types are "constant" and "linear"') if warmup is not None: assert warmup_iters > 0, \ '"warmup_iters" must be a positive integer' assert 0 < warmup_ratio <= 1.0, \ '"warmup_ratio" must be in range (0,1]' self.by_epoch = by_epoch self.warmup = warmup self.warmup_iters = warmup_iters self.warmup_ratio = warmup_ratio self.warmup_by_epoch = warmup_by_epoch if self.warmup_by_epoch: self.warmup_epochs = self.warmup_iters self.warmup_iters = None else: self.warmup_epochs = None self.base_lr = [] # initial lr for all param groups self.regular_lr = [] # expected lr if no warming up is performed def _set_lr(self, runner, lr_groups): if isinstance(runner.optimizer, dict): for k, optim in runner.optimizer.items(): for param_group, lr in zip(optim.param_groups, lr_groups[k]): param_group['lr'] = lr else: for param_group, lr in zip(runner.optimizer.param_groups, lr_groups): param_group['lr'] = lr def get_lr(self, runner, base_lr): raise NotImplementedError def get_regular_lr(self, runner): if isinstance(runner.optimizer, dict): lr_groups = {} for k in runner.optimizer.keys(): _lr_group = [ self.get_lr(runner, _base_lr) for _base_lr in self.base_lr[k] ] lr_groups.update({k: _lr_group}) return lr_groups else: return [self.get_lr(runner, _base_lr) for _base_lr in self.base_lr] def get_warmup_lr(self, cur_iters): def _get_warmup_lr(cur_iters, regular_lr): if self.warmup == 'constant': warmup_lr = [_lr * self.warmup_ratio for _lr in regular_lr] elif self.warmup == 'linear': k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio) warmup_lr = [_lr * (1 - k) for _lr in regular_lr] elif self.warmup == 'exp': k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters) warmup_lr = [_lr * k for _lr in regular_lr] return warmup_lr if isinstance(self.regular_lr, dict): lr_groups = {} for key, regular_lr in self.regular_lr.items(): lr_groups[key] = _get_warmup_lr(cur_iters, regular_lr) return lr_groups else: return _get_warmup_lr(cur_iters, self.regular_lr) def before_run(self, runner): # NOTE: when resuming from a checkpoint, if 'initial_lr' is not saved, # it will be set according to the optimizer params if isinstance(runner.optimizer, dict): self.base_lr = {} for k, optim in runner.optimizer.items(): for group in optim.param_groups: group.setdefault('initial_lr', group['lr']) _base_lr = [ group['initial_lr'] for group in optim.param_groups ] self.base_lr.update({k: _base_lr}) else: for group in runner.optimizer.param_groups: group.setdefault('initial_lr', group['lr']) self.base_lr = [ group['initial_lr'] for group in runner.optimizer.param_groups ] def before_train_epoch(self, runner): if self.warmup_iters is None: epoch_len = len(runner.data_loader) self.warmup_iters = self.warmup_epochs * epoch_len if not self.by_epoch: return self.regular_lr = self.get_regular_lr(runner) self._set_lr(runner, self.regular_lr) def before_train_iter(self, runner): cur_iter = runner.iter if not self.by_epoch: self.regular_lr = self.get_regular_lr(runner) if self.warmup is None or cur_iter >= self.warmup_iters: self._set_lr(runner, self.regular_lr) else: warmup_lr = self.get_warmup_lr(cur_iter) self._set_lr(runner, warmup_lr) elif self.by_epoch: if self.warmup is None or cur_iter > self.warmup_iters: return elif cur_iter == self.warmup_iters: self._set_lr(runner, self.regular_lr) else: warmup_lr = self.get_warmup_lr(cur_iter) self._set_lr(runner, warmup_lr) @HOOKS.register_module() class FixedLrUpdaterHook(LrUpdaterHook): def __init__(self, **kwargs): super(FixedLrUpdaterHook, self).__init__(**kwargs) def get_lr(self, runner, base_lr): return base_lr @HOOKS.register_module() class StepLrUpdaterHook(LrUpdaterHook): """Step LR scheduler with min_lr clipping. Args: step (int | list[int]): Step to decay the LR. If an int value is given, regard it as the decay interval. If a list is given, decay LR at these steps. gamma (float, optional): Decay LR ratio. Default: 0.1. min_lr (float, optional): Minimum LR value to keep. If LR after decay is lower than `min_lr`, it will be clipped to this value. If None is given, we don't perform lr clipping. Default: None. """ def __init__(self, step, gamma=0.1, min_lr=None, **kwargs): if isinstance(step, list): assert mmcv.is_list_of(step, int) assert all([s > 0 for s in step]) elif isinstance(step, int): assert step > 0 else: raise TypeError('"step" must be a list or integer') self.step = step self.gamma = gamma self.min_lr = min_lr super(StepLrUpdaterHook, self).__init__(**kwargs) def get_lr(self, runner, base_lr): progress = runner.epoch if self.by_epoch else runner.iter # calculate exponential term if isinstance(self.step, int): exp = progress // self.step else: exp = len(self.step) for i, s in enumerate(self.step): if progress < s: exp = i break lr = base_lr * (self.gamma**exp) if self.min_lr is not None: # clip to a minimum value lr = max(lr, self.min_lr) return lr @HOOKS.register_module() class ExpLrUpdaterHook(LrUpdaterHook): def __init__(self, gamma, **kwargs): self.gamma = gamma super(ExpLrUpdaterHook, self).__init__(**kwargs) def get_lr(self, runner, base_lr): progress = runner.epoch if self.by_epoch else runner.iter return base_lr * self.gamma**progress @HOOKS.register_module() class PolyLrUpdaterHook(LrUpdaterHook): def __init__(self, power=1., min_lr=0., **kwargs): self.power = power self.min_lr = min_lr super(PolyLrUpdaterHook, self).__init__(**kwargs) def get_lr(self, runner, base_lr): if self.by_epoch: progress = runner.epoch max_progress = runner.max_epochs else: progress = runner.iter max_progress = runner.max_iters coeff = (1 - progress / max_progress)**self.power return (base_lr - self.min_lr) * coeff + self.min_lr @HOOKS.register_module() class InvLrUpdaterHook(LrUpdaterHook): def __init__(self, gamma, power=1., **kwargs): self.gamma = gamma self.power = power super(InvLrUpdaterHook, self).__init__(**kwargs) def get_lr(self, runner, base_lr): progress = runner.epoch if self.by_epoch else runner.iter return base_lr * (1 + self.gamma * progress)**(-self.power) @HOOKS.register_module() class CosineAnnealingLrUpdaterHook(LrUpdaterHook): def __init__(self, min_lr=None, min_lr_ratio=None, **kwargs): assert (min_lr is None) ^ (min_lr_ratio is None) self.min_lr = min_lr self.min_lr_ratio = min_lr_ratio super(CosineAnnealingLrUpdaterHook, self).__init__(**kwargs) def get_lr(self, runner, base_lr): if self.by_epoch: progress = runner.epoch max_progress = runner.max_epochs else: progress = runner.iter max_progress = runner.max_iters if self.min_lr_ratio is not None: target_lr = base_lr * self.min_lr_ratio else: target_lr = self.min_lr return annealing_cos(base_lr, target_lr, progress / max_progress) @HOOKS.register_module() class FlatCosineAnnealingLrUpdaterHook(LrUpdaterHook): """Flat + Cosine lr schedule. Modified from https://github.com/fastai/fastai/blob/master/fastai/callback/schedule.py#L128 # noqa: E501 Args: start_percent (float): When to start annealing the learning rate after the percentage of the total training steps. The value should be in range [0, 1). Default: 0.75 min_lr (float, optional): The minimum lr. Default: None. min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. Either `min_lr` or `min_lr_ratio` should be specified. Default: None. """ def __init__(self, start_percent=0.75, min_lr=None, min_lr_ratio=None, **kwargs): assert (min_lr is None) ^ (min_lr_ratio is None) if start_percent < 0 or start_percent > 1 or not isinstance( start_percent, float): raise ValueError( 'expected float between 0 and 1 start_percent, but ' f'got {start_percent}') self.start_percent = start_percent self.min_lr = min_lr self.min_lr_ratio = min_lr_ratio super(FlatCosineAnnealingLrUpdaterHook, self).__init__(**kwargs) def get_lr(self, runner, base_lr): if self.by_epoch: start = round(runner.max_epochs * self.start_percent) progress = runner.epoch - start max_progress = runner.max_epochs - start else: start = round(runner.max_iters * self.start_percent) progress = runner.iter - start max_progress = runner.max_iters - start if self.min_lr_ratio is not None: target_lr = base_lr * self.min_lr_ratio else: target_lr = self.min_lr if progress < 0: return base_lr else: return annealing_cos(base_lr, target_lr, progress / max_progress) @HOOKS.register_module() class CosineRestartLrUpdaterHook(LrUpdaterHook): """Cosine annealing with restarts learning rate scheme. Args: periods (list[int]): Periods for each cosine anneling cycle. restart_weights (list[float], optional): Restart weights at each restart iteration. Default: [1]. min_lr (float, optional): The minimum lr. Default: None. min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. Either `min_lr` or `min_lr_ratio` should be specified. Default: None. """ def __init__(self, periods, restart_weights=[1], min_lr=None, min_lr_ratio=None, **kwargs): assert (min_lr is None) ^ (min_lr_ratio is None) self.periods = periods self.min_lr = min_lr self.min_lr_ratio = min_lr_ratio self.restart_weights = restart_weights assert (len(self.periods) == len(self.restart_weights) ), 'periods and restart_weights should have the same length.' super(CosineRestartLrUpdaterHook, self).__init__(**kwargs) self.cumulative_periods = [ sum(self.periods[0:i + 1]) for i in range(0, len(self.periods)) ] def get_lr(self, runner, base_lr): if self.by_epoch: progress = runner.epoch else: progress = runner.iter if self.min_lr_ratio is not None: target_lr = base_lr * self.min_lr_ratio else: target_lr = self.min_lr idx = get_position_from_periods(progress, self.cumulative_periods) current_weight = self.restart_weights[idx] nearest_restart = 0 if idx == 0 else self.cumulative_periods[idx - 1] current_periods = self.periods[idx] alpha = min((progress - nearest_restart) / current_periods, 1) return annealing_cos(base_lr, target_lr, alpha, current_weight) def get_position_from_periods(iteration, cumulative_periods): """Get the position from a period list. It will return the index of the right-closest number in the period list. For example, the cumulative_periods = [100, 200, 300, 400], if iteration == 50, return 0; if iteration == 210, return 2; if iteration == 300, return 3. Args: iteration (int): Current iteration. cumulative_periods (list[int]): Cumulative period list. Returns: int: The position of the right-closest number in the period list. """ for i, period in enumerate(cumulative_periods): if iteration < period: return i raise ValueError(f'Current iteration {iteration} exceeds ' f'cumulative_periods {cumulative_periods}') @HOOKS.register_module() class CyclicLrUpdaterHook(LrUpdaterHook): """Cyclic LR Scheduler. Implement the cyclical learning rate policy (CLR) described in https://arxiv.org/pdf/1506.01186.pdf Different from the original paper, we use cosine annealing rather than triangular policy inside a cycle. This improves the performance in the 3D detection area. Args: by_epoch (bool): Whether to update LR by epoch. target_ratio (tuple[float]): Relative ratio of the highest LR and the lowest LR to the initial LR. cyclic_times (int): Number of cycles during training step_ratio_up (float): The ratio of the increasing process of LR in the total cycle. anneal_strategy (str): {'cos', 'linear'} Specifies the annealing strategy: 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'. """ def __init__(self, by_epoch=False, target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, anneal_strategy='cos', **kwargs): if isinstance(target_ratio, float): target_ratio = (target_ratio, target_ratio / 1e5) elif isinstance(target_ratio, tuple): target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ if len(target_ratio) == 1 else target_ratio else: raise ValueError('target_ratio should be either float ' f'or tuple, got {type(target_ratio)}') assert len(target_ratio) == 2, \ '"target_ratio" must be list or tuple of two floats' assert 0 <= step_ratio_up < 1.0, \ '"step_ratio_up" must be in range [0,1)' self.target_ratio = target_ratio self.cyclic_times = cyclic_times self.step_ratio_up = step_ratio_up self.lr_phases = [] # init lr_phases # validate anneal_strategy if anneal_strategy not in ['cos', 'linear']: raise ValueError('anneal_strategy must be one of "cos" or ' f'"linear", instead got {anneal_strategy}') elif anneal_strategy == 'cos': self.anneal_func = annealing_cos elif anneal_strategy == 'linear': self.anneal_func = annealing_linear assert not by_epoch, \ 'currently only support "by_epoch" = False' super(CyclicLrUpdaterHook, self).__init__(by_epoch, **kwargs) def before_run(self, runner): super(CyclicLrUpdaterHook, self).before_run(runner) # initiate lr_phases # total lr_phases are separated as up and down max_iter_per_phase = runner.max_iters // self.cyclic_times iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) self.lr_phases.append( [0, iter_up_phase, max_iter_per_phase, 1, self.target_ratio[0]]) self.lr_phases.append([ iter_up_phase, max_iter_per_phase, max_iter_per_phase, self.target_ratio[0], self.target_ratio[1] ]) def get_lr(self, runner, base_lr): curr_iter = runner.iter for (start_iter, end_iter, max_iter_per_phase, start_ratio, end_ratio) in self.lr_phases: curr_iter %= max_iter_per_phase if start_iter <= curr_iter < end_iter: progress = curr_iter - start_iter return self.anneal_func(base_lr * start_ratio, base_lr * end_ratio, progress / (end_iter - start_iter)) @HOOKS.register_module() class OneCycleLrUpdaterHook(LrUpdaterHook): """One Cycle LR Scheduler. The 1cycle learning rate policy changes the learning rate after every batch. The one cycle learning rate policy is described in https://arxiv.org/pdf/1708.07120.pdf Args: max_lr (float or list): Upper learning rate boundaries in the cycle for each parameter group. total_steps (int, optional): The total number of steps in the cycle. Note that if a value is not provided here, it will be the max_iter of runner. Default: None. pct_start (float): The percentage of the cycle (in number of steps) spent increasing the learning rate. Default: 0.3 anneal_strategy (str): {'cos', 'linear'} Specifies the annealing strategy: 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos' div_factor (float): Determines the initial learning rate via initial_lr = max_lr/div_factor Default: 25 final_div_factor (float): Determines the minimum learning rate via min_lr = initial_lr/final_div_factor Default: 1e4 three_phase (bool): If three_phase is True, use a third phase of the schedule to annihilate the learning rate according to final_div_factor instead of modifying the second phase (the first two phases will be symmetrical about the step indicated by pct_start). Default: False """ def __init__(self, max_lr, total_steps=None, pct_start=0.3, anneal_strategy='cos', div_factor=25, final_div_factor=1e4, three_phase=False, **kwargs): # validate by_epoch, currently only support by_epoch = False if 'by_epoch' not in kwargs: kwargs['by_epoch'] = False else: assert not kwargs['by_epoch'], \ 'currently only support "by_epoch" = False' if not isinstance(max_lr, (numbers.Number, list, dict)): raise ValueError('the type of max_lr must be the one of list or ' f'dict, but got {type(max_lr)}') self._max_lr = max_lr if total_steps is not None: if not isinstance(total_steps, int): raise ValueError('the type of total_steps must be int, but' f'got {type(total_steps)}') self.total_steps = total_steps # validate pct_start if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): raise ValueError('expected float between 0 and 1 pct_start, but ' f'got {pct_start}') self.pct_start = pct_start # validate anneal_strategy if anneal_strategy not in ['cos', 'linear']: raise ValueError('anneal_strategy must be one of "cos" or ' f'"linear", instead got {anneal_strategy}') elif anneal_strategy == 'cos': self.anneal_func = annealing_cos elif anneal_strategy == 'linear': self.anneal_func = annealing_linear self.div_factor = div_factor self.final_div_factor = final_div_factor self.three_phase = three_phase self.lr_phases = [] # init lr_phases super(OneCycleLrUpdaterHook, self).__init__(**kwargs) def before_run(self, runner): if hasattr(self, 'total_steps'): total_steps = self.total_steps else: total_steps = runner.max_iters if total_steps < runner.max_iters: raise ValueError( 'The total steps must be greater than or equal to max ' f'iterations {runner.max_iters} of runner, but total steps ' f'is {total_steps}.') if isinstance(runner.optimizer, dict): self.base_lr = {} for k, optim in runner.optimizer.items(): _max_lr = format_param(k, optim, self._max_lr) self.base_lr[k] = [lr / self.div_factor for lr in _max_lr] for group, lr in zip(optim.param_groups, self.base_lr[k]): group.setdefault('initial_lr', lr) else: k = type(runner.optimizer).__name__ _max_lr = format_param(k, runner.optimizer, self._max_lr) self.base_lr = [lr / self.div_factor for lr in _max_lr] for group, lr in zip(runner.optimizer.param_groups, self.base_lr): group.setdefault('initial_lr', lr) if self.three_phase: self.lr_phases.append( [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) self.lr_phases.append([ float(2 * self.pct_start * total_steps) - 2, self.div_factor, 1 ]) self.lr_phases.append( [total_steps - 1, 1, 1 / self.final_div_factor]) else: self.lr_phases.append( [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) self.lr_phases.append( [total_steps - 1, self.div_factor, 1 / self.final_div_factor]) def get_lr(self, runner, base_lr): curr_iter = runner.iter start_iter = 0 for i, (end_iter, start_lr, end_lr) in enumerate(self.lr_phases): if curr_iter <= end_iter: pct = (curr_iter - start_iter) / (end_iter - start_iter) lr = self.anneal_func(base_lr * start_lr, base_lr * end_lr, pct) break start_iter = end_iter return lr def annealing_cos(start, end, factor, weight=1): """Calculate annealing cos learning rate. Cosine anneal from `weight * start + (1 - weight) * end` to `end` as percentage goes from 0.0 to 1.0. Args: start (float): The starting learning rate of the cosine annealing. end (float): The ending learing rate of the cosine annealing. factor (float): The coefficient of `pi` when calculating the current percentage. Range from 0.0 to 1.0. weight (float, optional): The combination factor of `start` and `end` when calculating the actual starting learning rate. Default to 1. """ cos_out = cos(pi * factor) + 1 return end + 0.5 * weight * (start - end) * cos_out def annealing_linear(start, end, factor): """Calculate annealing linear learning rate. Linear anneal from `start` to `end` as percentage goes from 0.0 to 1.0. Args: start (float): The starting learning rate of the linear annealing. end (float): The ending learing rate of the linear annealing. factor (float): The coefficient of `pi` when calculating the current percentage. Range from 0.0 to 1.0. """ return start + (end - start) * factor def format_param(name, optim, param): if isinstance(param, numbers.Number): return [param] * len(optim.param_groups) elif isinstance(param, (list, tuple)): # multi param groups if len(param) != len(optim.param_groups): raise ValueError(f'expected {len(optim.param_groups)} ' f'values for {name}, got {len(param)}') return param else: # multi optimizers if name not in param: raise KeyError(f'{name} is not found in {param.keys()}') return param[name] ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/memory.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from .hook import HOOKS, Hook @HOOKS.register_module() class EmptyCacheHook(Hook): def __init__(self, before_epoch=False, after_epoch=True, after_iter=False): self._before_epoch = before_epoch self._after_epoch = after_epoch self._after_iter = after_iter def after_iter(self, runner): if self._after_iter: torch.cuda.empty_cache() def before_epoch(self, runner): if self._before_epoch: torch.cuda.empty_cache() def after_epoch(self, runner): if self._after_epoch: torch.cuda.empty_cache() ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/momentum_updater.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import annotator.mmpkg.mmcv as mmcv from .hook import HOOKS, Hook from .lr_updater import annealing_cos, annealing_linear, format_param class MomentumUpdaterHook(Hook): def __init__(self, by_epoch=True, warmup=None, warmup_iters=0, warmup_ratio=0.9): # validate the "warmup" argument if warmup is not None: if warmup not in ['constant', 'linear', 'exp']: raise ValueError( f'"{warmup}" is not a supported type for warming up, valid' ' types are "constant" and "linear"') if warmup is not None: assert warmup_iters > 0, \ '"warmup_iters" must be a positive integer' assert 0 < warmup_ratio <= 1.0, \ '"warmup_momentum" must be in range (0,1]' self.by_epoch = by_epoch self.warmup = warmup self.warmup_iters = warmup_iters self.warmup_ratio = warmup_ratio self.base_momentum = [] # initial momentum for all param groups self.regular_momentum = [ ] # expected momentum if no warming up is performed def _set_momentum(self, runner, momentum_groups): if isinstance(runner.optimizer, dict): for k, optim in runner.optimizer.items(): for param_group, mom in zip(optim.param_groups, momentum_groups[k]): if 'momentum' in param_group.keys(): param_group['momentum'] = mom elif 'betas' in param_group.keys(): param_group['betas'] = (mom, param_group['betas'][1]) else: for param_group, mom in zip(runner.optimizer.param_groups, momentum_groups): if 'momentum' in param_group.keys(): param_group['momentum'] = mom elif 'betas' in param_group.keys(): param_group['betas'] = (mom, param_group['betas'][1]) def get_momentum(self, runner, base_momentum): raise NotImplementedError def get_regular_momentum(self, runner): if isinstance(runner.optimizer, dict): momentum_groups = {} for k in runner.optimizer.keys(): _momentum_group = [ self.get_momentum(runner, _base_momentum) for _base_momentum in self.base_momentum[k] ] momentum_groups.update({k: _momentum_group}) return momentum_groups else: return [ self.get_momentum(runner, _base_momentum) for _base_momentum in self.base_momentum ] def get_warmup_momentum(self, cur_iters): def _get_warmup_momentum(cur_iters, regular_momentum): if self.warmup == 'constant': warmup_momentum = [ _momentum / self.warmup_ratio for _momentum in self.regular_momentum ] elif self.warmup == 'linear': k = (1 - cur_iters / self.warmup_iters) * (1 - self.warmup_ratio) warmup_momentum = [ _momentum / (1 - k) for _momentum in self.regular_mom ] elif self.warmup == 'exp': k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters) warmup_momentum = [ _momentum / k for _momentum in self.regular_mom ] return warmup_momentum if isinstance(self.regular_momentum, dict): momentum_groups = {} for key, regular_momentum in self.regular_momentum.items(): momentum_groups[key] = _get_warmup_momentum( cur_iters, regular_momentum) return momentum_groups else: return _get_warmup_momentum(cur_iters, self.regular_momentum) def before_run(self, runner): # NOTE: when resuming from a checkpoint, # if 'initial_momentum' is not saved, # it will be set according to the optimizer params if isinstance(runner.optimizer, dict): self.base_momentum = {} for k, optim in runner.optimizer.items(): for group in optim.param_groups: if 'momentum' in group.keys(): group.setdefault('initial_momentum', group['momentum']) else: group.setdefault('initial_momentum', group['betas'][0]) _base_momentum = [ group['initial_momentum'] for group in optim.param_groups ] self.base_momentum.update({k: _base_momentum}) else: for group in runner.optimizer.param_groups: if 'momentum' in group.keys(): group.setdefault('initial_momentum', group['momentum']) else: group.setdefault('initial_momentum', group['betas'][0]) self.base_momentum = [ group['initial_momentum'] for group in runner.optimizer.param_groups ] def before_train_epoch(self, runner): if not self.by_epoch: return self.regular_mom = self.get_regular_momentum(runner) self._set_momentum(runner, self.regular_mom) def before_train_iter(self, runner): cur_iter = runner.iter if not self.by_epoch: self.regular_mom = self.get_regular_momentum(runner) if self.warmup is None or cur_iter >= self.warmup_iters: self._set_momentum(runner, self.regular_mom) else: warmup_momentum = self.get_warmup_momentum(cur_iter) self._set_momentum(runner, warmup_momentum) elif self.by_epoch: if self.warmup is None or cur_iter > self.warmup_iters: return elif cur_iter == self.warmup_iters: self._set_momentum(runner, self.regular_mom) else: warmup_momentum = self.get_warmup_momentum(cur_iter) self._set_momentum(runner, warmup_momentum) @HOOKS.register_module() class StepMomentumUpdaterHook(MomentumUpdaterHook): """Step momentum scheduler with min value clipping. Args: step (int | list[int]): Step to decay the momentum. If an int value is given, regard it as the decay interval. If a list is given, decay momentum at these steps. gamma (float, optional): Decay momentum ratio. Default: 0.5. min_momentum (float, optional): Minimum momentum value to keep. If momentum after decay is lower than this value, it will be clipped accordingly. If None is given, we don't perform lr clipping. Default: None. """ def __init__(self, step, gamma=0.5, min_momentum=None, **kwargs): if isinstance(step, list): assert mmcv.is_list_of(step, int) assert all([s > 0 for s in step]) elif isinstance(step, int): assert step > 0 else: raise TypeError('"step" must be a list or integer') self.step = step self.gamma = gamma self.min_momentum = min_momentum super(StepMomentumUpdaterHook, self).__init__(**kwargs) def get_momentum(self, runner, base_momentum): progress = runner.epoch if self.by_epoch else runner.iter # calculate exponential term if isinstance(self.step, int): exp = progress // self.step else: exp = len(self.step) for i, s in enumerate(self.step): if progress < s: exp = i break momentum = base_momentum * (self.gamma**exp) if self.min_momentum is not None: # clip to a minimum value momentum = max(momentum, self.min_momentum) return momentum @HOOKS.register_module() class CosineAnnealingMomentumUpdaterHook(MomentumUpdaterHook): def __init__(self, min_momentum=None, min_momentum_ratio=None, **kwargs): assert (min_momentum is None) ^ (min_momentum_ratio is None) self.min_momentum = min_momentum self.min_momentum_ratio = min_momentum_ratio super(CosineAnnealingMomentumUpdaterHook, self).__init__(**kwargs) def get_momentum(self, runner, base_momentum): if self.by_epoch: progress = runner.epoch max_progress = runner.max_epochs else: progress = runner.iter max_progress = runner.max_iters if self.min_momentum_ratio is not None: target_momentum = base_momentum * self.min_momentum_ratio else: target_momentum = self.min_momentum return annealing_cos(base_momentum, target_momentum, progress / max_progress) @HOOKS.register_module() class CyclicMomentumUpdaterHook(MomentumUpdaterHook): """Cyclic momentum Scheduler. Implement the cyclical momentum scheduler policy described in https://arxiv.org/pdf/1708.07120.pdf This momentum scheduler usually used together with the CyclicLRUpdater to improve the performance in the 3D detection area. Attributes: target_ratio (tuple[float]): Relative ratio of the lowest momentum and the highest momentum to the initial momentum. cyclic_times (int): Number of cycles during training step_ratio_up (float): The ratio of the increasing process of momentum in the total cycle. by_epoch (bool): Whether to update momentum by epoch. """ def __init__(self, by_epoch=False, target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, **kwargs): if isinstance(target_ratio, float): target_ratio = (target_ratio, target_ratio / 1e5) elif isinstance(target_ratio, tuple): target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ if len(target_ratio) == 1 else target_ratio else: raise ValueError('target_ratio should be either float ' f'or tuple, got {type(target_ratio)}') assert len(target_ratio) == 2, \ '"target_ratio" must be list or tuple of two floats' assert 0 <= step_ratio_up < 1.0, \ '"step_ratio_up" must be in range [0,1)' self.target_ratio = target_ratio self.cyclic_times = cyclic_times self.step_ratio_up = step_ratio_up self.momentum_phases = [] # init momentum_phases # currently only support by_epoch=False assert not by_epoch, \ 'currently only support "by_epoch" = False' super(CyclicMomentumUpdaterHook, self).__init__(by_epoch, **kwargs) def before_run(self, runner): super(CyclicMomentumUpdaterHook, self).before_run(runner) # initiate momentum_phases # total momentum_phases are separated as up and down max_iter_per_phase = runner.max_iters // self.cyclic_times iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) self.momentum_phases.append( [0, iter_up_phase, max_iter_per_phase, 1, self.target_ratio[0]]) self.momentum_phases.append([ iter_up_phase, max_iter_per_phase, max_iter_per_phase, self.target_ratio[0], self.target_ratio[1] ]) def get_momentum(self, runner, base_momentum): curr_iter = runner.iter for (start_iter, end_iter, max_iter_per_phase, start_ratio, end_ratio) in self.momentum_phases: curr_iter %= max_iter_per_phase if start_iter <= curr_iter < end_iter: progress = curr_iter - start_iter return annealing_cos(base_momentum * start_ratio, base_momentum * end_ratio, progress / (end_iter - start_iter)) @HOOKS.register_module() class OneCycleMomentumUpdaterHook(MomentumUpdaterHook): """OneCycle momentum Scheduler. This momentum scheduler usually used together with the OneCycleLrUpdater to improve the performance. Args: base_momentum (float or list): Lower momentum boundaries in the cycle for each parameter group. Note that momentum is cycled inversely to learning rate; at the peak of a cycle, momentum is 'base_momentum' and learning rate is 'max_lr'. Default: 0.85 max_momentum (float or list): Upper momentum boundaries in the cycle for each parameter group. Functionally, it defines the cycle amplitude (max_momentum - base_momentum). Note that momentum is cycled inversely to learning rate; at the start of a cycle, momentum is 'max_momentum' and learning rate is 'base_lr' Default: 0.95 pct_start (float): The percentage of the cycle (in number of steps) spent increasing the learning rate. Default: 0.3 anneal_strategy (str): {'cos', 'linear'} Specifies the annealing strategy: 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos' three_phase (bool): If three_phase is True, use a third phase of the schedule to annihilate the learning rate according to final_div_factor instead of modifying the second phase (the first two phases will be symmetrical about the step indicated by pct_start). Default: False """ def __init__(self, base_momentum=0.85, max_momentum=0.95, pct_start=0.3, anneal_strategy='cos', three_phase=False, **kwargs): # validate by_epoch, currently only support by_epoch=False if 'by_epoch' not in kwargs: kwargs['by_epoch'] = False else: assert not kwargs['by_epoch'], \ 'currently only support "by_epoch" = False' if not isinstance(base_momentum, (float, list, dict)): raise ValueError('base_momentum must be the type among of float,' 'list or dict.') self._base_momentum = base_momentum if not isinstance(max_momentum, (float, list, dict)): raise ValueError('max_momentum must be the type among of float,' 'list or dict.') self._max_momentum = max_momentum # validate pct_start if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): raise ValueError('Expected float between 0 and 1 pct_start, but ' f'got {pct_start}') self.pct_start = pct_start # validate anneal_strategy if anneal_strategy not in ['cos', 'linear']: raise ValueError('anneal_strategy must by one of "cos" or ' f'"linear", instead got {anneal_strategy}') elif anneal_strategy == 'cos': self.anneal_func = annealing_cos elif anneal_strategy == 'linear': self.anneal_func = annealing_linear self.three_phase = three_phase self.momentum_phases = [] # init momentum_phases super(OneCycleMomentumUpdaterHook, self).__init__(**kwargs) def before_run(self, runner): if isinstance(runner.optimizer, dict): for k, optim in runner.optimizer.items(): if ('momentum' not in optim.defaults and 'betas' not in optim.defaults): raise ValueError('optimizer must support momentum with' 'option enabled') self.use_beta1 = 'betas' in optim.defaults _base_momentum = format_param(k, optim, self._base_momentum) _max_momentum = format_param(k, optim, self._max_momentum) for group, b_momentum, m_momentum in zip( optim.param_groups, _base_momentum, _max_momentum): if self.use_beta1: _, beta2 = group['betas'] group['betas'] = (m_momentum, beta2) else: group['momentum'] = m_momentum group['base_momentum'] = b_momentum group['max_momentum'] = m_momentum else: optim = runner.optimizer if ('momentum' not in optim.defaults and 'betas' not in optim.defaults): raise ValueError('optimizer must support momentum with' 'option enabled') self.use_beta1 = 'betas' in optim.defaults k = type(optim).__name__ _base_momentum = format_param(k, optim, self._base_momentum) _max_momentum = format_param(k, optim, self._max_momentum) for group, b_momentum, m_momentum in zip(optim.param_groups, _base_momentum, _max_momentum): if self.use_beta1: _, beta2 = group['betas'] group['betas'] = (m_momentum, beta2) else: group['momentum'] = m_momentum group['base_momentum'] = b_momentum group['max_momentum'] = m_momentum if self.three_phase: self.momentum_phases.append({ 'end_iter': float(self.pct_start * runner.max_iters) - 1, 'start_momentum': 'max_momentum', 'end_momentum': 'base_momentum' }) self.momentum_phases.append({ 'end_iter': float(2 * self.pct_start * runner.max_iters) - 2, 'start_momentum': 'base_momentum', 'end_momentum': 'max_momentum' }) self.momentum_phases.append({ 'end_iter': runner.max_iters - 1, 'start_momentum': 'max_momentum', 'end_momentum': 'max_momentum' }) else: self.momentum_phases.append({ 'end_iter': float(self.pct_start * runner.max_iters) - 1, 'start_momentum': 'max_momentum', 'end_momentum': 'base_momentum' }) self.momentum_phases.append({ 'end_iter': runner.max_iters - 1, 'start_momentum': 'base_momentum', 'end_momentum': 'max_momentum' }) def _set_momentum(self, runner, momentum_groups): if isinstance(runner.optimizer, dict): for k, optim in runner.optimizer.items(): for param_group, mom in zip(optim.param_groups, momentum_groups[k]): if 'momentum' in param_group.keys(): param_group['momentum'] = mom elif 'betas' in param_group.keys(): param_group['betas'] = (mom, param_group['betas'][1]) else: for param_group, mom in zip(runner.optimizer.param_groups, momentum_groups): if 'momentum' in param_group.keys(): param_group['momentum'] = mom elif 'betas' in param_group.keys(): param_group['betas'] = (mom, param_group['betas'][1]) def get_momentum(self, runner, param_group): curr_iter = runner.iter start_iter = 0 for i, phase in enumerate(self.momentum_phases): end_iter = phase['end_iter'] if curr_iter <= end_iter or i == len(self.momentum_phases) - 1: pct = (curr_iter - start_iter) / (end_iter - start_iter) momentum = self.anneal_func( param_group[phase['start_momentum']], param_group[phase['end_momentum']], pct) break start_iter = end_iter return momentum def get_regular_momentum(self, runner): if isinstance(runner.optimizer, dict): momentum_groups = {} for k, optim in runner.optimizer.items(): _momentum_group = [ self.get_momentum(runner, param_group) for param_group in optim.param_groups ] momentum_groups.update({k: _momentum_group}) return momentum_groups else: momentum_groups = [] for param_group in runner.optimizer.param_groups: momentum_groups.append(self.get_momentum(runner, param_group)) return momentum_groups ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/optimizer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy from collections import defaultdict from itertools import chain from torch.nn.utils import clip_grad from annotator.mmpkg.mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version from ..dist_utils import allreduce_grads from ..fp16_utils import LossScaler, wrap_fp16_model from .hook import HOOKS, Hook try: # If PyTorch version >= 1.6.0, torch.cuda.amp.GradScaler would be imported # and used; otherwise, auto fp16 will adopt mmcv's implementation. from torch.cuda.amp import GradScaler except ImportError: pass @HOOKS.register_module() class OptimizerHook(Hook): def __init__(self, grad_clip=None): self.grad_clip = grad_clip def clip_grads(self, params): params = list( filter(lambda p: p.requires_grad and p.grad is not None, params)) if len(params) > 0: return clip_grad.clip_grad_norm_(params, **self.grad_clip) def after_train_iter(self, runner): runner.optimizer.zero_grad() runner.outputs['loss'].backward() if self.grad_clip is not None: grad_norm = self.clip_grads(runner.model.parameters()) if grad_norm is not None: # Add grad norm to the logger runner.log_buffer.update({'grad_norm': float(grad_norm)}, runner.outputs['num_samples']) runner.optimizer.step() @HOOKS.register_module() class GradientCumulativeOptimizerHook(OptimizerHook): """Optimizer Hook implements multi-iters gradient cumulating. Args: cumulative_iters (int, optional): Num of gradient cumulative iters. The optimizer will step every `cumulative_iters` iters. Defaults to 1. Examples: >>> # Use cumulative_iters to simulate a large batch size >>> # It is helpful when the hardware cannot handle a large batch size. >>> loader = DataLoader(data, batch_size=64) >>> optim_hook = GradientCumulativeOptimizerHook(cumulative_iters=4) >>> # almost equals to >>> loader = DataLoader(data, batch_size=256) >>> optim_hook = OptimizerHook() """ def __init__(self, cumulative_iters=1, **kwargs): super(GradientCumulativeOptimizerHook, self).__init__(**kwargs) assert isinstance(cumulative_iters, int) and cumulative_iters > 0, \ f'cumulative_iters only accepts positive int, but got ' \ f'{type(cumulative_iters)} instead.' self.cumulative_iters = cumulative_iters self.divisible_iters = 0 self.remainder_iters = 0 self.initialized = False def has_batch_norm(self, module): if isinstance(module, _BatchNorm): return True for m in module.children(): if self.has_batch_norm(m): return True return False def _init(self, runner): if runner.iter % self.cumulative_iters != 0: runner.logger.warning( 'Resume iter number is not divisible by cumulative_iters in ' 'GradientCumulativeOptimizerHook, which means the gradient of ' 'some iters is lost and the result may be influenced slightly.' ) if self.has_batch_norm(runner.model) and self.cumulative_iters > 1: runner.logger.warning( 'GradientCumulativeOptimizerHook may slightly decrease ' 'performance if the model has BatchNorm layers.') residual_iters = runner.max_iters - runner.iter self.divisible_iters = ( residual_iters // self.cumulative_iters * self.cumulative_iters) self.remainder_iters = residual_iters - self.divisible_iters self.initialized = True def after_train_iter(self, runner): if not self.initialized: self._init(runner) if runner.iter < self.divisible_iters: loss_factor = self.cumulative_iters else: loss_factor = self.remainder_iters loss = runner.outputs['loss'] loss = loss / loss_factor loss.backward() if (self.every_n_iters(runner, self.cumulative_iters) or self.is_last_iter(runner)): if self.grad_clip is not None: grad_norm = self.clip_grads(runner.model.parameters()) if grad_norm is not None: # Add grad norm to the logger runner.log_buffer.update({'grad_norm': float(grad_norm)}, runner.outputs['num_samples']) runner.optimizer.step() runner.optimizer.zero_grad() if (TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): @HOOKS.register_module() class Fp16OptimizerHook(OptimizerHook): """FP16 optimizer hook (using PyTorch's implementation). If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, to take care of the optimization procedure. Args: loss_scale (float | str | dict): Scale factor configuration. If loss_scale is a float, static loss scaling will be used with the specified scale. If loss_scale is a string, it must be 'dynamic', then dynamic loss scaling will be used. It can also be a dict containing arguments of GradScalar. Defaults to 512. For Pytorch >= 1.6, mmcv uses official implementation of GradScaler. If you use a dict version of loss_scale to create GradScaler, please refer to: https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler for the parameters. Examples: >>> loss_scale = dict( ... init_scale=65536.0, ... growth_factor=2.0, ... backoff_factor=0.5, ... growth_interval=2000 ... ) >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale) """ def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1, loss_scale=512., distributed=True): self.grad_clip = grad_clip self.coalesce = coalesce self.bucket_size_mb = bucket_size_mb self.distributed = distributed self._scale_update_param = None if loss_scale == 'dynamic': self.loss_scaler = GradScaler() elif isinstance(loss_scale, float): self._scale_update_param = loss_scale self.loss_scaler = GradScaler(init_scale=loss_scale) elif isinstance(loss_scale, dict): self.loss_scaler = GradScaler(**loss_scale) else: raise ValueError('loss_scale must be of type float, dict, or ' f'"dynamic", got {loss_scale}') def before_run(self, runner): """Preparing steps before Mixed Precision Training.""" # wrap model mode to fp16 wrap_fp16_model(runner.model) # resume from state dict if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: scaler_state_dict = runner.meta['fp16']['loss_scaler'] self.loss_scaler.load_state_dict(scaler_state_dict) def copy_grads_to_fp32(self, fp16_net, fp32_weights): """Copy gradients from fp16 model to fp32 weight copy.""" for fp32_param, fp16_param in zip(fp32_weights, fp16_net.parameters()): if fp16_param.grad is not None: if fp32_param.grad is None: fp32_param.grad = fp32_param.data.new( fp32_param.size()) fp32_param.grad.copy_(fp16_param.grad) def copy_params_to_fp16(self, fp16_net, fp32_weights): """Copy updated params from fp32 weight copy to fp16 model.""" for fp16_param, fp32_param in zip(fp16_net.parameters(), fp32_weights): fp16_param.data.copy_(fp32_param.data) def after_train_iter(self, runner): """Backward optimization steps for Mixed Precision Training. For dynamic loss scaling, please refer to https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler. 1. Scale the loss by a scale factor. 2. Backward the loss to obtain the gradients. 3. Unscale the optimizer’s gradient tensors. 4. Call optimizer.step() and update scale factor. 5. Save loss_scaler state_dict for resume purpose. """ # clear grads of last iteration runner.model.zero_grad() runner.optimizer.zero_grad() self.loss_scaler.scale(runner.outputs['loss']).backward() self.loss_scaler.unscale_(runner.optimizer) # grad clip if self.grad_clip is not None: grad_norm = self.clip_grads(runner.model.parameters()) if grad_norm is not None: # Add grad norm to the logger runner.log_buffer.update({'grad_norm': float(grad_norm)}, runner.outputs['num_samples']) # backward and update scaler self.loss_scaler.step(runner.optimizer) self.loss_scaler.update(self._scale_update_param) # save state_dict of loss_scaler runner.meta.setdefault( 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() @HOOKS.register_module() class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, Fp16OptimizerHook): """Fp16 optimizer Hook (using PyTorch's implementation) implements multi-iters gradient cumulating. If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, to take care of the optimization procedure. """ def __init__(self, *args, **kwargs): super(GradientCumulativeFp16OptimizerHook, self).__init__(*args, **kwargs) def after_train_iter(self, runner): if not self.initialized: self._init(runner) if runner.iter < self.divisible_iters: loss_factor = self.cumulative_iters else: loss_factor = self.remainder_iters loss = runner.outputs['loss'] loss = loss / loss_factor self.loss_scaler.scale(loss).backward() if (self.every_n_iters(runner, self.cumulative_iters) or self.is_last_iter(runner)): # copy fp16 grads in the model to fp32 params in the optimizer self.loss_scaler.unscale_(runner.optimizer) if self.grad_clip is not None: grad_norm = self.clip_grads(runner.model.parameters()) if grad_norm is not None: # Add grad norm to the logger runner.log_buffer.update( {'grad_norm': float(grad_norm)}, runner.outputs['num_samples']) # backward and update scaler self.loss_scaler.step(runner.optimizer) self.loss_scaler.update(self._scale_update_param) # save state_dict of loss_scaler runner.meta.setdefault( 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() # clear grads runner.model.zero_grad() runner.optimizer.zero_grad() else: @HOOKS.register_module() class Fp16OptimizerHook(OptimizerHook): """FP16 optimizer hook (mmcv's implementation). The steps of fp16 optimizer is as follows. 1. Scale the loss value. 2. BP in the fp16 model. 2. Copy gradients from fp16 model to fp32 weights. 3. Update fp32 weights. 4. Copy updated parameters from fp32 weights to fp16 model. Refer to https://arxiv.org/abs/1710.03740 for more details. Args: loss_scale (float | str | dict): Scale factor configuration. If loss_scale is a float, static loss scaling will be used with the specified scale. If loss_scale is a string, it must be 'dynamic', then dynamic loss scaling will be used. It can also be a dict containing arguments of LossScaler. Defaults to 512. """ def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1, loss_scale=512., distributed=True): self.grad_clip = grad_clip self.coalesce = coalesce self.bucket_size_mb = bucket_size_mb self.distributed = distributed if loss_scale == 'dynamic': self.loss_scaler = LossScaler(mode='dynamic') elif isinstance(loss_scale, float): self.loss_scaler = LossScaler( init_scale=loss_scale, mode='static') elif isinstance(loss_scale, dict): self.loss_scaler = LossScaler(**loss_scale) else: raise ValueError('loss_scale must be of type float, dict, or ' f'"dynamic", got {loss_scale}') def before_run(self, runner): """Preparing steps before Mixed Precision Training. 1. Make a master copy of fp32 weights for optimization. 2. Convert the main model from fp32 to fp16. """ # keep a copy of fp32 weights old_groups = runner.optimizer.param_groups runner.optimizer.param_groups = copy.deepcopy( runner.optimizer.param_groups) state = defaultdict(dict) p_map = { old_p: p for old_p, p in zip( chain(*(g['params'] for g in old_groups)), chain(*(g['params'] for g in runner.optimizer.param_groups))) } for k, v in runner.optimizer.state.items(): state[p_map[k]] = v runner.optimizer.state = state # convert model to fp16 wrap_fp16_model(runner.model) # resume from state dict if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: scaler_state_dict = runner.meta['fp16']['loss_scaler'] self.loss_scaler.load_state_dict(scaler_state_dict) def copy_grads_to_fp32(self, fp16_net, fp32_weights): """Copy gradients from fp16 model to fp32 weight copy.""" for fp32_param, fp16_param in zip(fp32_weights, fp16_net.parameters()): if fp16_param.grad is not None: if fp32_param.grad is None: fp32_param.grad = fp32_param.data.new( fp32_param.size()) fp32_param.grad.copy_(fp16_param.grad) def copy_params_to_fp16(self, fp16_net, fp32_weights): """Copy updated params from fp32 weight copy to fp16 model.""" for fp16_param, fp32_param in zip(fp16_net.parameters(), fp32_weights): fp16_param.data.copy_(fp32_param.data) def after_train_iter(self, runner): """Backward optimization steps for Mixed Precision Training. For dynamic loss scaling, please refer `loss_scalar.py` 1. Scale the loss by a scale factor. 2. Backward the loss to obtain the gradients (fp16). 3. Copy gradients from the model to the fp32 weight copy. 4. Scale the gradients back and update the fp32 weight copy. 5. Copy back the params from fp32 weight copy to the fp16 model. 6. Save loss_scaler state_dict for resume purpose. """ # clear grads of last iteration runner.model.zero_grad() runner.optimizer.zero_grad() # scale the loss value scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale scaled_loss.backward() # copy fp16 grads in the model to fp32 params in the optimizer fp32_weights = [] for param_group in runner.optimizer.param_groups: fp32_weights += param_group['params'] self.copy_grads_to_fp32(runner.model, fp32_weights) # allreduce grads if self.distributed: allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb) has_overflow = self.loss_scaler.has_overflow(fp32_weights) # if has overflow, skip this iteration if not has_overflow: # scale the gradients back for param in fp32_weights: if param.grad is not None: param.grad.div_(self.loss_scaler.loss_scale) if self.grad_clip is not None: grad_norm = self.clip_grads(fp32_weights) if grad_norm is not None: # Add grad norm to the logger runner.log_buffer.update( {'grad_norm': float(grad_norm)}, runner.outputs['num_samples']) # update fp32 params runner.optimizer.step() # copy fp32 params to the fp16 model self.copy_params_to_fp16(runner.model, fp32_weights) self.loss_scaler.update_scale(has_overflow) if has_overflow: runner.logger.warning('Check overflow, downscale loss scale ' f'to {self.loss_scaler.cur_scale}') # save state_dict of loss_scaler runner.meta.setdefault( 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() @HOOKS.register_module() class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, Fp16OptimizerHook): """Fp16 optimizer Hook (using mmcv implementation) implements multi- iters gradient cumulating.""" def __init__(self, *args, **kwargs): super(GradientCumulativeFp16OptimizerHook, self).__init__(*args, **kwargs) def after_train_iter(self, runner): if not self.initialized: self._init(runner) if runner.iter < self.divisible_iters: loss_factor = self.cumulative_iters else: loss_factor = self.remainder_iters loss = runner.outputs['loss'] loss = loss / loss_factor # scale the loss value scaled_loss = loss * self.loss_scaler.loss_scale scaled_loss.backward() if (self.every_n_iters(runner, self.cumulative_iters) or self.is_last_iter(runner)): # copy fp16 grads in the model to fp32 params in the optimizer fp32_weights = [] for param_group in runner.optimizer.param_groups: fp32_weights += param_group['params'] self.copy_grads_to_fp32(runner.model, fp32_weights) # allreduce grads if self.distributed: allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb) has_overflow = self.loss_scaler.has_overflow(fp32_weights) # if has overflow, skip this iteration if not has_overflow: # scale the gradients back for param in fp32_weights: if param.grad is not None: param.grad.div_(self.loss_scaler.loss_scale) if self.grad_clip is not None: grad_norm = self.clip_grads(fp32_weights) if grad_norm is not None: # Add grad norm to the logger runner.log_buffer.update( {'grad_norm': float(grad_norm)}, runner.outputs['num_samples']) # update fp32 params runner.optimizer.step() # copy fp32 params to the fp16 model self.copy_params_to_fp16(runner.model, fp32_weights) else: runner.logger.warning( 'Check overflow, downscale loss scale ' f'to {self.loss_scaler.cur_scale}') self.loss_scaler.update_scale(has_overflow) # save state_dict of loss_scaler runner.meta.setdefault( 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() # clear grads runner.model.zero_grad() runner.optimizer.zero_grad() ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/profiler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from typing import Callable, List, Optional, Union import torch from ..dist_utils import master_only from .hook import HOOKS, Hook @HOOKS.register_module() class ProfilerHook(Hook): """Profiler to analyze performance during training. PyTorch Profiler is a tool that allows the collection of the performance metrics during the training. More details on Profiler can be found at https://pytorch.org/docs/1.8.1/profiler.html#torch.profiler.profile Args: by_epoch (bool): Profile performance by epoch or by iteration. Default: True. profile_iters (int): Number of iterations for profiling. If ``by_epoch=True``, profile_iters indicates that they are the first profile_iters epochs at the beginning of the training, otherwise it indicates the first profile_iters iterations. Default: 1. activities (list[str]): List of activity groups (CPU, CUDA) to use in profiling. Default: ['cpu', 'cuda']. schedule (dict, optional): Config of generating the callable schedule. if schedule is None, profiler will not add step markers into the trace and table view. Default: None. on_trace_ready (callable, dict): Either a handler or a dict of generate handler. Default: None. record_shapes (bool): Save information about operator's input shapes. Default: False. profile_memory (bool): Track tensor memory allocation/deallocation. Default: False. with_stack (bool): Record source information (file and line number) for the ops. Default: False. with_flops (bool): Use formula to estimate the FLOPS of specific operators (matrix multiplication and 2D convolution). Default: False. json_trace_path (str, optional): Exports the collected trace in Chrome JSON format. Default: None. Example: >>> runner = ... # instantiate a Runner >>> # tensorboard trace >>> trace_config = dict(type='tb_trace', dir_name='work_dir') >>> profiler_config = dict(on_trace_ready=trace_config) >>> runner.register_profiler_hook(profiler_config) >>> runner.run(data_loaders=[trainloader], workflow=[('train', 1)]) """ def __init__(self, by_epoch: bool = True, profile_iters: int = 1, activities: List[str] = ['cpu', 'cuda'], schedule: Optional[dict] = None, on_trace_ready: Optional[Union[Callable, dict]] = None, record_shapes: bool = False, profile_memory: bool = False, with_stack: bool = False, with_flops: bool = False, json_trace_path: Optional[str] = None) -> None: try: from torch import profiler # torch version >= 1.8.1 except ImportError: raise ImportError('profiler is the new feature of torch1.8.1, ' f'but your version is {torch.__version__}') assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.' self.by_epoch = by_epoch if profile_iters < 1: raise ValueError('profile_iters should be greater than 0, but got ' f'{profile_iters}') self.profile_iters = profile_iters if not isinstance(activities, list): raise ValueError( f'activities should be list, but got {type(activities)}') self.activities = [] for activity in activities: activity = activity.lower() if activity == 'cpu': self.activities.append(profiler.ProfilerActivity.CPU) elif activity == 'cuda': self.activities.append(profiler.ProfilerActivity.CUDA) else: raise ValueError( f'activity should be "cpu" or "cuda", but got {activity}') if schedule is not None: self.schedule = profiler.schedule(**schedule) else: self.schedule = None self.on_trace_ready = on_trace_ready self.record_shapes = record_shapes self.profile_memory = profile_memory self.with_stack = with_stack self.with_flops = with_flops self.json_trace_path = json_trace_path @master_only def before_run(self, runner): if self.by_epoch and runner.max_epochs < self.profile_iters: raise ValueError('self.profile_iters should not be greater than ' f'{runner.max_epochs}') if not self.by_epoch and runner.max_iters < self.profile_iters: raise ValueError('self.profile_iters should not be greater than ' f'{runner.max_iters}') if callable(self.on_trace_ready): # handler _on_trace_ready = self.on_trace_ready elif isinstance(self.on_trace_ready, dict): # config of handler trace_cfg = self.on_trace_ready.copy() trace_type = trace_cfg.pop('type') # log_trace handler if trace_type == 'log_trace': def _log_handler(prof): print(prof.key_averages().table(**trace_cfg)) _on_trace_ready = _log_handler elif trace_type == 'tb_trace': # tensorboard_trace handler try: import torch_tb_profiler # noqa: F401 except ImportError: raise ImportError('please run "pip install ' 'torch-tb-profiler" to install ' 'torch_tb_profiler') _on_trace_ready = torch.profiler.tensorboard_trace_handler( **trace_cfg) else: raise ValueError('trace_type should be "log_trace" or ' f'"tb_trace", but got {trace_type}') elif self.on_trace_ready is None: _on_trace_ready = None # type: ignore else: raise ValueError('on_trace_ready should be handler, dict or None, ' f'but got {type(self.on_trace_ready)}') if runner.max_epochs > 1: warnings.warn(f'profiler will profile {runner.max_epochs} epochs ' 'instead of 1 epoch. Since profiler will slow down ' 'the training, it is recommended to train 1 epoch ' 'with ProfilerHook and adjust your setting according' ' to the profiler summary. During normal training ' '(epoch > 1), you may disable the ProfilerHook.') self.profiler = torch.profiler.profile( activities=self.activities, schedule=self.schedule, on_trace_ready=_on_trace_ready, record_shapes=self.record_shapes, profile_memory=self.profile_memory, with_stack=self.with_stack, with_flops=self.with_flops) self.profiler.__enter__() runner.logger.info('profiler is profiling...') @master_only def after_train_epoch(self, runner): if self.by_epoch and runner.epoch == self.profile_iters - 1: runner.logger.info('profiler may take a few minutes...') self.profiler.__exit__(None, None, None) if self.json_trace_path is not None: self.profiler.export_chrome_trace(self.json_trace_path) @master_only def after_train_iter(self, runner): self.profiler.step() if not self.by_epoch and runner.iter == self.profile_iters - 1: runner.logger.info('profiler may take a few minutes...') self.profiler.__exit__(None, None, None) if self.json_trace_path is not None: self.profiler.export_chrome_trace(self.json_trace_path) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/sampler_seed.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .hook import HOOKS, Hook @HOOKS.register_module() class DistSamplerSeedHook(Hook): """Data-loading sampler for distributed training. When distributed training, it is only useful in conjunction with :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same purpose with :obj:`IterLoader`. """ def before_epoch(self, runner): if hasattr(runner.data_loader.sampler, 'set_epoch'): # in case the data loader uses `SequentialSampler` in Pytorch runner.data_loader.sampler.set_epoch(runner.epoch) elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): # batch sampler in pytorch warps the sampler as its attributes. runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) ================================================ FILE: annotator/mmpkg/mmcv/runner/hooks/sync_buffer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ..dist_utils import allreduce_params from .hook import HOOKS, Hook @HOOKS.register_module() class SyncBuffersHook(Hook): """Synchronize model buffers such as running_mean and running_var in BN at the end of each epoch. Args: distributed (bool): Whether distributed training is used. It is effective only for distributed training. Defaults to True. """ def __init__(self, distributed=True): self.distributed = distributed def after_epoch(self, runner): """All-reduce model buffers at the end of each epoch.""" if self.distributed: allreduce_params(runner.model.buffers()) ================================================ FILE: annotator/mmpkg/mmcv/runner/iter_based_runner.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp import platform import shutil import time import warnings import torch from torch.optim import Optimizer import annotator.mmpkg.mmcv as mmcv from .base_runner import BaseRunner from .builder import RUNNERS from .checkpoint import save_checkpoint from .hooks import IterTimerHook from .utils import get_host_info class IterLoader: def __init__(self, dataloader): self._dataloader = dataloader self.iter_loader = iter(self._dataloader) self._epoch = 0 @property def epoch(self): return self._epoch def __next__(self): try: data = next(self.iter_loader) except StopIteration: self._epoch += 1 if hasattr(self._dataloader.sampler, 'set_epoch'): self._dataloader.sampler.set_epoch(self._epoch) time.sleep(2) # Prevent possible deadlock during epoch transition self.iter_loader = iter(self._dataloader) data = next(self.iter_loader) return data def __len__(self): return len(self._dataloader) @RUNNERS.register_module() class IterBasedRunner(BaseRunner): """Iteration-based Runner. This runner train models iteration by iteration. """ def train(self, data_loader, **kwargs): self.model.train() self.mode = 'train' self.data_loader = data_loader self._epoch = data_loader.epoch data_batch = next(data_loader) self.call_hook('before_train_iter') outputs = self.model.train_step(data_batch, self.optimizer, **kwargs) if not isinstance(outputs, dict): raise TypeError('model.train_step() must return a dict') if 'log_vars' in outputs: self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) self.outputs = outputs self.call_hook('after_train_iter') self._inner_iter += 1 self._iter += 1 @torch.no_grad() def val(self, data_loader, **kwargs): self.model.eval() self.mode = 'val' self.data_loader = data_loader data_batch = next(data_loader) self.call_hook('before_val_iter') outputs = self.model.val_step(data_batch, **kwargs) if not isinstance(outputs, dict): raise TypeError('model.val_step() must return a dict') if 'log_vars' in outputs: self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) self.outputs = outputs self.call_hook('after_val_iter') self._inner_iter += 1 def run(self, data_loaders, workflow, max_iters=None, **kwargs): """Start running. Args: data_loaders (list[:obj:`DataLoader`]): Dataloaders for training and validation. workflow (list[tuple]): A list of (phase, iters) to specify the running order and iterations. E.g, [('train', 10000), ('val', 1000)] means running 10000 iterations for training and 1000 iterations for validation, iteratively. """ assert isinstance(data_loaders, list) assert mmcv.is_list_of(workflow, tuple) assert len(data_loaders) == len(workflow) if max_iters is not None: warnings.warn( 'setting max_iters in run is deprecated, ' 'please set max_iters in runner_config', DeprecationWarning) self._max_iters = max_iters assert self._max_iters is not None, ( 'max_iters must be specified during instantiation') work_dir = self.work_dir if self.work_dir is not None else 'NONE' self.logger.info('Start running, host: %s, work_dir: %s', get_host_info(), work_dir) self.logger.info('Hooks will be executed in the following order:\n%s', self.get_hook_info()) self.logger.info('workflow: %s, max: %d iters', workflow, self._max_iters) self.call_hook('before_run') iter_loaders = [IterLoader(x) for x in data_loaders] self.call_hook('before_epoch') while self.iter < self._max_iters: for i, flow in enumerate(workflow): self._inner_iter = 0 mode, iters = flow if not isinstance(mode, str) or not hasattr(self, mode): raise ValueError( 'runner has no method named "{}" to run a workflow'. format(mode)) iter_runner = getattr(self, mode) for _ in range(iters): if mode == 'train' and self.iter >= self._max_iters: break iter_runner(iter_loaders[i], **kwargs) time.sleep(1) # wait for some hooks like loggers to finish self.call_hook('after_epoch') self.call_hook('after_run') def resume(self, checkpoint, resume_optimizer=True, map_location='default'): """Resume model from checkpoint. Args: checkpoint (str): Checkpoint to resume from. resume_optimizer (bool, optional): Whether resume the optimizer(s) if the checkpoint file includes optimizer(s). Default to True. map_location (str, optional): Same as :func:`torch.load`. Default to 'default'. """ if map_location == 'default': device_id = torch.cuda.current_device() checkpoint = self.load_checkpoint( checkpoint, map_location=lambda storage, loc: storage.cuda(device_id)) else: checkpoint = self.load_checkpoint( checkpoint, map_location=map_location) self._epoch = checkpoint['meta']['epoch'] self._iter = checkpoint['meta']['iter'] self._inner_iter = checkpoint['meta']['iter'] if 'optimizer' in checkpoint and resume_optimizer: if isinstance(self.optimizer, Optimizer): self.optimizer.load_state_dict(checkpoint['optimizer']) elif isinstance(self.optimizer, dict): for k in self.optimizer.keys(): self.optimizer[k].load_state_dict( checkpoint['optimizer'][k]) else: raise TypeError( 'Optimizer should be dict or torch.optim.Optimizer ' f'but got {type(self.optimizer)}') self.logger.info(f'resumed from epoch: {self.epoch}, iter {self.iter}') def save_checkpoint(self, out_dir, filename_tmpl='iter_{}.pth', meta=None, save_optimizer=True, create_symlink=True): """Save checkpoint to file. Args: out_dir (str): Directory to save checkpoint files. filename_tmpl (str, optional): Checkpoint file template. Defaults to 'iter_{}.pth'. meta (dict, optional): Metadata to be saved in checkpoint. Defaults to None. save_optimizer (bool, optional): Whether save optimizer. Defaults to True. create_symlink (bool, optional): Whether create symlink to the latest checkpoint file. Defaults to True. """ if meta is None: meta = {} elif not isinstance(meta, dict): raise TypeError( f'meta should be a dict or None, but got {type(meta)}') if self.meta is not None: meta.update(self.meta) # Note: meta.update(self.meta) should be done before # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise # there will be problems with resumed checkpoints. # More details in https://github.com/open-mmlab/mmcv/pull/1108 meta.update(epoch=self.epoch + 1, iter=self.iter) filename = filename_tmpl.format(self.iter + 1) filepath = osp.join(out_dir, filename) optimizer = self.optimizer if save_optimizer else None save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) # in some environments, `os.symlink` is not supported, you may need to # set `create_symlink` to False if create_symlink: dst_file = osp.join(out_dir, 'latest.pth') if platform.system() != 'Windows': mmcv.symlink(filename, dst_file) else: shutil.copy(filepath, dst_file) def register_training_hooks(self, lr_config, optimizer_config=None, checkpoint_config=None, log_config=None, momentum_config=None, custom_hooks_config=None): """Register default hooks for iter-based training. Checkpoint hook, optimizer stepper hook and logger hooks will be set to `by_epoch=False` by default. Default hooks include: +----------------------+-------------------------+ | Hooks | Priority | +======================+=========================+ | LrUpdaterHook | VERY_HIGH (10) | +----------------------+-------------------------+ | MomentumUpdaterHook | HIGH (30) | +----------------------+-------------------------+ | OptimizerStepperHook | ABOVE_NORMAL (40) | +----------------------+-------------------------+ | CheckpointSaverHook | NORMAL (50) | +----------------------+-------------------------+ | IterTimerHook | LOW (70) | +----------------------+-------------------------+ | LoggerHook(s) | VERY_LOW (90) | +----------------------+-------------------------+ | CustomHook(s) | defaults to NORMAL (50) | +----------------------+-------------------------+ If custom hooks have same priority with default hooks, custom hooks will be triggered after default hooks. """ if checkpoint_config is not None: checkpoint_config.setdefault('by_epoch', False) if lr_config is not None: lr_config.setdefault('by_epoch', False) if log_config is not None: for info in log_config['hooks']: info.setdefault('by_epoch', False) super(IterBasedRunner, self).register_training_hooks( lr_config=lr_config, momentum_config=momentum_config, optimizer_config=optimizer_config, checkpoint_config=checkpoint_config, log_config=log_config, timer_config=IterTimerHook(), custom_hooks_config=custom_hooks_config) ================================================ FILE: annotator/mmpkg/mmcv/runner/log_buffer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from collections import OrderedDict import numpy as np class LogBuffer: def __init__(self): self.val_history = OrderedDict() self.n_history = OrderedDict() self.output = OrderedDict() self.ready = False def clear(self): self.val_history.clear() self.n_history.clear() self.clear_output() def clear_output(self): self.output.clear() self.ready = False def update(self, vars, count=1): assert isinstance(vars, dict) for key, var in vars.items(): if key not in self.val_history: self.val_history[key] = [] self.n_history[key] = [] self.val_history[key].append(var) self.n_history[key].append(count) def average(self, n=0): """Average latest n values or all values.""" assert n >= 0 for key in self.val_history: values = np.array(self.val_history[key][-n:]) nums = np.array(self.n_history[key][-n:]) avg = np.sum(values * nums) / np.sum(nums) self.output[key] = avg self.ready = True ================================================ FILE: annotator/mmpkg/mmcv/runner/optimizer/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, build_optimizer_constructor) from .default_constructor import DefaultOptimizerConstructor __all__ = [ 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', 'build_optimizer', 'build_optimizer_constructor' ] ================================================ FILE: annotator/mmpkg/mmcv/runner/optimizer/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import inspect import torch from ...utils import Registry, build_from_cfg OPTIMIZERS = Registry('optimizer') OPTIMIZER_BUILDERS = Registry('optimizer builder') def register_torch_optimizers(): torch_optimizers = [] for module_name in dir(torch.optim): if module_name.startswith('__'): continue _optim = getattr(torch.optim, module_name) if inspect.isclass(_optim) and issubclass(_optim, torch.optim.Optimizer): OPTIMIZERS.register_module()(_optim) torch_optimizers.append(module_name) return torch_optimizers TORCH_OPTIMIZERS = register_torch_optimizers() def build_optimizer_constructor(cfg): return build_from_cfg(cfg, OPTIMIZER_BUILDERS) def build_optimizer(model, cfg): optimizer_cfg = copy.deepcopy(cfg) constructor_type = optimizer_cfg.pop('constructor', 'DefaultOptimizerConstructor') paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None) optim_constructor = build_optimizer_constructor( dict( type=constructor_type, optimizer_cfg=optimizer_cfg, paramwise_cfg=paramwise_cfg)) optimizer = optim_constructor(model) return optimizer ================================================ FILE: annotator/mmpkg/mmcv/runner/optimizer/default_constructor.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import torch from torch.nn import GroupNorm, LayerNorm from annotator.mmpkg.mmcv.utils import _BatchNorm, _InstanceNorm, build_from_cfg, is_list_of from annotator.mmpkg.mmcv.utils.ext_loader import check_ops_exist from .builder import OPTIMIZER_BUILDERS, OPTIMIZERS @OPTIMIZER_BUILDERS.register_module() class DefaultOptimizerConstructor: """Default constructor for optimizers. By default each parameter share the same optimizer settings, and we provide an argument ``paramwise_cfg`` to specify parameter-wise settings. It is a dict and may contain the following fields: - ``custom_keys`` (dict): Specified parameters-wise settings by keys. If one of the keys in ``custom_keys`` is a substring of the name of one parameter, then the setting of the parameter will be specified by ``custom_keys[key]`` and other setting like ``bias_lr_mult`` etc. will be ignored. It should be noted that the aforementioned ``key`` is the longest key that is a substring of the name of the parameter. If there are multiple matched keys with the same length, then the key with lower alphabet order will be chosen. ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` and ``decay_mult``. See Example 2 below. - ``bias_lr_mult`` (float): It will be multiplied to the learning rate for all bias parameters (except for those in normalization layers and offset layers of DCN). - ``bias_decay_mult`` (float): It will be multiplied to the weight decay for all bias parameters (except for those in normalization layers, depthwise conv layers, offset layers of DCN). - ``norm_decay_mult`` (float): It will be multiplied to the weight decay for all weight and bias parameters of normalization layers. - ``dwconv_decay_mult`` (float): It will be multiplied to the weight decay for all weight and bias parameters of depthwise conv layers. - ``dcn_offset_lr_mult`` (float): It will be multiplied to the learning rate for parameters of offset layer in the deformable convs of a model. - ``bypass_duplicate`` (bool): If true, the duplicate parameters would not be added into optimizer. Default: False. Note: 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will override the effect of ``bias_lr_mult`` in the bias of offset layer. So be careful when using both ``bias_lr_mult`` and ``dcn_offset_lr_mult``. If you wish to apply both of them to the offset layer in deformable convs, set ``dcn_offset_lr_mult`` to the original ``dcn_offset_lr_mult`` * ``bias_lr_mult``. 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will apply it to all the DCN layers in the model. So be careful when the model contains multiple DCN layers in places other than backbone. Args: model (:obj:`nn.Module`): The model with parameters to be optimized. optimizer_cfg (dict): The config dict of the optimizer. Positional fields are - `type`: class name of the optimizer. Optional fields are - any arguments of the corresponding optimizer type, e.g., lr, weight_decay, momentum, etc. paramwise_cfg (dict, optional): Parameter-wise options. Example 1: >>> model = torch.nn.modules.Conv1d(1, 1, 1) >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9, >>> weight_decay=0.0001) >>> paramwise_cfg = dict(norm_decay_mult=0.) >>> optim_builder = DefaultOptimizerConstructor( >>> optimizer_cfg, paramwise_cfg) >>> optimizer = optim_builder(model) Example 2: >>> # assume model have attribute model.backbone and model.cls_head >>> optimizer_cfg = dict(type='SGD', lr=0.01, weight_decay=0.95) >>> paramwise_cfg = dict(custom_keys={ '.backbone': dict(lr_mult=0.1, decay_mult=0.9)}) >>> optim_builder = DefaultOptimizerConstructor( >>> optimizer_cfg, paramwise_cfg) >>> optimizer = optim_builder(model) >>> # Then the `lr` and `weight_decay` for model.backbone is >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for >>> # model.cls_head is (0.01, 0.95). """ def __init__(self, optimizer_cfg, paramwise_cfg=None): if not isinstance(optimizer_cfg, dict): raise TypeError('optimizer_cfg should be a dict', f'but got {type(optimizer_cfg)}') self.optimizer_cfg = optimizer_cfg self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg self.base_lr = optimizer_cfg.get('lr', None) self.base_wd = optimizer_cfg.get('weight_decay', None) self._validate_cfg() def _validate_cfg(self): if not isinstance(self.paramwise_cfg, dict): raise TypeError('paramwise_cfg should be None or a dict, ' f'but got {type(self.paramwise_cfg)}') if 'custom_keys' in self.paramwise_cfg: if not isinstance(self.paramwise_cfg['custom_keys'], dict): raise TypeError( 'If specified, custom_keys must be a dict, ' f'but got {type(self.paramwise_cfg["custom_keys"])}') if self.base_wd is None: for key in self.paramwise_cfg['custom_keys']: if 'decay_mult' in self.paramwise_cfg['custom_keys'][key]: raise ValueError('base_wd should not be None') # get base lr and weight decay # weight_decay must be explicitly specified if mult is specified if ('bias_decay_mult' in self.paramwise_cfg or 'norm_decay_mult' in self.paramwise_cfg or 'dwconv_decay_mult' in self.paramwise_cfg): if self.base_wd is None: raise ValueError('base_wd should not be None') def _is_in(self, param_group, param_group_list): assert is_list_of(param_group_list, dict) param = set(param_group['params']) param_set = set() for group in param_group_list: param_set.update(set(group['params'])) return not param.isdisjoint(param_set) def add_params(self, params, module, prefix='', is_dcn_module=None): """Add all parameters of module to the params list. The parameters of the given module will be added to the list of param groups, with specific rules defined by paramwise_cfg. Args: params (list[dict]): A list of param groups, it will be modified in place. module (nn.Module): The module to be added. prefix (str): The prefix of the module is_dcn_module (int|float|None): If the current module is a submodule of DCN, `is_dcn_module` will be passed to control conv_offset layer's learning rate. Defaults to None. """ # get param-wise options custom_keys = self.paramwise_cfg.get('custom_keys', {}) # first sort with alphabet order and then sort with reversed len of str sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.) bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.) norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.) dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', 1.) bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', 1.) # special rules for norm layers and depth-wise conv layers is_norm = isinstance(module, (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) is_dwconv = ( isinstance(module, torch.nn.Conv2d) and module.in_channels == module.groups) for name, param in module.named_parameters(recurse=False): param_group = {'params': [param]} if not param.requires_grad: params.append(param_group) continue if bypass_duplicate and self._is_in(param_group, params): warnings.warn(f'{prefix} is duplicate. It is skipped since ' f'bypass_duplicate={bypass_duplicate}') continue # if the parameter match one of the custom keys, ignore other rules is_custom = False for key in sorted_keys: if key in f'{prefix}.{name}': is_custom = True lr_mult = custom_keys[key].get('lr_mult', 1.) param_group['lr'] = self.base_lr * lr_mult if self.base_wd is not None: decay_mult = custom_keys[key].get('decay_mult', 1.) param_group['weight_decay'] = self.base_wd * decay_mult break if not is_custom: # bias_lr_mult affects all bias parameters # except for norm.bias dcn.conv_offset.bias if name == 'bias' and not (is_norm or is_dcn_module): param_group['lr'] = self.base_lr * bias_lr_mult if (prefix.find('conv_offset') != -1 and is_dcn_module and isinstance(module, torch.nn.Conv2d)): # deal with both dcn_offset's bias & weight param_group['lr'] = self.base_lr * dcn_offset_lr_mult # apply weight decay policies if self.base_wd is not None: # norm decay if is_norm: param_group[ 'weight_decay'] = self.base_wd * norm_decay_mult # depth-wise conv elif is_dwconv: param_group[ 'weight_decay'] = self.base_wd * dwconv_decay_mult # bias lr and decay elif name == 'bias' and not is_dcn_module: # TODO: current bias_decay_mult will have affect on DCN param_group[ 'weight_decay'] = self.base_wd * bias_decay_mult params.append(param_group) if check_ops_exist(): from annotator.mmpkg.mmcv.ops import DeformConv2d, ModulatedDeformConv2d is_dcn_module = isinstance(module, (DeformConv2d, ModulatedDeformConv2d)) else: is_dcn_module = False for child_name, child_mod in module.named_children(): child_prefix = f'{prefix}.{child_name}' if prefix else child_name self.add_params( params, child_mod, prefix=child_prefix, is_dcn_module=is_dcn_module) def __call__(self, model): if hasattr(model, 'module'): model = model.module optimizer_cfg = self.optimizer_cfg.copy() # if no paramwise option is specified, just use the global setting if not self.paramwise_cfg: optimizer_cfg['params'] = model.parameters() return build_from_cfg(optimizer_cfg, OPTIMIZERS) # set param-wise lr and weight decay recursively params = [] self.add_params(params, model) optimizer_cfg['params'] = params return build_from_cfg(optimizer_cfg, OPTIMIZERS) ================================================ FILE: annotator/mmpkg/mmcv/runner/priority.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from enum import Enum class Priority(Enum): """Hook priority levels. +--------------+------------+ | Level | Value | +==============+============+ | HIGHEST | 0 | +--------------+------------+ | VERY_HIGH | 10 | +--------------+------------+ | HIGH | 30 | +--------------+------------+ | ABOVE_NORMAL | 40 | +--------------+------------+ | NORMAL | 50 | +--------------+------------+ | BELOW_NORMAL | 60 | +--------------+------------+ | LOW | 70 | +--------------+------------+ | VERY_LOW | 90 | +--------------+------------+ | LOWEST | 100 | +--------------+------------+ """ HIGHEST = 0 VERY_HIGH = 10 HIGH = 30 ABOVE_NORMAL = 40 NORMAL = 50 BELOW_NORMAL = 60 LOW = 70 VERY_LOW = 90 LOWEST = 100 def get_priority(priority): """Get priority value. Args: priority (int or str or :obj:`Priority`): Priority. Returns: int: The priority value. """ if isinstance(priority, int): if priority < 0 or priority > 100: raise ValueError('priority must be between 0 and 100') return priority elif isinstance(priority, Priority): return priority.value elif isinstance(priority, str): return Priority[priority.upper()].value else: raise TypeError('priority must be an integer or Priority enum value') ================================================ FILE: annotator/mmpkg/mmcv/runner/utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os import random import sys import time import warnings from getpass import getuser from socket import gethostname import numpy as np import torch import annotator.mmpkg.mmcv as mmcv def get_host_info(): """Get hostname and username. Return empty string if exception raised, e.g. ``getpass.getuser()`` will lead to error in docker container """ host = '' try: host = f'{getuser()}@{gethostname()}' except Exception as e: warnings.warn(f'Host or user not found: {str(e)}') finally: return host def get_time_str(): return time.strftime('%Y%m%d_%H%M%S', time.localtime()) def obj_from_dict(info, parent=None, default_args=None): """Initialize an object from dict. The dict must contain the key "type", which indicates the object type, it can be either a string or type, such as "list" or ``list``. Remaining fields are treated as the arguments for constructing the object. Args: info (dict): Object types and arguments. parent (:class:`module`): Module which may containing expected object classes. default_args (dict, optional): Default arguments for initializing the object. Returns: any type: Object built from the dict. """ assert isinstance(info, dict) and 'type' in info assert isinstance(default_args, dict) or default_args is None args = info.copy() obj_type = args.pop('type') if mmcv.is_str(obj_type): if parent is not None: obj_type = getattr(parent, obj_type) else: obj_type = sys.modules[obj_type] elif not isinstance(obj_type, type): raise TypeError('type must be a str or valid type, but ' f'got {type(obj_type)}') if default_args is not None: for name, value in default_args.items(): args.setdefault(name, value) return obj_type(**args) def set_random_seed(seed, deterministic=False, use_rank_shift=False): """Set random seed. Args: seed (int): Seed to be used. deterministic (bool): Whether to set the deterministic option for CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` to True and `torch.backends.cudnn.benchmark` to False. Default: False. rank_shift (bool): Whether to add rank number to the random seed to have different random seed in different threads. Default: False. """ if use_rank_shift: rank, _ = mmcv.runner.get_dist_info() seed += rank random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) os.environ['PYTHONHASHSEED'] = str(seed) if deterministic: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False ================================================ FILE: annotator/mmpkg/mmcv/utils/__init__.py ================================================ # flake8: noqa # Copyright (c) OpenMMLab. All rights reserved. from .config import Config, ConfigDict, DictAction from .misc import (check_prerequisites, concat_list, deprecated_api_warning, has_method, import_modules_from_strings, is_list_of, is_method_overridden, is_seq_of, is_str, is_tuple_of, iter_cast, list_cast, requires_executable, requires_package, slice_list, to_1tuple, to_2tuple, to_3tuple, to_4tuple, to_ntuple, tuple_cast) from .path import (check_file_exist, fopen, is_filepath, mkdir_or_exist, scandir, symlink) from .progressbar import (ProgressBar, track_iter_progress, track_parallel_progress, track_progress) from .testing import (assert_attrs_equal, assert_dict_contains_subset, assert_dict_has_keys, assert_is_norm_layer, assert_keys_equal, assert_params_all_zeros, check_python_script) from .timer import Timer, TimerError, check_time from .version_utils import digit_version, get_git_hash try: import torch except ImportError: __all__ = [ 'Config', 'ConfigDict', 'DictAction', 'is_str', 'iter_cast', 'list_cast', 'tuple_cast', 'is_seq_of', 'is_list_of', 'is_tuple_of', 'slice_list', 'concat_list', 'check_prerequisites', 'requires_package', 'requires_executable', 'is_filepath', 'fopen', 'check_file_exist', 'mkdir_or_exist', 'symlink', 'scandir', 'ProgressBar', 'track_progress', 'track_iter_progress', 'track_parallel_progress', 'Timer', 'TimerError', 'check_time', 'deprecated_api_warning', 'digit_version', 'get_git_hash', 'import_modules_from_strings', 'assert_dict_contains_subset', 'assert_attrs_equal', 'assert_dict_has_keys', 'assert_keys_equal', 'check_python_script', 'to_1tuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', 'to_ntuple', 'is_method_overridden', 'has_method' ] else: from .env import collect_env from .logging import get_logger, print_log from .parrots_jit import jit, skip_no_elena from .parrots_wrapper import ( TORCH_VERSION, BuildExtension, CppExtension, CUDAExtension, DataLoader, PoolDataLoader, SyncBatchNorm, _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _BatchNorm, _ConvNd, _ConvTransposeMixin, _InstanceNorm, _MaxPoolNd, get_build_config, is_rocm_pytorch, _get_cuda_home) from .registry import Registry, build_from_cfg from .trace import is_jit_tracing __all__ = [ 'Config', 'ConfigDict', 'DictAction', 'collect_env', 'get_logger', 'print_log', 'is_str', 'iter_cast', 'list_cast', 'tuple_cast', 'is_seq_of', 'is_list_of', 'is_tuple_of', 'slice_list', 'concat_list', 'check_prerequisites', 'requires_package', 'requires_executable', 'is_filepath', 'fopen', 'check_file_exist', 'mkdir_or_exist', 'symlink', 'scandir', 'ProgressBar', 'track_progress', 'track_iter_progress', 'track_parallel_progress', 'Registry', 'build_from_cfg', 'Timer', 'TimerError', 'check_time', 'SyncBatchNorm', '_AdaptiveAvgPoolNd', '_AdaptiveMaxPoolNd', '_AvgPoolNd', '_BatchNorm', '_ConvNd', '_ConvTransposeMixin', '_InstanceNorm', '_MaxPoolNd', 'get_build_config', 'BuildExtension', 'CppExtension', 'CUDAExtension', 'DataLoader', 'PoolDataLoader', 'TORCH_VERSION', 'deprecated_api_warning', 'digit_version', 'get_git_hash', 'import_modules_from_strings', 'jit', 'skip_no_elena', 'assert_dict_contains_subset', 'assert_attrs_equal', 'assert_dict_has_keys', 'assert_keys_equal', 'assert_is_norm_layer', 'assert_params_all_zeros', 'check_python_script', 'is_method_overridden', 'is_jit_tracing', 'is_rocm_pytorch', '_get_cuda_home', 'has_method' ] ================================================ FILE: annotator/mmpkg/mmcv/utils/config.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import ast import copy import os import os.path as osp import platform import shutil import sys import tempfile import uuid import warnings from argparse import Action, ArgumentParser from collections import abc from importlib import import_module from addict import Dict from yapf.yapflib.yapf_api import FormatCode from .misc import import_modules_from_strings from .path import check_file_exist if platform.system() == 'Windows': import regex as re else: import re BASE_KEY = '_base_' DELETE_KEY = '_delete_' DEPRECATION_KEY = '_deprecation_' RESERVED_KEYS = ['filename', 'text', 'pretty_text'] class ConfigDict(Dict): def __missing__(self, name): raise KeyError(name) def __getattr__(self, name): try: value = super(ConfigDict, self).__getattr__(name) except KeyError: ex = AttributeError(f"'{self.__class__.__name__}' object has no " f"attribute '{name}'") except Exception as e: ex = e else: return value raise ex def add_args(parser, cfg, prefix=''): for k, v in cfg.items(): if isinstance(v, str): parser.add_argument('--' + prefix + k) elif isinstance(v, int): parser.add_argument('--' + prefix + k, type=int) elif isinstance(v, float): parser.add_argument('--' + prefix + k, type=float) elif isinstance(v, bool): parser.add_argument('--' + prefix + k, action='store_true') elif isinstance(v, dict): add_args(parser, v, prefix + k + '.') elif isinstance(v, abc.Iterable): parser.add_argument('--' + prefix + k, type=type(v[0]), nargs='+') else: print(f'cannot parse key {prefix + k} of type {type(v)}') return parser class Config: """A facility for config and config files. It supports common file formats as configs: python/json/yaml. The interface is the same as a dict object and also allows access config values as attributes. Example: >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) >>> cfg.a 1 >>> cfg.b {'b1': [0, 1]} >>> cfg.b.b1 [0, 1] >>> cfg = Config.fromfile('tests/data/config/a.py') >>> cfg.filename "/home/kchen/projects/mmcv/tests/data/config/a.py" >>> cfg.item4 'test' >>> cfg "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" """ @staticmethod def _validate_py_syntax(filename): with open(filename, 'r', encoding='utf-8') as f: # Setting encoding explicitly to resolve coding issue on windows content = f.read() try: ast.parse(content) except SyntaxError as e: raise SyntaxError('There are syntax errors in config ' f'file {filename}: {e}') @staticmethod def _substitute_predefined_vars(filename, temp_config_name): file_dirname = osp.dirname(filename) file_basename = osp.basename(filename) file_basename_no_extension = osp.splitext(file_basename)[0] file_extname = osp.splitext(filename)[1] support_templates = dict( fileDirname=file_dirname, fileBasename=file_basename, fileBasenameNoExtension=file_basename_no_extension, fileExtname=file_extname) with open(filename, 'r', encoding='utf-8') as f: # Setting encoding explicitly to resolve coding issue on windows config_file = f.read() for key, value in support_templates.items(): regexp = r'\{\{\s*' + str(key) + r'\s*\}\}' value = value.replace('\\', '/') config_file = re.sub(regexp, value, config_file) with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: tmp_config_file.write(config_file) @staticmethod def _pre_substitute_base_vars(filename, temp_config_name): """Substitute base variable placehoders to string, so that parsing would work.""" with open(filename, 'r', encoding='utf-8') as f: # Setting encoding explicitly to resolve coding issue on windows config_file = f.read() base_var_dict = {} regexp = r'\{\{\s*' + BASE_KEY + r'\.([\w\.]+)\s*\}\}' base_vars = set(re.findall(regexp, config_file)) for base_var in base_vars: randstr = f'_{base_var}_{uuid.uuid4().hex.lower()[:6]}' base_var_dict[randstr] = base_var regexp = r'\{\{\s*' + BASE_KEY + r'\.' + base_var + r'\s*\}\}' config_file = re.sub(regexp, f'"{randstr}"', config_file) with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: tmp_config_file.write(config_file) return base_var_dict @staticmethod def _substitute_base_vars(cfg, base_var_dict, base_cfg): """Substitute variable strings to their actual values.""" cfg = copy.deepcopy(cfg) if isinstance(cfg, dict): for k, v in cfg.items(): if isinstance(v, str) and v in base_var_dict: new_v = base_cfg for new_k in base_var_dict[v].split('.'): new_v = new_v[new_k] cfg[k] = new_v elif isinstance(v, (list, tuple, dict)): cfg[k] = Config._substitute_base_vars( v, base_var_dict, base_cfg) elif isinstance(cfg, tuple): cfg = tuple( Config._substitute_base_vars(c, base_var_dict, base_cfg) for c in cfg) elif isinstance(cfg, list): cfg = [ Config._substitute_base_vars(c, base_var_dict, base_cfg) for c in cfg ] elif isinstance(cfg, str) and cfg in base_var_dict: new_v = base_cfg for new_k in base_var_dict[cfg].split('.'): new_v = new_v[new_k] cfg = new_v return cfg @staticmethod def _file2dict(filename, use_predefined_variables=True): filename = osp.abspath(osp.expanduser(filename)) check_file_exist(filename) fileExtname = osp.splitext(filename)[1] if fileExtname not in ['.py', '.json', '.yaml', '.yml']: raise IOError('Only py/yml/yaml/json type are supported now!') with tempfile.TemporaryDirectory() as temp_config_dir: temp_config_file = tempfile.NamedTemporaryFile( dir=temp_config_dir, suffix=fileExtname) if platform.system() == 'Windows': temp_config_file.close() temp_config_name = osp.basename(temp_config_file.name) # Substitute predefined variables if use_predefined_variables: Config._substitute_predefined_vars(filename, temp_config_file.name) else: shutil.copyfile(filename, temp_config_file.name) # Substitute base variables from placeholders to strings base_var_dict = Config._pre_substitute_base_vars( temp_config_file.name, temp_config_file.name) if filename.endswith('.py'): temp_module_name = osp.splitext(temp_config_name)[0] sys.path.insert(0, temp_config_dir) Config._validate_py_syntax(filename) mod = import_module(temp_module_name) sys.path.pop(0) cfg_dict = { name: value for name, value in mod.__dict__.items() if not name.startswith('__') } # delete imported module del sys.modules[temp_module_name] elif filename.endswith(('.yml', '.yaml', '.json')): import annotator.mmpkg.mmcv as mmcv cfg_dict = mmcv.load(temp_config_file.name) # close temp file temp_config_file.close() # check deprecation information if DEPRECATION_KEY in cfg_dict: deprecation_info = cfg_dict.pop(DEPRECATION_KEY) warning_msg = f'The config file {filename} will be deprecated ' \ 'in the future.' if 'expected' in deprecation_info: warning_msg += f' Please use {deprecation_info["expected"]} ' \ 'instead.' if 'reference' in deprecation_info: warning_msg += ' More information can be found at ' \ f'{deprecation_info["reference"]}' warnings.warn(warning_msg) cfg_text = filename + '\n' with open(filename, 'r', encoding='utf-8') as f: # Setting encoding explicitly to resolve coding issue on windows cfg_text += f.read() if BASE_KEY in cfg_dict: cfg_dir = osp.dirname(filename) base_filename = cfg_dict.pop(BASE_KEY) base_filename = base_filename if isinstance( base_filename, list) else [base_filename] cfg_dict_list = list() cfg_text_list = list() for f in base_filename: _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f)) cfg_dict_list.append(_cfg_dict) cfg_text_list.append(_cfg_text) base_cfg_dict = dict() for c in cfg_dict_list: duplicate_keys = base_cfg_dict.keys() & c.keys() if len(duplicate_keys) > 0: raise KeyError('Duplicate key is not allowed among bases. ' f'Duplicate keys: {duplicate_keys}') base_cfg_dict.update(c) # Substitute base variables from strings to their actual values cfg_dict = Config._substitute_base_vars(cfg_dict, base_var_dict, base_cfg_dict) base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict) cfg_dict = base_cfg_dict # merge cfg_text cfg_text_list.append(cfg_text) cfg_text = '\n'.join(cfg_text_list) return cfg_dict, cfg_text @staticmethod def _merge_a_into_b(a, b, allow_list_keys=False): """merge dict ``a`` into dict ``b`` (non-inplace). Values in ``a`` will overwrite ``b``. ``b`` is copied first to avoid in-place modifications. Args: a (dict): The source dict to be merged into ``b``. b (dict): The origin dict to be fetch keys from ``a``. allow_list_keys (bool): If True, int string keys (e.g. '0', '1') are allowed in source ``a`` and will replace the element of the corresponding index in b if b is a list. Default: False. Returns: dict: The modified dict of ``b`` using ``a``. Examples: # Normally merge a into b. >>> Config._merge_a_into_b( ... dict(obj=dict(a=2)), dict(obj=dict(a=1))) {'obj': {'a': 2}} # Delete b first and merge a into b. >>> Config._merge_a_into_b( ... dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1))) {'obj': {'a': 2}} # b is a list >>> Config._merge_a_into_b( ... {'0': dict(a=2)}, [dict(a=1), dict(b=2)], True) [{'a': 2}, {'b': 2}] """ b = b.copy() for k, v in a.items(): if allow_list_keys and k.isdigit() and isinstance(b, list): k = int(k) if len(b) <= k: raise KeyError(f'Index {k} exceeds the length of list {b}') b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) elif isinstance(v, dict) and k in b and not v.pop(DELETE_KEY, False): allowed_types = (dict, list) if allow_list_keys else dict if not isinstance(b[k], allowed_types): raise TypeError( f'{k}={v} in child config cannot inherit from base ' f'because {k} is a dict in the child config but is of ' f'type {type(b[k])} in base config. You may set ' f'`{DELETE_KEY}=True` to ignore the base config') b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) else: b[k] = v return b @staticmethod def fromfile(filename, use_predefined_variables=True, import_custom_modules=True): cfg_dict, cfg_text = Config._file2dict(filename, use_predefined_variables) if import_custom_modules and cfg_dict.get('custom_imports', None): import_modules_from_strings(**cfg_dict['custom_imports']) return Config(cfg_dict, cfg_text=cfg_text, filename=filename) @staticmethod def fromstring(cfg_str, file_format): """Generate config from config str. Args: cfg_str (str): Config str. file_format (str): Config file format corresponding to the config str. Only py/yml/yaml/json type are supported now! Returns: obj:`Config`: Config obj. """ if file_format not in ['.py', '.json', '.yaml', '.yml']: raise IOError('Only py/yml/yaml/json type are supported now!') if file_format != '.py' and 'dict(' in cfg_str: # check if users specify a wrong suffix for python warnings.warn( 'Please check "file_format", the file format may be .py') with tempfile.NamedTemporaryFile( 'w', encoding='utf-8', suffix=file_format, delete=False) as temp_file: temp_file.write(cfg_str) # on windows, previous implementation cause error # see PR 1077 for details cfg = Config.fromfile(temp_file.name) os.remove(temp_file.name) return cfg @staticmethod def auto_argparser(description=None): """Generate argparser from config file automatically (experimental)""" partial_parser = ArgumentParser(description=description) partial_parser.add_argument('config', help='config file path') cfg_file = partial_parser.parse_known_args()[0].config cfg = Config.fromfile(cfg_file) parser = ArgumentParser(description=description) parser.add_argument('config', help='config file path') add_args(parser, cfg) return parser, cfg def __init__(self, cfg_dict=None, cfg_text=None, filename=None): if cfg_dict is None: cfg_dict = dict() elif not isinstance(cfg_dict, dict): raise TypeError('cfg_dict must be a dict, but ' f'got {type(cfg_dict)}') for key in cfg_dict: if key in RESERVED_KEYS: raise KeyError(f'{key} is reserved for config file') super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict)) super(Config, self).__setattr__('_filename', filename) if cfg_text: text = cfg_text elif filename: with open(filename, 'r') as f: text = f.read() else: text = '' super(Config, self).__setattr__('_text', text) @property def filename(self): return self._filename @property def text(self): return self._text @property def pretty_text(self): indent = 4 def _indent(s_, num_spaces): s = s_.split('\n') if len(s) == 1: return s_ first = s.pop(0) s = [(num_spaces * ' ') + line for line in s] s = '\n'.join(s) s = first + '\n' + s return s def _format_basic_types(k, v, use_mapping=False): if isinstance(v, str): v_str = f"'{v}'" else: v_str = str(v) if use_mapping: k_str = f"'{k}'" if isinstance(k, str) else str(k) attr_str = f'{k_str}: {v_str}' else: attr_str = f'{str(k)}={v_str}' attr_str = _indent(attr_str, indent) return attr_str def _format_list(k, v, use_mapping=False): # check if all items in the list are dict if all(isinstance(_, dict) for _ in v): v_str = '[\n' v_str += '\n'.join( f'dict({_indent(_format_dict(v_), indent)}),' for v_ in v).rstrip(',') if use_mapping: k_str = f"'{k}'" if isinstance(k, str) else str(k) attr_str = f'{k_str}: {v_str}' else: attr_str = f'{str(k)}={v_str}' attr_str = _indent(attr_str, indent) + ']' else: attr_str = _format_basic_types(k, v, use_mapping) return attr_str def _contain_invalid_identifier(dict_str): contain_invalid_identifier = False for key_name in dict_str: contain_invalid_identifier |= \ (not str(key_name).isidentifier()) return contain_invalid_identifier def _format_dict(input_dict, outest_level=False): r = '' s = [] use_mapping = _contain_invalid_identifier(input_dict) if use_mapping: r += '{' for idx, (k, v) in enumerate(input_dict.items()): is_last = idx >= len(input_dict) - 1 end = '' if outest_level or is_last else ',' if isinstance(v, dict): v_str = '\n' + _format_dict(v) if use_mapping: k_str = f"'{k}'" if isinstance(k, str) else str(k) attr_str = f'{k_str}: dict({v_str}' else: attr_str = f'{str(k)}=dict({v_str}' attr_str = _indent(attr_str, indent) + ')' + end elif isinstance(v, list): attr_str = _format_list(k, v, use_mapping) + end else: attr_str = _format_basic_types(k, v, use_mapping) + end s.append(attr_str) r += '\n'.join(s) if use_mapping: r += '}' return r cfg_dict = self._cfg_dict.to_dict() text = _format_dict(cfg_dict, outest_level=True) # copied from setup.cfg yapf_style = dict( based_on_style='pep8', blank_line_before_nested_class_or_def=True, split_before_expression_after_opening_paren=True) text, _ = FormatCode(text, style_config=yapf_style, verify=True) return text def __repr__(self): return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}' def __len__(self): return len(self._cfg_dict) def __getattr__(self, name): return getattr(self._cfg_dict, name) def __getitem__(self, name): return self._cfg_dict.__getitem__(name) def __setattr__(self, name, value): if isinstance(value, dict): value = ConfigDict(value) self._cfg_dict.__setattr__(name, value) def __setitem__(self, name, value): if isinstance(value, dict): value = ConfigDict(value) self._cfg_dict.__setitem__(name, value) def __iter__(self): return iter(self._cfg_dict) def __getstate__(self): return (self._cfg_dict, self._filename, self._text) def __setstate__(self, state): _cfg_dict, _filename, _text = state super(Config, self).__setattr__('_cfg_dict', _cfg_dict) super(Config, self).__setattr__('_filename', _filename) super(Config, self).__setattr__('_text', _text) def dump(self, file=None): cfg_dict = super(Config, self).__getattribute__('_cfg_dict').to_dict() if self.filename.endswith('.py'): if file is None: return self.pretty_text else: with open(file, 'w', encoding='utf-8') as f: f.write(self.pretty_text) else: import annotator.mmpkg.mmcv as mmcv if file is None: file_format = self.filename.split('.')[-1] return mmcv.dump(cfg_dict, file_format=file_format) else: mmcv.dump(cfg_dict, file) def merge_from_dict(self, options, allow_list_keys=True): """Merge list into cfg_dict. Merge the dict parsed by MultipleKVAction into this cfg. Examples: >>> options = {'model.backbone.depth': 50, ... 'model.backbone.with_cp':True} >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet')))) >>> cfg.merge_from_dict(options) >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') >>> assert cfg_dict == dict( ... model=dict(backbone=dict(depth=50, with_cp=True))) # Merge list element >>> cfg = Config(dict(pipeline=[ ... dict(type='LoadImage'), dict(type='LoadAnnotations')])) >>> options = dict(pipeline={'0': dict(type='SelfLoadImage')}) >>> cfg.merge_from_dict(options, allow_list_keys=True) >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') >>> assert cfg_dict == dict(pipeline=[ ... dict(type='SelfLoadImage'), dict(type='LoadAnnotations')]) Args: options (dict): dict of configs to merge from. allow_list_keys (bool): If True, int string keys (e.g. '0', '1') are allowed in ``options`` and will replace the element of the corresponding index in the config if the config is a list. Default: True. """ option_cfg_dict = {} for full_key, v in options.items(): d = option_cfg_dict key_list = full_key.split('.') for subkey in key_list[:-1]: d.setdefault(subkey, ConfigDict()) d = d[subkey] subkey = key_list[-1] d[subkey] = v cfg_dict = super(Config, self).__getattribute__('_cfg_dict') super(Config, self).__setattr__( '_cfg_dict', Config._merge_a_into_b( option_cfg_dict, cfg_dict, allow_list_keys=allow_list_keys)) class DictAction(Action): """ argparse action to split an argument into KEY=VALUE form on the first = and append to a dictionary. List options can be passed as comma separated values, i.e 'KEY=V1,V2,V3', or with explicit brackets, i.e. 'KEY=[V1,V2,V3]'. It also support nested brackets to build list/tuple values. e.g. 'KEY=[(V1,V2),(V3,V4)]' """ @staticmethod def _parse_int_float_bool(val): try: return int(val) except ValueError: pass try: return float(val) except ValueError: pass if val.lower() in ['true', 'false']: return True if val.lower() == 'true' else False return val @staticmethod def _parse_iterable(val): """Parse iterable values in the string. All elements inside '()' or '[]' are treated as iterable values. Args: val (str): Value string. Returns: list | tuple: The expanded list or tuple from the string. Examples: >>> DictAction._parse_iterable('1,2,3') [1, 2, 3] >>> DictAction._parse_iterable('[a, b, c]') ['a', 'b', 'c'] >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]') [(1, 2, 3), ['a', 'b'], 'c'] """ def find_next_comma(string): """Find the position of next comma in the string. If no ',' is found in the string, return the string length. All chars inside '()' and '[]' are treated as one element and thus ',' inside these brackets are ignored. """ assert (string.count('(') == string.count(')')) and ( string.count('[') == string.count(']')), \ f'Imbalanced brackets exist in {string}' end = len(string) for idx, char in enumerate(string): pre = string[:idx] # The string before this ',' is balanced if ((char == ',') and (pre.count('(') == pre.count(')')) and (pre.count('[') == pre.count(']'))): end = idx break return end # Strip ' and " characters and replace whitespace. val = val.strip('\'\"').replace(' ', '') is_tuple = False if val.startswith('(') and val.endswith(')'): is_tuple = True val = val[1:-1] elif val.startswith('[') and val.endswith(']'): val = val[1:-1] elif ',' not in val: # val is a single value return DictAction._parse_int_float_bool(val) values = [] while len(val) > 0: comma_idx = find_next_comma(val) element = DictAction._parse_iterable(val[:comma_idx]) values.append(element) val = val[comma_idx + 1:] if is_tuple: values = tuple(values) return values def __call__(self, parser, namespace, values, option_string=None): options = {} for kv in values: key, val = kv.split('=', maxsplit=1) options[key] = self._parse_iterable(val) setattr(namespace, self.dest, options) ================================================ FILE: annotator/mmpkg/mmcv/utils/env.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. """This file holding some environment constant for sharing by other files.""" import os.path as osp import subprocess import sys from collections import defaultdict import cv2 import torch import annotator.mmpkg.mmcv as mmcv from .parrots_wrapper import get_build_config def collect_env(): """Collect the information of the running environments. Returns: dict: The environment information. The following fields are contained. - sys.platform: The variable of ``sys.platform``. - Python: Python version. - CUDA available: Bool, indicating if CUDA is available. - GPU devices: Device type of each GPU. - CUDA_HOME (optional): The env var ``CUDA_HOME``. - NVCC (optional): NVCC version. - GCC: GCC version, "n/a" if GCC is not installed. - PyTorch: PyTorch version. - PyTorch compiling details: The output of \ ``torch.__config__.show()``. - TorchVision (optional): TorchVision version. - OpenCV: OpenCV version. - MMCV: MMCV version. - MMCV Compiler: The GCC version for compiling MMCV ops. - MMCV CUDA Compiler: The CUDA version for compiling MMCV ops. """ env_info = {} env_info['sys.platform'] = sys.platform env_info['Python'] = sys.version.replace('\n', '') cuda_available = torch.cuda.is_available() env_info['CUDA available'] = cuda_available if cuda_available: devices = defaultdict(list) for k in range(torch.cuda.device_count()): devices[torch.cuda.get_device_name(k)].append(str(k)) for name, device_ids in devices.items(): env_info['GPU ' + ','.join(device_ids)] = name from annotator.mmpkg.mmcv.utils.parrots_wrapper import _get_cuda_home CUDA_HOME = _get_cuda_home() env_info['CUDA_HOME'] = CUDA_HOME if CUDA_HOME is not None and osp.isdir(CUDA_HOME): try: nvcc = osp.join(CUDA_HOME, 'bin/nvcc') nvcc = subprocess.check_output( f'"{nvcc}" -V | tail -n1', shell=True) nvcc = nvcc.decode('utf-8').strip() except subprocess.SubprocessError: nvcc = 'Not Available' env_info['NVCC'] = nvcc try: gcc = subprocess.check_output('gcc --version | head -n1', shell=True) gcc = gcc.decode('utf-8').strip() env_info['GCC'] = gcc except subprocess.CalledProcessError: # gcc is unavailable env_info['GCC'] = 'n/a' env_info['PyTorch'] = torch.__version__ env_info['PyTorch compiling details'] = get_build_config() try: import torchvision env_info['TorchVision'] = torchvision.__version__ except ModuleNotFoundError: pass env_info['OpenCV'] = cv2.__version__ env_info['MMCV'] = mmcv.__version__ try: from annotator.mmpkg.mmcv.ops import get_compiler_version, get_compiling_cuda_version except ModuleNotFoundError: env_info['MMCV Compiler'] = 'n/a' env_info['MMCV CUDA Compiler'] = 'n/a' else: env_info['MMCV Compiler'] = get_compiler_version() env_info['MMCV CUDA Compiler'] = get_compiling_cuda_version() return env_info ================================================ FILE: annotator/mmpkg/mmcv/utils/ext_loader.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import importlib import os import pkgutil import warnings from collections import namedtuple import torch if torch.__version__ != 'parrots': def load_ext(name, funcs): ext = importlib.import_module('mmcv.' + name) for fun in funcs: assert hasattr(ext, fun), f'{fun} miss in module {name}' return ext else: from parrots import extension from parrots.base import ParrotsException has_return_value_ops = [ 'nms', 'softnms', 'nms_match', 'nms_rotated', 'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward', 'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward', 'right_pool_forward', 'right_pool_backward', 'fused_bias_leakyrelu', 'upfirdn2d', 'ms_deform_attn_forward', 'pixel_group', 'contour_expand', ] def get_fake_func(name, e): def fake_func(*args, **kwargs): warnings.warn(f'{name} is not supported in parrots now') raise e return fake_func def load_ext(name, funcs): ExtModule = namedtuple('ExtModule', funcs) ext_list = [] lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) for fun in funcs: try: ext_fun = extension.load(fun, name, lib_dir=lib_root) except ParrotsException as e: if 'No element registered' not in e.message: warnings.warn(e.message) ext_fun = get_fake_func(fun, e) ext_list.append(ext_fun) else: if fun in has_return_value_ops: ext_list.append(ext_fun.op) else: ext_list.append(ext_fun.op_) return ExtModule(*ext_list) def check_ops_exist(): ext_loader = pkgutil.find_loader('mmcv._ext') return ext_loader is not None ================================================ FILE: annotator/mmpkg/mmcv/utils/logging.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import logging import torch.distributed as dist logger_initialized = {} def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): """Initialize and get a logger by name. If the logger has not been initialized, this method will initialize the logger by adding one or two handlers, otherwise the initialized logger will be directly returned. During initialization, a StreamHandler will always be added. If `log_file` is specified and the process rank is 0, a FileHandler will also be added. Args: name (str): Logger name. log_file (str | None): The log filename. If specified, a FileHandler will be added to the logger. log_level (int): The logger level. Note that only the process of rank 0 is affected, and other processes will set the level to "Error" thus be silent most of the time. file_mode (str): The file mode used in opening log file. Defaults to 'w'. Returns: logging.Logger: The expected logger. """ logger = logging.getLogger(name) if name in logger_initialized: return logger # handle hierarchical names # e.g., logger "a" is initialized, then logger "a.b" will skip the # initialization since it is a child of "a". for logger_name in logger_initialized: if name.startswith(logger_name): return logger # handle duplicate logs to the console # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET) # to the root logger. As logger.propagate is True by default, this root # level handler causes logging messages from rank>0 processes to # unexpectedly show up on the console, creating much unwanted clutter. # To fix this issue, we set the root logger's StreamHandler, if any, to log # at the ERROR level. for handler in logger.root.handlers: if type(handler) is logging.StreamHandler: handler.setLevel(logging.ERROR) stream_handler = logging.StreamHandler() handlers = [stream_handler] if dist.is_available() and dist.is_initialized(): rank = dist.get_rank() else: rank = 0 # only rank 0 will add a FileHandler if rank == 0 and log_file is not None: # Here, the default behaviour of the official logger is 'a'. Thus, we # provide an interface to change the file mode to the default # behaviour. file_handler = logging.FileHandler(log_file, file_mode) handlers.append(file_handler) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') for handler in handlers: handler.setFormatter(formatter) handler.setLevel(log_level) logger.addHandler(handler) if rank == 0: logger.setLevel(log_level) else: logger.setLevel(logging.ERROR) logger_initialized[name] = True return logger def print_log(msg, logger=None, level=logging.INFO): """Print a log message. Args: msg (str): The message to be logged. logger (logging.Logger | str | None): The logger to be used. Some special loggers are: - "silent": no message will be printed. - other str: the logger obtained with `get_root_logger(logger)`. - None: The `print()` method will be used to print log messages. level (int): Logging level. Only available when `logger` is a Logger object or "root". """ if logger is None: print(msg) elif isinstance(logger, logging.Logger): logger.log(level, msg) elif logger == 'silent': pass elif isinstance(logger, str): _logger = get_logger(logger) _logger.log(level, msg) else: raise TypeError( 'logger should be either a logging.Logger object, str, ' f'"silent" or None, but got {type(logger)}') ================================================ FILE: annotator/mmpkg/mmcv/utils/misc.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import collections.abc import functools import itertools import subprocess import warnings from collections import abc from importlib import import_module from inspect import getfullargspec from itertools import repeat # From PyTorch internals def _ntuple(n): def parse(x): if isinstance(x, collections.abc.Iterable): return x return tuple(repeat(x, n)) return parse to_1tuple = _ntuple(1) to_2tuple = _ntuple(2) to_3tuple = _ntuple(3) to_4tuple = _ntuple(4) to_ntuple = _ntuple def is_str(x): """Whether the input is an string instance. Note: This method is deprecated since python 2 is no longer supported. """ return isinstance(x, str) def import_modules_from_strings(imports, allow_failed_imports=False): """Import modules from the given list of strings. Args: imports (list | str | None): The given module names to be imported. allow_failed_imports (bool): If True, the failed imports will return None. Otherwise, an ImportError is raise. Default: False. Returns: list[module] | module | None: The imported modules. Examples: >>> osp, sys = import_modules_from_strings( ... ['os.path', 'sys']) >>> import os.path as osp_ >>> import sys as sys_ >>> assert osp == osp_ >>> assert sys == sys_ """ if not imports: return single_import = False if isinstance(imports, str): single_import = True imports = [imports] if not isinstance(imports, list): raise TypeError( f'custom_imports must be a list but got type {type(imports)}') imported = [] for imp in imports: if not isinstance(imp, str): raise TypeError( f'{imp} is of type {type(imp)} and cannot be imported.') try: imported_tmp = import_module(imp) except ImportError: if allow_failed_imports: warnings.warn(f'{imp} failed to import and is ignored.', UserWarning) imported_tmp = None else: raise ImportError imported.append(imported_tmp) if single_import: imported = imported[0] return imported def iter_cast(inputs, dst_type, return_type=None): """Cast elements of an iterable object into some type. Args: inputs (Iterable): The input object. dst_type (type): Destination type. return_type (type, optional): If specified, the output object will be converted to this type, otherwise an iterator. Returns: iterator or specified type: The converted object. """ if not isinstance(inputs, abc.Iterable): raise TypeError('inputs must be an iterable object') if not isinstance(dst_type, type): raise TypeError('"dst_type" must be a valid type') out_iterable = map(dst_type, inputs) if return_type is None: return out_iterable else: return return_type(out_iterable) def list_cast(inputs, dst_type): """Cast elements of an iterable object into a list of some type. A partial method of :func:`iter_cast`. """ return iter_cast(inputs, dst_type, return_type=list) def tuple_cast(inputs, dst_type): """Cast elements of an iterable object into a tuple of some type. A partial method of :func:`iter_cast`. """ return iter_cast(inputs, dst_type, return_type=tuple) def is_seq_of(seq, expected_type, seq_type=None): """Check whether it is a sequence of some type. Args: seq (Sequence): The sequence to be checked. expected_type (type): Expected type of sequence items. seq_type (type, optional): Expected sequence type. Returns: bool: Whether the sequence is valid. """ if seq_type is None: exp_seq_type = abc.Sequence else: assert isinstance(seq_type, type) exp_seq_type = seq_type if not isinstance(seq, exp_seq_type): return False for item in seq: if not isinstance(item, expected_type): return False return True def is_list_of(seq, expected_type): """Check whether it is a list of some type. A partial method of :func:`is_seq_of`. """ return is_seq_of(seq, expected_type, seq_type=list) def is_tuple_of(seq, expected_type): """Check whether it is a tuple of some type. A partial method of :func:`is_seq_of`. """ return is_seq_of(seq, expected_type, seq_type=tuple) def slice_list(in_list, lens): """Slice a list into several sub lists by a list of given length. Args: in_list (list): The list to be sliced. lens(int or list): The expected length of each out list. Returns: list: A list of sliced list. """ if isinstance(lens, int): assert len(in_list) % lens == 0 lens = [lens] * int(len(in_list) / lens) if not isinstance(lens, list): raise TypeError('"indices" must be an integer or a list of integers') elif sum(lens) != len(in_list): raise ValueError('sum of lens and list length does not ' f'match: {sum(lens)} != {len(in_list)}') out_list = [] idx = 0 for i in range(len(lens)): out_list.append(in_list[idx:idx + lens[i]]) idx += lens[i] return out_list def concat_list(in_list): """Concatenate a list of list into a single list. Args: in_list (list): The list of list to be merged. Returns: list: The concatenated flat list. """ return list(itertools.chain(*in_list)) def check_prerequisites( prerequisites, checker, msg_tmpl='Prerequisites "{}" are required in method "{}" but not ' 'found, please install them first.'): # yapf: disable """A decorator factory to check if prerequisites are satisfied. Args: prerequisites (str of list[str]): Prerequisites to be checked. checker (callable): The checker method that returns True if a prerequisite is meet, False otherwise. msg_tmpl (str): The message template with two variables. Returns: decorator: A specific decorator. """ def wrap(func): @functools.wraps(func) def wrapped_func(*args, **kwargs): requirements = [prerequisites] if isinstance( prerequisites, str) else prerequisites missing = [] for item in requirements: if not checker(item): missing.append(item) if missing: print(msg_tmpl.format(', '.join(missing), func.__name__)) raise RuntimeError('Prerequisites not meet.') else: return func(*args, **kwargs) return wrapped_func return wrap def _check_py_package(package): try: import_module(package) except ImportError: return False else: return True def _check_executable(cmd): if subprocess.call(f'which {cmd}', shell=True) != 0: return False else: return True def requires_package(prerequisites): """A decorator to check if some python packages are installed. Example: >>> @requires_package('numpy') >>> func(arg1, args): >>> return numpy.zeros(1) array([0.]) >>> @requires_package(['numpy', 'non_package']) >>> func(arg1, args): >>> return numpy.zeros(1) ImportError """ return check_prerequisites(prerequisites, checker=_check_py_package) def requires_executable(prerequisites): """A decorator to check if some executable files are installed. Example: >>> @requires_executable('ffmpeg') >>> func(arg1, args): >>> print(1) 1 """ return check_prerequisites(prerequisites, checker=_check_executable) def deprecated_api_warning(name_dict, cls_name=None): """A decorator to check if some arguments are deprecate and try to replace deprecate src_arg_name to dst_arg_name. Args: name_dict(dict): key (str): Deprecate argument names. val (str): Expected argument names. Returns: func: New function. """ def api_warning_wrapper(old_func): @functools.wraps(old_func) def new_func(*args, **kwargs): # get the arg spec of the decorated method args_info = getfullargspec(old_func) # get name of the function func_name = old_func.__name__ if cls_name is not None: func_name = f'{cls_name}.{func_name}' if args: arg_names = args_info.args[:len(args)] for src_arg_name, dst_arg_name in name_dict.items(): if src_arg_name in arg_names: warnings.warn( f'"{src_arg_name}" is deprecated in ' f'`{func_name}`, please use "{dst_arg_name}" ' 'instead') arg_names[arg_names.index(src_arg_name)] = dst_arg_name if kwargs: for src_arg_name, dst_arg_name in name_dict.items(): if src_arg_name in kwargs: assert dst_arg_name not in kwargs, ( f'The expected behavior is to replace ' f'the deprecated key `{src_arg_name}` to ' f'new key `{dst_arg_name}`, but got them ' f'in the arguments at the same time, which ' f'is confusing. `{src_arg_name} will be ' f'deprecated in the future, please ' f'use `{dst_arg_name}` instead.') warnings.warn( f'"{src_arg_name}" is deprecated in ' f'`{func_name}`, please use "{dst_arg_name}" ' 'instead') kwargs[dst_arg_name] = kwargs.pop(src_arg_name) # apply converted arguments to the decorated method output = old_func(*args, **kwargs) return output return new_func return api_warning_wrapper def is_method_overridden(method, base_class, derived_class): """Check if a method of base class is overridden in derived class. Args: method (str): the method name to check. base_class (type): the class of the base class. derived_class (type | Any): the class or instance of the derived class. """ assert isinstance(base_class, type), \ "base_class doesn't accept instance, Please pass class instead." if not isinstance(derived_class, type): derived_class = derived_class.__class__ base_method = getattr(base_class, method) derived_method = getattr(derived_class, method) return derived_method != base_method def has_method(obj: object, method: str) -> bool: """Check whether the object has a method. Args: method (str): The method name to check. obj (object): The object to check. Returns: bool: True if the object has the method else False. """ return hasattr(obj, method) and callable(getattr(obj, method)) ================================================ FILE: annotator/mmpkg/mmcv/utils/parrots_jit.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os from .parrots_wrapper import TORCH_VERSION parrots_jit_option = os.getenv('PARROTS_JIT_OPTION') if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON': from parrots.jit import pat as jit else: def jit(func=None, check_input=None, full_shape=True, derivate=False, coderize=False, optimize=False): def wrapper(func): def wrapper_inner(*args, **kargs): return func(*args, **kargs) return wrapper_inner if func is None: return wrapper else: return func if TORCH_VERSION == 'parrots': from parrots.utils.tester import skip_no_elena else: def skip_no_elena(func): def wrapper(*args, **kargs): return func(*args, **kargs) return wrapper ================================================ FILE: annotator/mmpkg/mmcv/utils/parrots_wrapper.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from functools import partial import torch TORCH_VERSION = torch.__version__ def is_rocm_pytorch() -> bool: is_rocm = False if TORCH_VERSION != 'parrots': try: from torch.utils.cpp_extension import ROCM_HOME is_rocm = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False except ImportError: pass return is_rocm def _get_cuda_home(): if TORCH_VERSION == 'parrots': from parrots.utils.build_extension import CUDA_HOME else: if is_rocm_pytorch(): from torch.utils.cpp_extension import ROCM_HOME CUDA_HOME = ROCM_HOME else: from torch.utils.cpp_extension import CUDA_HOME return CUDA_HOME def get_build_config(): if TORCH_VERSION == 'parrots': from parrots.config import get_build_info return get_build_info() else: return torch.__config__.show() def _get_conv(): if TORCH_VERSION == 'parrots': from parrots.nn.modules.conv import _ConvNd, _ConvTransposeMixin else: from torch.nn.modules.conv import _ConvNd, _ConvTransposeMixin return _ConvNd, _ConvTransposeMixin def _get_dataloader(): if TORCH_VERSION == 'parrots': from torch.utils.data import DataLoader, PoolDataLoader else: from torch.utils.data import DataLoader PoolDataLoader = DataLoader return DataLoader, PoolDataLoader def _get_extension(): if TORCH_VERSION == 'parrots': from parrots.utils.build_extension import BuildExtension, Extension CppExtension = partial(Extension, cuda=False) CUDAExtension = partial(Extension, cuda=True) else: from torch.utils.cpp_extension import (BuildExtension, CppExtension, CUDAExtension) return BuildExtension, CppExtension, CUDAExtension def _get_pool(): if TORCH_VERSION == 'parrots': from parrots.nn.modules.pool import (_AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd) else: from torch.nn.modules.pooling import (_AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd) return _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd def _get_norm(): if TORCH_VERSION == 'parrots': from parrots.nn.modules.batchnorm import _BatchNorm, _InstanceNorm SyncBatchNorm_ = torch.nn.SyncBatchNorm2d else: from torch.nn.modules.instancenorm import _InstanceNorm from torch.nn.modules.batchnorm import _BatchNorm SyncBatchNorm_ = torch.nn.SyncBatchNorm return _BatchNorm, _InstanceNorm, SyncBatchNorm_ _ConvNd, _ConvTransposeMixin = _get_conv() DataLoader, PoolDataLoader = _get_dataloader() BuildExtension, CppExtension, CUDAExtension = _get_extension() _BatchNorm, _InstanceNorm, SyncBatchNorm_ = _get_norm() _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd = _get_pool() class SyncBatchNorm(SyncBatchNorm_): def _check_input_dim(self, input): if TORCH_VERSION == 'parrots': if input.dim() < 2: raise ValueError( f'expected at least 2D input (got {input.dim()}D input)') else: super()._check_input_dim(input) ================================================ FILE: annotator/mmpkg/mmcv/utils/path.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os import os.path as osp from pathlib import Path from .misc import is_str def is_filepath(x): return is_str(x) or isinstance(x, Path) def fopen(filepath, *args, **kwargs): if is_str(filepath): return open(filepath, *args, **kwargs) elif isinstance(filepath, Path): return filepath.open(*args, **kwargs) raise ValueError('`filepath` should be a string or a Path') def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): if not osp.isfile(filename): raise FileNotFoundError(msg_tmpl.format(filename)) def mkdir_or_exist(dir_name, mode=0o777): if dir_name == '': return dir_name = osp.expanduser(dir_name) os.makedirs(dir_name, mode=mode, exist_ok=True) def symlink(src, dst, overwrite=True, **kwargs): if os.path.lexists(dst) and overwrite: os.remove(dst) os.symlink(src, dst, **kwargs) def scandir(dir_path, suffix=None, recursive=False, case_sensitive=True): """Scan a directory to find the interested files. Args: dir_path (str | obj:`Path`): Path of the directory. suffix (str | tuple(str), optional): File suffix that we are interested in. Default: None. recursive (bool, optional): If set to True, recursively scan the directory. Default: False. case_sensitive (bool, optional) : If set to False, ignore the case of suffix. Default: True. Returns: A generator for all the interested files with relative paths. """ if isinstance(dir_path, (str, Path)): dir_path = str(dir_path) else: raise TypeError('"dir_path" must be a string or Path object') if (suffix is not None) and not isinstance(suffix, (str, tuple)): raise TypeError('"suffix" must be a string or tuple of strings') if suffix is not None and not case_sensitive: suffix = suffix.lower() if isinstance(suffix, str) else tuple( item.lower() for item in suffix) root = dir_path def _scandir(dir_path, suffix, recursive, case_sensitive): for entry in os.scandir(dir_path): if not entry.name.startswith('.') and entry.is_file(): rel_path = osp.relpath(entry.path, root) _rel_path = rel_path if case_sensitive else rel_path.lower() if suffix is None or _rel_path.endswith(suffix): yield rel_path elif recursive and os.path.isdir(entry.path): # scan recursively if entry.path is a directory yield from _scandir(entry.path, suffix, recursive, case_sensitive) return _scandir(dir_path, suffix, recursive, case_sensitive) def find_vcs_root(path, markers=('.git', )): """Finds the root directory (including itself) of specified markers. Args: path (str): Path of directory or file. markers (list[str], optional): List of file or directory names. Returns: The directory contained one of the markers or None if not found. """ if osp.isfile(path): path = osp.dirname(path) prev, cur = None, osp.abspath(osp.expanduser(path)) while cur != prev: if any(osp.exists(osp.join(cur, marker)) for marker in markers): return cur prev, cur = cur, osp.split(cur)[0] return None ================================================ FILE: annotator/mmpkg/mmcv/utils/progressbar.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import sys from collections.abc import Iterable from multiprocessing import Pool from shutil import get_terminal_size from .timer import Timer class ProgressBar: """A progress bar which can print the progress.""" def __init__(self, task_num=0, bar_width=50, start=True, file=sys.stdout): self.task_num = task_num self.bar_width = bar_width self.completed = 0 self.file = file if start: self.start() @property def terminal_width(self): width, _ = get_terminal_size() return width def start(self): if self.task_num > 0: self.file.write(f'[{" " * self.bar_width}] 0/{self.task_num}, ' 'elapsed: 0s, ETA:') else: self.file.write('completed: 0, elapsed: 0s') self.file.flush() self.timer = Timer() def update(self, num_tasks=1): assert num_tasks > 0 self.completed += num_tasks elapsed = self.timer.since_start() if elapsed > 0: fps = self.completed / elapsed else: fps = float('inf') if self.task_num > 0: percentage = self.completed / float(self.task_num) eta = int(elapsed * (1 - percentage) / percentage + 0.5) msg = f'\r[{{}}] {self.completed}/{self.task_num}, ' \ f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, ' \ f'ETA: {eta:5}s' bar_width = min(self.bar_width, int(self.terminal_width - len(msg)) + 2, int(self.terminal_width * 0.6)) bar_width = max(2, bar_width) mark_width = int(bar_width * percentage) bar_chars = '>' * mark_width + ' ' * (bar_width - mark_width) self.file.write(msg.format(bar_chars)) else: self.file.write( f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s,' f' {fps:.1f} tasks/s') self.file.flush() def track_progress(func, tasks, bar_width=50, file=sys.stdout, **kwargs): """Track the progress of tasks execution with a progress bar. Tasks are done with a simple for-loop. Args: func (callable): The function to be applied to each task. tasks (list or tuple[Iterable, int]): A list of tasks or (tasks, total num). bar_width (int): Width of progress bar. Returns: list: The task results. """ if isinstance(tasks, tuple): assert len(tasks) == 2 assert isinstance(tasks[0], Iterable) assert isinstance(tasks[1], int) task_num = tasks[1] tasks = tasks[0] elif isinstance(tasks, Iterable): task_num = len(tasks) else: raise TypeError( '"tasks" must be an iterable object or a (iterator, int) tuple') prog_bar = ProgressBar(task_num, bar_width, file=file) results = [] for task in tasks: results.append(func(task, **kwargs)) prog_bar.update() prog_bar.file.write('\n') return results def init_pool(process_num, initializer=None, initargs=None): if initializer is None: return Pool(process_num) elif initargs is None: return Pool(process_num, initializer) else: if not isinstance(initargs, tuple): raise TypeError('"initargs" must be a tuple') return Pool(process_num, initializer, initargs) def track_parallel_progress(func, tasks, nproc, initializer=None, initargs=None, bar_width=50, chunksize=1, skip_first=False, keep_order=True, file=sys.stdout): """Track the progress of parallel task execution with a progress bar. The built-in :mod:`multiprocessing` module is used for process pools and tasks are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. Args: func (callable): The function to be applied to each task. tasks (list or tuple[Iterable, int]): A list of tasks or (tasks, total num). nproc (int): Process (worker) number. initializer (None or callable): Refer to :class:`multiprocessing.Pool` for details. initargs (None or tuple): Refer to :class:`multiprocessing.Pool` for details. chunksize (int): Refer to :class:`multiprocessing.Pool` for details. bar_width (int): Width of progress bar. skip_first (bool): Whether to skip the first sample for each worker when estimating fps, since the initialization step may takes longer. keep_order (bool): If True, :func:`Pool.imap` is used, otherwise :func:`Pool.imap_unordered` is used. Returns: list: The task results. """ if isinstance(tasks, tuple): assert len(tasks) == 2 assert isinstance(tasks[0], Iterable) assert isinstance(tasks[1], int) task_num = tasks[1] tasks = tasks[0] elif isinstance(tasks, Iterable): task_num = len(tasks) else: raise TypeError( '"tasks" must be an iterable object or a (iterator, int) tuple') pool = init_pool(nproc, initializer, initargs) start = not skip_first task_num -= nproc * chunksize * int(skip_first) prog_bar = ProgressBar(task_num, bar_width, start, file=file) results = [] if keep_order: gen = pool.imap(func, tasks, chunksize) else: gen = pool.imap_unordered(func, tasks, chunksize) for result in gen: results.append(result) if skip_first: if len(results) < nproc * chunksize: continue elif len(results) == nproc * chunksize: prog_bar.start() continue prog_bar.update() prog_bar.file.write('\n') pool.close() pool.join() return results def track_iter_progress(tasks, bar_width=50, file=sys.stdout): """Track the progress of tasks iteration or enumeration with a progress bar. Tasks are yielded with a simple for-loop. Args: tasks (list or tuple[Iterable, int]): A list of tasks or (tasks, total num). bar_width (int): Width of progress bar. Yields: list: The task results. """ if isinstance(tasks, tuple): assert len(tasks) == 2 assert isinstance(tasks[0], Iterable) assert isinstance(tasks[1], int) task_num = tasks[1] tasks = tasks[0] elif isinstance(tasks, Iterable): task_num = len(tasks) else: raise TypeError( '"tasks" must be an iterable object or a (iterator, int) tuple') prog_bar = ProgressBar(task_num, bar_width, file=file) for task in tasks: yield task prog_bar.update() prog_bar.file.write('\n') ================================================ FILE: annotator/mmpkg/mmcv/utils/registry.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import inspect import warnings from functools import partial from .misc import is_seq_of def build_from_cfg(cfg, registry, default_args=None): """Build a module from config dict. Args: cfg (dict): Config dict. It should at least contain the key "type". registry (:obj:`Registry`): The registry to search the type from. default_args (dict, optional): Default initialization arguments. Returns: object: The constructed object. """ if not isinstance(cfg, dict): raise TypeError(f'cfg must be a dict, but got {type(cfg)}') if 'type' not in cfg: if default_args is None or 'type' not in default_args: raise KeyError( '`cfg` or `default_args` must contain the key "type", ' f'but got {cfg}\n{default_args}') if not isinstance(registry, Registry): raise TypeError('registry must be an mmcv.Registry object, ' f'but got {type(registry)}') if not (isinstance(default_args, dict) or default_args is None): raise TypeError('default_args must be a dict or None, ' f'but got {type(default_args)}') args = cfg.copy() if default_args is not None: for name, value in default_args.items(): args.setdefault(name, value) obj_type = args.pop('type') if isinstance(obj_type, str): obj_cls = registry.get(obj_type) if obj_cls is None: raise KeyError( f'{obj_type} is not in the {registry.name} registry') elif inspect.isclass(obj_type): obj_cls = obj_type else: raise TypeError( f'type must be a str or valid type, but got {type(obj_type)}') try: return obj_cls(**args) except Exception as e: # Normal TypeError does not print class name. raise type(e)(f'{obj_cls.__name__}: {e}') class Registry: """A registry to map strings to classes. Registered object could be built from registry. Example: >>> MODELS = Registry('models') >>> @MODELS.register_module() >>> class ResNet: >>> pass >>> resnet = MODELS.build(dict(type='ResNet')) Please refer to https://mmcv.readthedocs.io/en/latest/understand_mmcv/registry.html for advanced usage. Args: name (str): Registry name. build_func(func, optional): Build function to construct instance from Registry, func:`build_from_cfg` is used if neither ``parent`` or ``build_func`` is specified. If ``parent`` is specified and ``build_func`` is not given, ``build_func`` will be inherited from ``parent``. Default: None. parent (Registry, optional): Parent registry. The class registered in children registry could be built from parent. Default: None. scope (str, optional): The scope of registry. It is the key to search for children registry. If not specified, scope will be the name of the package where class is defined, e.g. mmdet, mmcls, mmseg. Default: None. """ def __init__(self, name, build_func=None, parent=None, scope=None): self._name = name self._module_dict = dict() self._children = dict() self._scope = self.infer_scope() if scope is None else scope # self.build_func will be set with the following priority: # 1. build_func # 2. parent.build_func # 3. build_from_cfg if build_func is None: if parent is not None: self.build_func = parent.build_func else: self.build_func = build_from_cfg else: self.build_func = build_func if parent is not None: assert isinstance(parent, Registry) parent._add_children(self) self.parent = parent else: self.parent = None def __len__(self): return len(self._module_dict) def __contains__(self, key): return self.get(key) is not None def __repr__(self): format_str = self.__class__.__name__ + \ f'(name={self._name}, ' \ f'items={self._module_dict})' return format_str @staticmethod def infer_scope(): """Infer the scope of registry. The name of the package where registry is defined will be returned. Example: # in mmdet/models/backbone/resnet.py >>> MODELS = Registry('models') >>> @MODELS.register_module() >>> class ResNet: >>> pass The scope of ``ResNet`` will be ``mmdet``. Returns: scope (str): The inferred scope name. """ # inspect.stack() trace where this function is called, the index-2 # indicates the frame where `infer_scope()` is called filename = inspect.getmodule(inspect.stack()[2][0]).__name__ split_filename = filename.split('.') return split_filename[0] @staticmethod def split_scope_key(key): """Split scope and key. The first scope will be split from key. Examples: >>> Registry.split_scope_key('mmdet.ResNet') 'mmdet', 'ResNet' >>> Registry.split_scope_key('ResNet') None, 'ResNet' Return: scope (str, None): The first scope. key (str): The remaining key. """ split_index = key.find('.') if split_index != -1: return key[:split_index], key[split_index + 1:] else: return None, key @property def name(self): return self._name @property def scope(self): return self._scope @property def module_dict(self): return self._module_dict @property def children(self): return self._children def get(self, key): """Get the registry record. Args: key (str): The class name in string format. Returns: class: The corresponding class. """ scope, real_key = self.split_scope_key(key) if scope is None or scope == self._scope: # get from self if real_key in self._module_dict: return self._module_dict[real_key] else: # get from self._children if scope in self._children: return self._children[scope].get(real_key) else: # goto root parent = self.parent while parent.parent is not None: parent = parent.parent return parent.get(key) def build(self, *args, **kwargs): return self.build_func(*args, **kwargs, registry=self) def _add_children(self, registry): """Add children for a registry. The ``registry`` will be added as children based on its scope. The parent registry could build objects from children registry. Example: >>> models = Registry('models') >>> mmdet_models = Registry('models', parent=models) >>> @mmdet_models.register_module() >>> class ResNet: >>> pass >>> resnet = models.build(dict(type='mmdet.ResNet')) """ assert isinstance(registry, Registry) assert registry.scope is not None assert registry.scope not in self.children, \ f'scope {registry.scope} exists in {self.name} registry' self.children[registry.scope] = registry def _register_module(self, module_class, module_name=None, force=False): if not inspect.isclass(module_class): raise TypeError('module must be a class, ' f'but got {type(module_class)}') if module_name is None: module_name = module_class.__name__ if isinstance(module_name, str): module_name = [module_name] for name in module_name: if not force and name in self._module_dict: raise KeyError(f'{name} is already registered ' f'in {self.name}') self._module_dict[name] = module_class def deprecated_register_module(self, cls=None, force=False): warnings.warn( 'The old API of register_module(module, force=False) ' 'is deprecated and will be removed, please use the new API ' 'register_module(name=None, force=False, module=None) instead.') if cls is None: return partial(self.deprecated_register_module, force=force) self._register_module(cls, force=force) return cls def register_module(self, name=None, force=False, module=None): """Register a module. A record will be added to `self._module_dict`, whose key is the class name or the specified name, and value is the class itself. It can be used as a decorator or a normal function. Example: >>> backbones = Registry('backbone') >>> @backbones.register_module() >>> class ResNet: >>> pass >>> backbones = Registry('backbone') >>> @backbones.register_module(name='mnet') >>> class MobileNet: >>> pass >>> backbones = Registry('backbone') >>> class ResNet: >>> pass >>> backbones.register_module(ResNet) Args: name (str | None): The module name to be registered. If not specified, the class name will be used. force (bool, optional): Whether to override an existing class with the same name. Default: False. module (type): Module class to be registered. """ if not isinstance(force, bool): raise TypeError(f'force must be a boolean, but got {type(force)}') # NOTE: This is a walkaround to be compatible with the old api, # while it may introduce unexpected bugs. if isinstance(name, type): return self.deprecated_register_module(name, force=force) # raise the error ahead of time if not (name is None or isinstance(name, str) or is_seq_of(name, str)): raise TypeError( 'name must be either of None, an instance of str or a sequence' f' of str, but got {type(name)}') # use it as a normal method: x.register_module(module=SomeClass) if module is not None: self._register_module( module_class=module, module_name=name, force=force) return module # use it as a decorator: @x.register_module() def _register(cls): self._register_module( module_class=cls, module_name=name, force=force) return cls return _register ================================================ FILE: annotator/mmpkg/mmcv/utils/testing.py ================================================ # Copyright (c) Open-MMLab. import sys from collections.abc import Iterable from runpy import run_path from shlex import split from typing import Any, Dict, List from unittest.mock import patch def check_python_script(cmd): """Run the python cmd script with `__main__`. The difference between `os.system` is that, this function exectues code in the current process, so that it can be tracked by coverage tools. Currently it supports two forms: - ./tests/data/scripts/hello.py zz - python tests/data/scripts/hello.py zz """ args = split(cmd) if args[0] == 'python': args = args[1:] with patch.object(sys, 'argv', args): run_path(args[0], run_name='__main__') def _any(judge_result): """Since built-in ``any`` works only when the element of iterable is not iterable, implement the function.""" if not isinstance(judge_result, Iterable): return judge_result try: for element in judge_result: if _any(element): return True except TypeError: # Maybe encounter the case: torch.tensor(True) | torch.tensor(False) if judge_result: return True return False def assert_dict_contains_subset(dict_obj: Dict[Any, Any], expected_subset: Dict[Any, Any]) -> bool: """Check if the dict_obj contains the expected_subset. Args: dict_obj (Dict[Any, Any]): Dict object to be checked. expected_subset (Dict[Any, Any]): Subset expected to be contained in dict_obj. Returns: bool: Whether the dict_obj contains the expected_subset. """ for key, value in expected_subset.items(): if key not in dict_obj.keys() or _any(dict_obj[key] != value): return False return True def assert_attrs_equal(obj: Any, expected_attrs: Dict[str, Any]) -> bool: """Check if attribute of class object is correct. Args: obj (object): Class object to be checked. expected_attrs (Dict[str, Any]): Dict of the expected attrs. Returns: bool: Whether the attribute of class object is correct. """ for attr, value in expected_attrs.items(): if not hasattr(obj, attr) or _any(getattr(obj, attr) != value): return False return True def assert_dict_has_keys(obj: Dict[str, Any], expected_keys: List[str]) -> bool: """Check if the obj has all the expected_keys. Args: obj (Dict[str, Any]): Object to be checked. expected_keys (List[str]): Keys expected to contained in the keys of the obj. Returns: bool: Whether the obj has the expected keys. """ return set(expected_keys).issubset(set(obj.keys())) def assert_keys_equal(result_keys: List[str], target_keys: List[str]) -> bool: """Check if target_keys is equal to result_keys. Args: result_keys (List[str]): Result keys to be checked. target_keys (List[str]): Target keys to be checked. Returns: bool: Whether target_keys is equal to result_keys. """ return set(result_keys) == set(target_keys) def assert_is_norm_layer(module) -> bool: """Check if the module is a norm layer. Args: module (nn.Module): The module to be checked. Returns: bool: Whether the module is a norm layer. """ from .parrots_wrapper import _BatchNorm, _InstanceNorm from torch.nn import GroupNorm, LayerNorm norm_layer_candidates = (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm) return isinstance(module, norm_layer_candidates) def assert_params_all_zeros(module) -> bool: """Check if the parameters of the module is all zeros. Args: module (nn.Module): The module to be checked. Returns: bool: Whether the parameters of the module is all zeros. """ weight_data = module.weight.data is_weight_zero = weight_data.allclose( weight_data.new_zeros(weight_data.size())) if hasattr(module, 'bias') and module.bias is not None: bias_data = module.bias.data is_bias_zero = bias_data.allclose( bias_data.new_zeros(bias_data.size())) else: is_bias_zero = True return is_weight_zero and is_bias_zero ================================================ FILE: annotator/mmpkg/mmcv/utils/timer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from time import time class TimerError(Exception): def __init__(self, message): self.message = message super(TimerError, self).__init__(message) class Timer: """A flexible Timer class. :Example: >>> import time >>> import annotator.mmpkg.mmcv as mmcv >>> with mmcv.Timer(): >>> # simulate a code block that will run for 1s >>> time.sleep(1) 1.000 >>> with mmcv.Timer(print_tmpl='it takes {:.1f} seconds'): >>> # simulate a code block that will run for 1s >>> time.sleep(1) it takes 1.0 seconds >>> timer = mmcv.Timer() >>> time.sleep(0.5) >>> print(timer.since_start()) 0.500 >>> time.sleep(0.5) >>> print(timer.since_last_check()) 0.500 >>> print(timer.since_start()) 1.000 """ def __init__(self, start=True, print_tmpl=None): self._is_running = False self.print_tmpl = print_tmpl if print_tmpl else '{:.3f}' if start: self.start() @property def is_running(self): """bool: indicate whether the timer is running""" return self._is_running def __enter__(self): self.start() return self def __exit__(self, type, value, traceback): print(self.print_tmpl.format(self.since_last_check())) self._is_running = False def start(self): """Start the timer.""" if not self._is_running: self._t_start = time() self._is_running = True self._t_last = time() def since_start(self): """Total time since the timer is started. Returns (float): Time in seconds. """ if not self._is_running: raise TimerError('timer is not running') self._t_last = time() return self._t_last - self._t_start def since_last_check(self): """Time since the last checking. Either :func:`since_start` or :func:`since_last_check` is a checking operation. Returns (float): Time in seconds. """ if not self._is_running: raise TimerError('timer is not running') dur = time() - self._t_last self._t_last = time() return dur _g_timers = {} # global timers def check_time(timer_id): """Add check points in a single line. This method is suitable for running a task on a list of items. A timer will be registered when the method is called for the first time. :Example: >>> import time >>> import annotator.mmpkg.mmcv as mmcv >>> for i in range(1, 6): >>> # simulate a code block >>> time.sleep(i) >>> mmcv.check_time('task1') 2.000 3.000 4.000 5.000 Args: timer_id (str): Timer identifier. """ if timer_id not in _g_timers: _g_timers[timer_id] = Timer() return 0 else: return _g_timers[timer_id].since_last_check() ================================================ FILE: annotator/mmpkg/mmcv/utils/trace.py ================================================ import warnings import torch from annotator.mmpkg.mmcv.utils import digit_version def is_jit_tracing() -> bool: if (torch.__version__ != 'parrots' and digit_version(torch.__version__) >= digit_version('1.6.0')): on_trace = torch.jit.is_tracing() # In PyTorch 1.6, torch.jit.is_tracing has a bug. # Refers to https://github.com/pytorch/pytorch/issues/42448 if isinstance(on_trace, bool): return on_trace else: return torch._C._is_tracing() else: warnings.warn( 'torch.jit.is_tracing is only supported after v1.6.0. ' 'Therefore is_tracing returns False automatically. Please ' 'set on_trace manually if you are using trace.', UserWarning) return False ================================================ FILE: annotator/mmpkg/mmcv/utils/version_utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os import subprocess import warnings from packaging.version import parse def digit_version(version_str: str, length: int = 4): """Convert a version string into a tuple of integers. This method is usually used for comparing two versions. For pre-release versions: alpha < beta < rc. Args: version_str (str): The version string. length (int): The maximum number of version levels. Default: 4. Returns: tuple[int]: The version info in digits (integers). """ assert 'parrots' not in version_str version = parse(version_str) assert version.release, f'failed to parse version {version_str}' release = list(version.release) release = release[:length] if len(release) < length: release = release + [0] * (length - len(release)) if version.is_prerelease: mapping = {'a': -3, 'b': -2, 'rc': -1} val = -4 # version.pre can be None if version.pre: if version.pre[0] not in mapping: warnings.warn(f'unknown prerelease version {version.pre[0]}, ' 'version checking may go wrong') else: val = mapping[version.pre[0]] release.extend([val, version.pre[-1]]) else: release.extend([val, 0]) elif version.is_postrelease: release.extend([1, version.post]) else: release.extend([0, 0]) return tuple(release) def _minimal_ext_cmd(cmd): # construct minimal environment env = {} for k in ['SYSTEMROOT', 'PATH', 'HOME']: v = os.environ.get(k) if v is not None: env[k] = v # LANGUAGE is used on win32 env['LANGUAGE'] = 'C' env['LANG'] = 'C' env['LC_ALL'] = 'C' out = subprocess.Popen( cmd, stdout=subprocess.PIPE, env=env).communicate()[0] return out def get_git_hash(fallback='unknown', digits=None): """Get the git hash of the current repo. Args: fallback (str, optional): The fallback string when git hash is unavailable. Defaults to 'unknown'. digits (int, optional): kept digits of the hash. Defaults to None, meaning all digits are kept. Returns: str: Git commit hash. """ if digits is not None and not isinstance(digits, int): raise TypeError('digits must be None or an integer') try: out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) sha = out.strip().decode('ascii') if digits is not None: sha = sha[:digits] except OSError: sha = fallback return sha ================================================ FILE: annotator/mmpkg/mmcv/version.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. __version__ = '1.3.17' def parse_version_info(version_str: str, length: int = 4) -> tuple: """Parse a version string into a tuple. Args: version_str (str): The version string. length (int): The maximum number of version levels. Default: 4. Returns: tuple[int | str]: The version info, e.g., "1.3.0" is parsed into (1, 3, 0, 0, 0, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 0, 'rc', 1) (when length is set to 4). """ from packaging.version import parse version = parse(version_str) assert version.release, f'failed to parse version {version_str}' release = list(version.release) release = release[:length] if len(release) < length: release = release + [0] * (length - len(release)) if version.is_prerelease: release.extend(list(version.pre)) elif version.is_postrelease: release.extend(list(version.post)) else: release.extend([0, 0]) return tuple(release) version_info = tuple(int(x) for x in __version__.split('.')[:3]) __all__ = ['__version__', 'version_info', 'parse_version_info'] ================================================ FILE: annotator/mmpkg/mmcv/video/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .io import Cache, VideoReader, frames2video from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread, flowwrite, quantize_flow, sparse_flow_from_bytes) from .processing import concat_video, convert_video, cut_video, resize_video __all__ = [ 'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video', 'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow', 'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes' ] ================================================ FILE: annotator/mmpkg/mmcv/video/io.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp from collections import OrderedDict import cv2 from cv2 import (CAP_PROP_FOURCC, CAP_PROP_FPS, CAP_PROP_FRAME_COUNT, CAP_PROP_FRAME_HEIGHT, CAP_PROP_FRAME_WIDTH, CAP_PROP_POS_FRAMES, VideoWriter_fourcc) from annotator.mmpkg.mmcv.utils import (check_file_exist, mkdir_or_exist, scandir, track_progress) class Cache: def __init__(self, capacity): self._cache = OrderedDict() self._capacity = int(capacity) if capacity <= 0: raise ValueError('capacity must be a positive integer') @property def capacity(self): return self._capacity @property def size(self): return len(self._cache) def put(self, key, val): if key in self._cache: return if len(self._cache) >= self.capacity: self._cache.popitem(last=False) self._cache[key] = val def get(self, key, default=None): val = self._cache[key] if key in self._cache else default return val class VideoReader: """Video class with similar usage to a list object. This video warpper class provides convenient apis to access frames. There exists an issue of OpenCV's VideoCapture class that jumping to a certain frame may be inaccurate. It is fixed in this class by checking the position after jumping each time. Cache is used when decoding videos. So if the same frame is visited for the second time, there is no need to decode again if it is stored in the cache. :Example: >>> import annotator.mmpkg.mmcv as mmcv >>> v = mmcv.VideoReader('sample.mp4') >>> len(v) # get the total frame number with `len()` 120 >>> for img in v: # v is iterable >>> mmcv.imshow(img) >>> v[5] # get the 6th frame """ def __init__(self, filename, cache_capacity=10): # Check whether the video path is a url if not filename.startswith(('https://', 'http://')): check_file_exist(filename, 'Video file not found: ' + filename) self._vcap = cv2.VideoCapture(filename) assert cache_capacity > 0 self._cache = Cache(cache_capacity) self._position = 0 # get basic info self._width = int(self._vcap.get(CAP_PROP_FRAME_WIDTH)) self._height = int(self._vcap.get(CAP_PROP_FRAME_HEIGHT)) self._fps = self._vcap.get(CAP_PROP_FPS) self._frame_cnt = int(self._vcap.get(CAP_PROP_FRAME_COUNT)) self._fourcc = self._vcap.get(CAP_PROP_FOURCC) @property def vcap(self): """:obj:`cv2.VideoCapture`: The raw VideoCapture object.""" return self._vcap @property def opened(self): """bool: Indicate whether the video is opened.""" return self._vcap.isOpened() @property def width(self): """int: Width of video frames.""" return self._width @property def height(self): """int: Height of video frames.""" return self._height @property def resolution(self): """tuple: Video resolution (width, height).""" return (self._width, self._height) @property def fps(self): """float: FPS of the video.""" return self._fps @property def frame_cnt(self): """int: Total frames of the video.""" return self._frame_cnt @property def fourcc(self): """str: "Four character code" of the video.""" return self._fourcc @property def position(self): """int: Current cursor position, indicating frame decoded.""" return self._position def _get_real_position(self): return int(round(self._vcap.get(CAP_PROP_POS_FRAMES))) def _set_real_position(self, frame_id): self._vcap.set(CAP_PROP_POS_FRAMES, frame_id) pos = self._get_real_position() for _ in range(frame_id - pos): self._vcap.read() self._position = frame_id def read(self): """Read the next frame. If the next frame have been decoded before and in the cache, then return it directly, otherwise decode, cache and return it. Returns: ndarray or None: Return the frame if successful, otherwise None. """ # pos = self._position if self._cache: img = self._cache.get(self._position) if img is not None: ret = True else: if self._position != self._get_real_position(): self._set_real_position(self._position) ret, img = self._vcap.read() if ret: self._cache.put(self._position, img) else: ret, img = self._vcap.read() if ret: self._position += 1 return img def get_frame(self, frame_id): """Get frame by index. Args: frame_id (int): Index of the expected frame, 0-based. Returns: ndarray or None: Return the frame if successful, otherwise None. """ if frame_id < 0 or frame_id >= self._frame_cnt: raise IndexError( f'"frame_id" must be between 0 and {self._frame_cnt - 1}') if frame_id == self._position: return self.read() if self._cache: img = self._cache.get(frame_id) if img is not None: self._position = frame_id + 1 return img self._set_real_position(frame_id) ret, img = self._vcap.read() if ret: if self._cache: self._cache.put(self._position, img) self._position += 1 return img def current_frame(self): """Get the current frame (frame that is just visited). Returns: ndarray or None: If the video is fresh, return None, otherwise return the frame. """ if self._position == 0: return None return self._cache.get(self._position - 1) def cvt2frames(self, frame_dir, file_start=0, filename_tmpl='{:06d}.jpg', start=0, max_num=0, show_progress=True): """Convert a video to frame images. Args: frame_dir (str): Output directory to store all the frame images. file_start (int): Filenames will start from the specified number. filename_tmpl (str): Filename template with the index as the placeholder. start (int): The starting frame index. max_num (int): Maximum number of frames to be written. show_progress (bool): Whether to show a progress bar. """ mkdir_or_exist(frame_dir) if max_num == 0: task_num = self.frame_cnt - start else: task_num = min(self.frame_cnt - start, max_num) if task_num <= 0: raise ValueError('start must be less than total frame number') if start > 0: self._set_real_position(start) def write_frame(file_idx): img = self.read() if img is None: return filename = osp.join(frame_dir, filename_tmpl.format(file_idx)) cv2.imwrite(filename, img) if show_progress: track_progress(write_frame, range(file_start, file_start + task_num)) else: for i in range(task_num): write_frame(file_start + i) def __len__(self): return self.frame_cnt def __getitem__(self, index): if isinstance(index, slice): return [ self.get_frame(i) for i in range(*index.indices(self.frame_cnt)) ] # support negative indexing if index < 0: index += self.frame_cnt if index < 0: raise IndexError('index out of range') return self.get_frame(index) def __iter__(self): self._set_real_position(0) return self def __next__(self): img = self.read() if img is not None: return img else: raise StopIteration next = __next__ def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self._vcap.release() def frames2video(frame_dir, video_file, fps=30, fourcc='XVID', filename_tmpl='{:06d}.jpg', start=0, end=0, show_progress=True): """Read the frame images from a directory and join them as a video. Args: frame_dir (str): The directory containing video frames. video_file (str): Output filename. fps (float): FPS of the output video. fourcc (str): Fourcc of the output video, this should be compatible with the output file type. filename_tmpl (str): Filename template with the index as the variable. start (int): Starting frame index. end (int): Ending frame index. show_progress (bool): Whether to show a progress bar. """ if end == 0: ext = filename_tmpl.split('.')[-1] end = len([name for name in scandir(frame_dir, ext)]) first_file = osp.join(frame_dir, filename_tmpl.format(start)) check_file_exist(first_file, 'The start frame not found: ' + first_file) img = cv2.imread(first_file) height, width = img.shape[:2] resolution = (width, height) vwriter = cv2.VideoWriter(video_file, VideoWriter_fourcc(*fourcc), fps, resolution) def write_frame(file_idx): filename = osp.join(frame_dir, filename_tmpl.format(file_idx)) img = cv2.imread(filename) vwriter.write(img) if show_progress: track_progress(write_frame, range(start, end)) else: for i in range(start, end): write_frame(i) vwriter.release() ================================================ FILE: annotator/mmpkg/mmcv/video/optflow.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import cv2 import numpy as np from annotator.mmpkg.mmcv.arraymisc import dequantize, quantize from annotator.mmpkg.mmcv.image import imread, imwrite from annotator.mmpkg.mmcv.utils import is_str def flowread(flow_or_path, quantize=False, concat_axis=0, *args, **kwargs): """Read an optical flow map. Args: flow_or_path (ndarray or str): A flow map or filepath. quantize (bool): whether to read quantized pair, if set to True, remaining args will be passed to :func:`dequantize_flow`. concat_axis (int): The axis that dx and dy are concatenated, can be either 0 or 1. Ignored if quantize is False. Returns: ndarray: Optical flow represented as a (h, w, 2) numpy array """ if isinstance(flow_or_path, np.ndarray): if (flow_or_path.ndim != 3) or (flow_or_path.shape[-1] != 2): raise ValueError(f'Invalid flow with shape {flow_or_path.shape}') return flow_or_path elif not is_str(flow_or_path): raise TypeError(f'"flow_or_path" must be a filename or numpy array, ' f'not {type(flow_or_path)}') if not quantize: with open(flow_or_path, 'rb') as f: try: header = f.read(4).decode('utf-8') except Exception: raise IOError(f'Invalid flow file: {flow_or_path}') else: if header != 'PIEH': raise IOError(f'Invalid flow file: {flow_or_path}, ' 'header does not contain PIEH') w = np.fromfile(f, np.int32, 1).squeeze() h = np.fromfile(f, np.int32, 1).squeeze() flow = np.fromfile(f, np.float32, w * h * 2).reshape((h, w, 2)) else: assert concat_axis in [0, 1] cat_flow = imread(flow_or_path, flag='unchanged') if cat_flow.ndim != 2: raise IOError( f'{flow_or_path} is not a valid quantized flow file, ' f'its dimension is {cat_flow.ndim}.') assert cat_flow.shape[concat_axis] % 2 == 0 dx, dy = np.split(cat_flow, 2, axis=concat_axis) flow = dequantize_flow(dx, dy, *args, **kwargs) return flow.astype(np.float32) def flowwrite(flow, filename, quantize=False, concat_axis=0, *args, **kwargs): """Write optical flow to file. If the flow is not quantized, it will be saved as a .flo file losslessly, otherwise a jpeg image which is lossy but of much smaller size. (dx and dy will be concatenated horizontally into a single image if quantize is True.) Args: flow (ndarray): (h, w, 2) array of optical flow. filename (str): Output filepath. quantize (bool): Whether to quantize the flow and save it to 2 jpeg images. If set to True, remaining args will be passed to :func:`quantize_flow`. concat_axis (int): The axis that dx and dy are concatenated, can be either 0 or 1. Ignored if quantize is False. """ if not quantize: with open(filename, 'wb') as f: f.write('PIEH'.encode('utf-8')) np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f) flow = flow.astype(np.float32) flow.tofile(f) f.flush() else: assert concat_axis in [0, 1] dx, dy = quantize_flow(flow, *args, **kwargs) dxdy = np.concatenate((dx, dy), axis=concat_axis) imwrite(dxdy, filename) def quantize_flow(flow, max_val=0.02, norm=True): """Quantize flow to [0, 255]. After this step, the size of flow will be much smaller, and can be dumped as jpeg images. Args: flow (ndarray): (h, w, 2) array of optical flow. max_val (float): Maximum value of flow, values beyond [-max_val, max_val] will be truncated. norm (bool): Whether to divide flow values by image width/height. Returns: tuple[ndarray]: Quantized dx and dy. """ h, w, _ = flow.shape dx = flow[..., 0] dy = flow[..., 1] if norm: dx = dx / w # avoid inplace operations dy = dy / h # use 255 levels instead of 256 to make sure 0 is 0 after dequantization. flow_comps = [ quantize(d, -max_val, max_val, 255, np.uint8) for d in [dx, dy] ] return tuple(flow_comps) def dequantize_flow(dx, dy, max_val=0.02, denorm=True): """Recover from quantized flow. Args: dx (ndarray): Quantized dx. dy (ndarray): Quantized dy. max_val (float): Maximum value used when quantizing. denorm (bool): Whether to multiply flow values with width/height. Returns: ndarray: Dequantized flow. """ assert dx.shape == dy.shape assert dx.ndim == 2 or (dx.ndim == 3 and dx.shape[-1] == 1) dx, dy = [dequantize(d, -max_val, max_val, 255) for d in [dx, dy]] if denorm: dx *= dx.shape[1] dy *= dx.shape[0] flow = np.dstack((dx, dy)) return flow def flow_warp(img, flow, filling_value=0, interpolate_mode='nearest'): """Use flow to warp img. Args: img (ndarray, float or uint8): Image to be warped. flow (ndarray, float): Optical Flow. filling_value (int): The missing pixels will be set with filling_value. interpolate_mode (str): bilinear -> Bilinear Interpolation; nearest -> Nearest Neighbor. Returns: ndarray: Warped image with the same shape of img """ warnings.warn('This function is just for prototyping and cannot ' 'guarantee the computational efficiency.') assert flow.ndim == 3, 'Flow must be in 3D arrays.' height = flow.shape[0] width = flow.shape[1] channels = img.shape[2] output = np.ones( (height, width, channels), dtype=img.dtype) * filling_value grid = np.indices((height, width)).swapaxes(0, 1).swapaxes(1, 2) dx = grid[:, :, 0] + flow[:, :, 1] dy = grid[:, :, 1] + flow[:, :, 0] sx = np.floor(dx).astype(int) sy = np.floor(dy).astype(int) valid = (sx >= 0) & (sx < height - 1) & (sy >= 0) & (sy < width - 1) if interpolate_mode == 'nearest': output[valid, :] = img[dx[valid].round().astype(int), dy[valid].round().astype(int), :] elif interpolate_mode == 'bilinear': # dirty walkround for integer positions eps_ = 1e-6 dx, dy = dx + eps_, dy + eps_ left_top_ = img[np.floor(dx[valid]).astype(int), np.floor(dy[valid]).astype(int), :] * ( np.ceil(dx[valid]) - dx[valid])[:, None] * ( np.ceil(dy[valid]) - dy[valid])[:, None] left_down_ = img[np.ceil(dx[valid]).astype(int), np.floor(dy[valid]).astype(int), :] * ( dx[valid] - np.floor(dx[valid]))[:, None] * ( np.ceil(dy[valid]) - dy[valid])[:, None] right_top_ = img[np.floor(dx[valid]).astype(int), np.ceil(dy[valid]).astype(int), :] * ( np.ceil(dx[valid]) - dx[valid])[:, None] * ( dy[valid] - np.floor(dy[valid]))[:, None] right_down_ = img[np.ceil(dx[valid]).astype(int), np.ceil(dy[valid]).astype(int), :] * ( dx[valid] - np.floor(dx[valid]))[:, None] * ( dy[valid] - np.floor(dy[valid]))[:, None] output[valid, :] = left_top_ + left_down_ + right_top_ + right_down_ else: raise NotImplementedError( 'We only support interpolation modes of nearest and bilinear, ' f'but got {interpolate_mode}.') return output.astype(img.dtype) def flow_from_bytes(content): """Read dense optical flow from bytes. .. note:: This load optical flow function works for FlyingChairs, FlyingThings3D, Sintel, FlyingChairsOcc datasets, but cannot load the data from ChairsSDHom. Args: content (bytes): Optical flow bytes got from files or other streams. Returns: ndarray: Loaded optical flow with the shape (H, W, 2). """ # header in first 4 bytes header = content[:4] if header.decode('utf-8') != 'PIEH': raise Exception('Flow file header does not contain PIEH') # width in second 4 bytes width = np.frombuffer(content[4:], np.int32, 1).squeeze() # height in third 4 bytes height = np.frombuffer(content[8:], np.int32, 1).squeeze() # after first 12 bytes, all bytes are flow flow = np.frombuffer(content[12:], np.float32, width * height * 2).reshape( (height, width, 2)) return flow def sparse_flow_from_bytes(content): """Read the optical flow in KITTI datasets from bytes. This function is modified from RAFT load the `KITTI datasets `_. Args: content (bytes): Optical flow bytes got from files or other streams. Returns: Tuple(ndarray, ndarray): Loaded optical flow with the shape (H, W, 2) and flow valid mask with the shape (H, W). """ # nopa content = np.frombuffer(content, np.uint8) flow = cv2.imdecode(content, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR) flow = flow[:, :, ::-1].astype(np.float32) # flow shape (H, W, 2) valid shape (H, W) flow, valid = flow[:, :, :2], flow[:, :, 2] flow = (flow - 2**15) / 64.0 return flow, valid ================================================ FILE: annotator/mmpkg/mmcv/video/processing.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os import os.path as osp import subprocess import tempfile from annotator.mmpkg.mmcv.utils import requires_executable @requires_executable('ffmpeg') def convert_video(in_file, out_file, print_cmd=False, pre_options='', **kwargs): """Convert a video with ffmpeg. This provides a general api to ffmpeg, the executed command is:: `ffmpeg -y -i ` Options(kwargs) are mapped to ffmpeg commands with the following rules: - key=val: "-key val" - key=True: "-key" - key=False: "" Args: in_file (str): Input video filename. out_file (str): Output video filename. pre_options (str): Options appears before "-i ". print_cmd (bool): Whether to print the final ffmpeg command. """ options = [] for k, v in kwargs.items(): if isinstance(v, bool): if v: options.append(f'-{k}') elif k == 'log_level': assert v in [ 'quiet', 'panic', 'fatal', 'error', 'warning', 'info', 'verbose', 'debug', 'trace' ] options.append(f'-loglevel {v}') else: options.append(f'-{k} {v}') cmd = f'ffmpeg -y {pre_options} -i {in_file} {" ".join(options)} ' \ f'{out_file}' if print_cmd: print(cmd) subprocess.call(cmd, shell=True) @requires_executable('ffmpeg') def resize_video(in_file, out_file, size=None, ratio=None, keep_ar=False, log_level='info', print_cmd=False): """Resize a video. Args: in_file (str): Input video filename. out_file (str): Output video filename. size (tuple): Expected size (w, h), eg, (320, 240) or (320, -1). ratio (tuple or float): Expected resize ratio, (2, 0.5) means (w*2, h*0.5). keep_ar (bool): Whether to keep original aspect ratio. log_level (str): Logging level of ffmpeg. print_cmd (bool): Whether to print the final ffmpeg command. """ if size is None and ratio is None: raise ValueError('expected size or ratio must be specified') if size is not None and ratio is not None: raise ValueError('size and ratio cannot be specified at the same time') options = {'log_level': log_level} if size: if not keep_ar: options['vf'] = f'scale={size[0]}:{size[1]}' else: options['vf'] = f'scale=w={size[0]}:h={size[1]}:' \ 'force_original_aspect_ratio=decrease' else: if not isinstance(ratio, tuple): ratio = (ratio, ratio) options['vf'] = f'scale="trunc(iw*{ratio[0]}):trunc(ih*{ratio[1]})"' convert_video(in_file, out_file, print_cmd, **options) @requires_executable('ffmpeg') def cut_video(in_file, out_file, start=None, end=None, vcodec=None, acodec=None, log_level='info', print_cmd=False): """Cut a clip from a video. Args: in_file (str): Input video filename. out_file (str): Output video filename. start (None or float): Start time (in seconds). end (None or float): End time (in seconds). vcodec (None or str): Output video codec, None for unchanged. acodec (None or str): Output audio codec, None for unchanged. log_level (str): Logging level of ffmpeg. print_cmd (bool): Whether to print the final ffmpeg command. """ options = {'log_level': log_level} if vcodec is None: options['vcodec'] = 'copy' if acodec is None: options['acodec'] = 'copy' if start: options['ss'] = start else: start = 0 if end: options['t'] = end - start convert_video(in_file, out_file, print_cmd, **options) @requires_executable('ffmpeg') def concat_video(video_list, out_file, vcodec=None, acodec=None, log_level='info', print_cmd=False): """Concatenate multiple videos into a single one. Args: video_list (list): A list of video filenames out_file (str): Output video filename vcodec (None or str): Output video codec, None for unchanged acodec (None or str): Output audio codec, None for unchanged log_level (str): Logging level of ffmpeg. print_cmd (bool): Whether to print the final ffmpeg command. """ tmp_filehandler, tmp_filename = tempfile.mkstemp(suffix='.txt', text=True) with open(tmp_filename, 'w') as f: for filename in video_list: f.write(f'file {osp.abspath(filename)}\n') options = {'log_level': log_level} if vcodec is None: options['vcodec'] = 'copy' if acodec is None: options['acodec'] = 'copy' convert_video( tmp_filename, out_file, print_cmd, pre_options='-f concat -safe 0', **options) os.close(tmp_filehandler) os.remove(tmp_filename) ================================================ FILE: annotator/mmpkg/mmcv/visualization/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .color import Color, color_val from .image import imshow, imshow_bboxes, imshow_det_bboxes from .optflow import flow2rgb, flowshow, make_color_wheel __all__ = [ 'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes', 'flowshow', 'flow2rgb', 'make_color_wheel' ] ================================================ FILE: annotator/mmpkg/mmcv/visualization/color.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from enum import Enum import numpy as np from annotator.mmpkg.mmcv.utils import is_str class Color(Enum): """An enum that defines common colors. Contains red, green, blue, cyan, yellow, magenta, white and black. """ red = (0, 0, 255) green = (0, 255, 0) blue = (255, 0, 0) cyan = (255, 255, 0) yellow = (0, 255, 255) magenta = (255, 0, 255) white = (255, 255, 255) black = (0, 0, 0) def color_val(color): """Convert various input to color tuples. Args: color (:obj:`Color`/str/tuple/int/ndarray): Color inputs Returns: tuple[int]: A tuple of 3 integers indicating BGR channels. """ if is_str(color): return Color[color].value elif isinstance(color, Color): return color.value elif isinstance(color, tuple): assert len(color) == 3 for channel in color: assert 0 <= channel <= 255 return color elif isinstance(color, int): assert 0 <= color <= 255 return color, color, color elif isinstance(color, np.ndarray): assert color.ndim == 1 and color.size == 3 assert np.all((color >= 0) & (color <= 255)) color = color.astype(np.uint8) return tuple(color) else: raise TypeError(f'Invalid type for color: {type(color)}') ================================================ FILE: annotator/mmpkg/mmcv/visualization/image.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import cv2 import numpy as np from annotator.mmpkg.mmcv.image import imread, imwrite from .color import color_val def imshow(img, win_name='', wait_time=0): """Show an image. Args: img (str or ndarray): The image to be displayed. win_name (str): The window name. wait_time (int): Value of waitKey param. """ cv2.imshow(win_name, imread(img)) if wait_time == 0: # prevent from hanging if windows was closed while True: ret = cv2.waitKey(1) closed = cv2.getWindowProperty(win_name, cv2.WND_PROP_VISIBLE) < 1 # if user closed window or if some key pressed if closed or ret != -1: break else: ret = cv2.waitKey(wait_time) def imshow_bboxes(img, bboxes, colors='green', top_k=-1, thickness=1, show=True, win_name='', wait_time=0, out_file=None): """Draw bboxes on an image. Args: img (str or ndarray): The image to be displayed. bboxes (list or ndarray): A list of ndarray of shape (k, 4). colors (list[str or tuple or Color]): A list of colors. top_k (int): Plot the first k bboxes only if set positive. thickness (int): Thickness of lines. show (bool): Whether to show the image. win_name (str): The window name. wait_time (int): Value of waitKey param. out_file (str, optional): The filename to write the image. Returns: ndarray: The image with bboxes drawn on it. """ img = imread(img) img = np.ascontiguousarray(img) if isinstance(bboxes, np.ndarray): bboxes = [bboxes] if not isinstance(colors, list): colors = [colors for _ in range(len(bboxes))] colors = [color_val(c) for c in colors] assert len(bboxes) == len(colors) for i, _bboxes in enumerate(bboxes): _bboxes = _bboxes.astype(np.int32) if top_k <= 0: _top_k = _bboxes.shape[0] else: _top_k = min(top_k, _bboxes.shape[0]) for j in range(_top_k): left_top = (_bboxes[j, 0], _bboxes[j, 1]) right_bottom = (_bboxes[j, 2], _bboxes[j, 3]) cv2.rectangle( img, left_top, right_bottom, colors[i], thickness=thickness) if show: imshow(img, win_name, wait_time) if out_file is not None: imwrite(img, out_file) return img def imshow_det_bboxes(img, bboxes, labels, class_names=None, score_thr=0, bbox_color='green', text_color='green', thickness=1, font_scale=0.5, show=True, win_name='', wait_time=0, out_file=None): """Draw bboxes and class labels (with scores) on an image. Args: img (str or ndarray): The image to be displayed. bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or (n, 5). labels (ndarray): Labels of bboxes. class_names (list[str]): Names of each classes. score_thr (float): Minimum score of bboxes to be shown. bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. text_color (str or tuple or :obj:`Color`): Color of texts. thickness (int): Thickness of lines. font_scale (float): Font scales of texts. show (bool): Whether to show the image. win_name (str): The window name. wait_time (int): Value of waitKey param. out_file (str or None): The filename to write the image. Returns: ndarray: The image with bboxes drawn on it. """ assert bboxes.ndim == 2 assert labels.ndim == 1 assert bboxes.shape[0] == labels.shape[0] assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5 img = imread(img) img = np.ascontiguousarray(img) if score_thr > 0: assert bboxes.shape[1] == 5 scores = bboxes[:, -1] inds = scores > score_thr bboxes = bboxes[inds, :] labels = labels[inds] bbox_color = color_val(bbox_color) text_color = color_val(text_color) for bbox, label in zip(bboxes, labels): bbox_int = bbox.astype(np.int32) left_top = (bbox_int[0], bbox_int[1]) right_bottom = (bbox_int[2], bbox_int[3]) cv2.rectangle( img, left_top, right_bottom, bbox_color, thickness=thickness) label_text = class_names[ label] if class_names is not None else f'cls {label}' if len(bbox) > 4: label_text += f'|{bbox[-1]:.02f}' cv2.putText(img, label_text, (bbox_int[0], bbox_int[1] - 2), cv2.FONT_HERSHEY_COMPLEX, font_scale, text_color) if show: imshow(img, win_name, wait_time) if out_file is not None: imwrite(img, out_file) return img ================================================ FILE: annotator/mmpkg/mmcv/visualization/optflow.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from __future__ import division import numpy as np from annotator.mmpkg.mmcv.image import rgb2bgr from annotator.mmpkg.mmcv.video import flowread from .image import imshow def flowshow(flow, win_name='', wait_time=0): """Show optical flow. Args: flow (ndarray or str): The optical flow to be displayed. win_name (str): The window name. wait_time (int): Value of waitKey param. """ flow = flowread(flow) flow_img = flow2rgb(flow) imshow(rgb2bgr(flow_img), win_name, wait_time) def flow2rgb(flow, color_wheel=None, unknown_thr=1e6): """Convert flow map to RGB image. Args: flow (ndarray): Array of optical flow. color_wheel (ndarray or None): Color wheel used to map flow field to RGB colorspace. Default color wheel will be used if not specified. unknown_thr (str): Values above this threshold will be marked as unknown and thus ignored. Returns: ndarray: RGB image that can be visualized. """ assert flow.ndim == 3 and flow.shape[-1] == 2 if color_wheel is None: color_wheel = make_color_wheel() assert color_wheel.ndim == 2 and color_wheel.shape[1] == 3 num_bins = color_wheel.shape[0] dx = flow[:, :, 0].copy() dy = flow[:, :, 1].copy() ignore_inds = ( np.isnan(dx) | np.isnan(dy) | (np.abs(dx) > unknown_thr) | (np.abs(dy) > unknown_thr)) dx[ignore_inds] = 0 dy[ignore_inds] = 0 rad = np.sqrt(dx**2 + dy**2) if np.any(rad > np.finfo(float).eps): max_rad = np.max(rad) dx /= max_rad dy /= max_rad rad = np.sqrt(dx**2 + dy**2) angle = np.arctan2(-dy, -dx) / np.pi bin_real = (angle + 1) / 2 * (num_bins - 1) bin_left = np.floor(bin_real).astype(int) bin_right = (bin_left + 1) % num_bins w = (bin_real - bin_left.astype(np.float32))[..., None] flow_img = (1 - w) * color_wheel[bin_left, :] + w * color_wheel[bin_right, :] small_ind = rad <= 1 flow_img[small_ind] = 1 - rad[small_ind, None] * (1 - flow_img[small_ind]) flow_img[np.logical_not(small_ind)] *= 0.75 flow_img[ignore_inds, :] = 0 return flow_img def make_color_wheel(bins=None): """Build a color wheel. Args: bins(list or tuple, optional): Specify the number of bins for each color range, corresponding to six ranges: red -> yellow, yellow -> green, green -> cyan, cyan -> blue, blue -> magenta, magenta -> red. [15, 6, 4, 11, 13, 6] is used for default (see Middlebury). Returns: ndarray: Color wheel of shape (total_bins, 3). """ if bins is None: bins = [15, 6, 4, 11, 13, 6] assert len(bins) == 6 RY, YG, GC, CB, BM, MR = tuple(bins) ry = [1, np.arange(RY) / RY, 0] yg = [1 - np.arange(YG) / YG, 1, 0] gc = [0, 1, np.arange(GC) / GC] cb = [0, 1 - np.arange(CB) / CB, 1] bm = [np.arange(BM) / BM, 0, 1] mr = [1, 0, 1 - np.arange(MR) / MR] num_bins = RY + YG + GC + CB + BM + MR color_wheel = np.zeros((3, num_bins), dtype=np.float32) col = 0 for i, color in enumerate([ry, yg, gc, cb, bm, mr]): for j in range(3): color_wheel[j, col:col + bins[i]] = color[j] col += bins[i] return color_wheel.T ================================================ FILE: annotator/mmpkg/mmseg/apis/__init__.py ================================================ from .inference import inference_segmentor, init_segmentor, show_result_pyplot from .test import multi_gpu_test, single_gpu_test from .train import get_root_logger, set_random_seed, train_segmentor __all__ = [ 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', 'show_result_pyplot' ] ================================================ FILE: annotator/mmpkg/mmseg/apis/inference.py ================================================ import matplotlib.pyplot as plt import annotator.mmpkg.mmcv as mmcv import torch from annotator.mmpkg.mmcv.parallel import collate, scatter from annotator.mmpkg.mmcv.runner import load_checkpoint from annotator.mmpkg.mmseg.datasets.pipelines import Compose from annotator.mmpkg.mmseg.models import build_segmentor from modules import devices def init_segmentor(config, checkpoint=None, device=devices.get_device_for("controlnet")): """Initialize a segmentor from config file. Args: config (str or :obj:`mmcv.Config`): Config file path or the config object. checkpoint (str, optional): Checkpoint path. If left as None, the model will not load any weights. device (str, optional) CPU/CUDA device option. Default 'cuda:0'. Use 'cpu' for loading model on CPU. Returns: nn.Module: The constructed segmentor. """ if isinstance(config, str): config = mmcv.Config.fromfile(config) elif not isinstance(config, mmcv.Config): raise TypeError('config must be a filename or Config object, ' 'but got {}'.format(type(config))) config.model.pretrained = None config.model.train_cfg = None model = build_segmentor(config.model, test_cfg=config.get('test_cfg')) if checkpoint is not None: checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') model.CLASSES = checkpoint['meta']['CLASSES'] model.PALETTE = checkpoint['meta']['PALETTE'] model.cfg = config # save the config in the model for convenience model.to(device) model.eval() return model class LoadImage: """A simple pipeline to load image.""" def __call__(self, results): """Call function to load images into results. Args: results (dict): A result dict contains the file name of the image to be read. Returns: dict: ``results`` will be returned containing loaded image. """ if isinstance(results['img'], str): results['filename'] = results['img'] results['ori_filename'] = results['img'] else: results['filename'] = None results['ori_filename'] = None img = mmcv.imread(results['img']) results['img'] = img results['img_shape'] = img.shape results['ori_shape'] = img.shape return results def inference_segmentor(model, img): """Inference image(s) with the segmentor. Args: model (nn.Module): The loaded segmentor. imgs (str/ndarray or list[str/ndarray]): Either image files or loaded images. Returns: (list[Tensor]): The segmentation result. """ cfg = model.cfg device = next(model.parameters()).device # model device # build the data pipeline test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:] test_pipeline = Compose(test_pipeline) # prepare data data = dict(img=img) data = test_pipeline(data) data = collate([data], samples_per_gpu=1) if next(model.parameters()).is_cuda: # scatter to specified GPU data = scatter(data, [device])[0] else: data['img'][0] = data['img'][0].to(devices.get_device_for("controlnet")) data['img_metas'] = [i.data[0] for i in data['img_metas']] # forward the model with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) return result def show_result_pyplot(model, img, result, palette=None, fig_size=(15, 10), opacity=0.5, title='', block=True): """Visualize the segmentation results on the image. Args: model (nn.Module): The loaded segmentor. img (str or np.ndarray): Image filename or loaded image. result (list): The segmentation result. palette (list[list[int]]] | None): The palette of segmentation map. If None is given, random palette will be generated. Default: None fig_size (tuple): Figure size of the pyplot figure. opacity(float): Opacity of painted segmentation map. Default 0.5. Must be in (0, 1] range. title (str): The title of pyplot figure. Default is ''. block (bool): Whether to block the pyplot figure. Default is True. """ if hasattr(model, 'module'): model = model.module img = model.show_result( img, result, palette=palette, show=False, opacity=opacity) # plt.figure(figsize=fig_size) # plt.imshow(mmcv.bgr2rgb(img)) # plt.title(title) # plt.tight_layout() # plt.show(block=block) return mmcv.bgr2rgb(img) ================================================ FILE: annotator/mmpkg/mmseg/apis/test.py ================================================ import os.path as osp import pickle import shutil import tempfile import annotator.mmpkg.mmcv as mmcv import numpy as np import torch import torch.distributed as dist from annotator.mmpkg.mmcv.image import tensor2imgs from annotator.mmpkg.mmcv.runner import get_dist_info def np2tmp(array, temp_file_name=None): """Save ndarray to local numpy file. Args: array (ndarray): Ndarray to save. temp_file_name (str): Numpy file name. If 'temp_file_name=None', this function will generate a file name with tempfile.NamedTemporaryFile to save ndarray. Default: None. Returns: str: The numpy file name. """ if temp_file_name is None: temp_file_name = tempfile.NamedTemporaryFile( suffix='.npy', delete=False).name np.save(temp_file_name, array) return temp_file_name def single_gpu_test(model, data_loader, show=False, out_dir=None, efficient_test=False, opacity=0.5): """Test with single GPU. Args: model (nn.Module): Model to be tested. data_loader (utils.data.Dataloader): Pytorch data loader. show (bool): Whether show results during inference. Default: False. out_dir (str, optional): If specified, the results will be dumped into the directory to save output results. efficient_test (bool): Whether save the results as local numpy files to save CPU memory during evaluation. Default: False. opacity(float): Opacity of painted segmentation map. Default 0.5. Must be in (0, 1] range. Returns: list: The prediction results. """ model.eval() results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, **data) if show or out_dir: img_tensor = data['img'][0] img_metas = data['img_metas'][0].data[0] imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) assert len(imgs) == len(img_metas) for img, img_meta in zip(imgs, img_metas): h, w, _ = img_meta['img_shape'] img_show = img[:h, :w, :] ori_h, ori_w = img_meta['ori_shape'][:-1] img_show = mmcv.imresize(img_show, (ori_w, ori_h)) if out_dir: out_file = osp.join(out_dir, img_meta['ori_filename']) else: out_file = None model.module.show_result( img_show, result, palette=dataset.PALETTE, show=show, out_file=out_file, opacity=opacity) if isinstance(result, list): if efficient_test: result = [np2tmp(_) for _ in result] results.extend(result) else: if efficient_test: result = np2tmp(result) results.append(result) batch_size = len(result) for _ in range(batch_size): prog_bar.update() return results def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False, efficient_test=False): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (utils.data.Dataloader): Pytorch data loader. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. efficient_test (bool): Whether save the results as local numpy files to save CPU memory during evaluation. Default: False. Returns: list: The prediction results. """ model.eval() results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) if isinstance(result, list): if efficient_test: result = [np2tmp(_) for _ in result] results.extend(result) else: if efficient_test: result = np2tmp(result) results.append(result) if rank == 0: batch_size = data['img'][0].size(0) for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks if gpu_collect: results = collect_results_gpu(results, len(dataset)) else: results = collect_results_cpu(results, len(dataset), tmpdir) return results def collect_results_cpu(result_part, size, tmpdir=None): """Collect results with CPU.""" rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: tmpdir = tempfile.mkdtemp() tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i)) part_list.append(mmcv.load(part_file)) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def collect_results_gpu(result_part, size): """Collect results with GPU.""" rank, world_size = get_dist_info() # dump result part to tensor with pickle part_tensor = torch.tensor( bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') # gather all result part tensor shape shape_tensor = torch.tensor(part_tensor.shape, device='cuda') shape_list = [shape_tensor.clone() for _ in range(world_size)] dist.all_gather(shape_list, shape_tensor) # padding result part tensor to max length shape_max = torch.tensor(shape_list).max() part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') part_send[:shape_tensor[0]] = part_tensor part_recv_list = [ part_tensor.new_zeros(shape_max) for _ in range(world_size) ] # gather all result part dist.all_gather(part_recv_list, part_send) if rank == 0: part_list = [] for recv, shape in zip(part_recv_list, shape_list): part_list.append( pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results ================================================ FILE: annotator/mmpkg/mmseg/apis/train.py ================================================ import random import warnings import numpy as np import torch from annotator.mmpkg.mmcv.parallel import MMDataParallel, MMDistributedDataParallel from annotator.mmpkg.mmcv.runner import build_optimizer, build_runner from annotator.mmpkg.mmseg.core import DistEvalHook, EvalHook from annotator.mmpkg.mmseg.datasets import build_dataloader, build_dataset from annotator.mmpkg.mmseg.utils import get_root_logger def set_random_seed(seed, deterministic=False): """Set random seed. Args: seed (int): Seed to be used. deterministic (bool): Whether to set the deterministic option for CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` to True and `torch.backends.cudnn.benchmark` to False. Default: False. """ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) if deterministic: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False def train_segmentor(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Launch segmentor training.""" logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, drop_last=True) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) if cfg.get('runner') is None: cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) runner = build_runner( cfg.runner, default_args=dict( model=model, batch_processor=None, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # register hooks runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority='LOW') if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) ================================================ FILE: annotator/mmpkg/mmseg/core/__init__.py ================================================ from .evaluation import * # noqa: F401, F403 from .seg import * # noqa: F401, F403 from .utils import * # noqa: F401, F403 ================================================ FILE: annotator/mmpkg/mmseg/core/evaluation/__init__.py ================================================ from .class_names import get_classes, get_palette from .eval_hooks import DistEvalHook, EvalHook from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou __all__ = [ 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', 'eval_metrics', 'get_classes', 'get_palette' ] ================================================ FILE: annotator/mmpkg/mmseg/core/evaluation/class_names.py ================================================ import annotator.mmpkg.mmcv as mmcv def cityscapes_classes(): """Cityscapes class names for external use.""" return [ 'road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle' ] def ade_classes(): """ADE20K class names for external use.""" return [ 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ', 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth', 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car', 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path', 'stairs', 'runway', 'case', 'pool table', 'pillow', 'screen door', 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table', 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove', 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', 'chandelier', 'awning', 'streetlight', 'booth', 'television receiver', 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister', 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van', 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything', 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent', 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank', 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce', 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen', 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', 'clock', 'flag' ] def voc_classes(): """Pascal VOC class names for external use.""" return [ 'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' ] def cityscapes_palette(): """Cityscapes palette for external use.""" return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100], [0, 0, 230], [119, 11, 32]] def ade_palette(): """ADE20K palette for external use.""" return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], [102, 255, 0], [92, 0, 255]] def voc_palette(): """Pascal VOC palette for external use.""" return [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] dataset_aliases = { 'cityscapes': ['cityscapes'], 'ade': ['ade', 'ade20k'], 'voc': ['voc', 'pascal_voc', 'voc12', 'voc12aug'] } def get_classes(dataset): """Get class names of a dataset.""" alias2name = {} for name, aliases in dataset_aliases.items(): for alias in aliases: alias2name[alias] = name if mmcv.is_str(dataset): if dataset in alias2name: labels = eval(alias2name[dataset] + '_classes()') else: raise ValueError(f'Unrecognized dataset: {dataset}') else: raise TypeError(f'dataset must a str, but got {type(dataset)}') return labels def get_palette(dataset): """Get class palette (RGB) of a dataset.""" alias2name = {} for name, aliases in dataset_aliases.items(): for alias in aliases: alias2name[alias] = name if mmcv.is_str(dataset): if dataset in alias2name: labels = eval(alias2name[dataset] + '_palette()') else: raise ValueError(f'Unrecognized dataset: {dataset}') else: raise TypeError(f'dataset must a str, but got {type(dataset)}') return labels ================================================ FILE: annotator/mmpkg/mmseg/core/evaluation/eval_hooks.py ================================================ import os.path as osp from annotator.mmpkg.mmcv.runner import DistEvalHook as _DistEvalHook from annotator.mmpkg.mmcv.runner import EvalHook as _EvalHook class EvalHook(_EvalHook): """Single GPU EvalHook, with efficient test support. Args: by_epoch (bool): Determine perform evaluation by epoch or by iteration. If set to True, it will perform by epoch. Otherwise, by iteration. Default: False. efficient_test (bool): Whether save the results as local numpy files to save CPU memory during evaluation. Default: False. Returns: list: The prediction results. """ greater_keys = ['mIoU', 'mAcc', 'aAcc'] def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) self.efficient_test = efficient_test def after_train_iter(self, runner): """After train epoch hook. Override default ``single_gpu_test``. """ if self.by_epoch or not self.every_n_iters(runner, self.interval): return from annotator.mmpkg.mmseg.apis import single_gpu_test runner.log_buffer.clear() results = single_gpu_test( runner.model, self.dataloader, show=False, efficient_test=self.efficient_test) self.evaluate(runner, results) def after_train_epoch(self, runner): """After train epoch hook. Override default ``single_gpu_test``. """ if not self.by_epoch or not self.every_n_epochs(runner, self.interval): return from annotator.mmpkg.mmseg.apis import single_gpu_test runner.log_buffer.clear() results = single_gpu_test(runner.model, self.dataloader, show=False) self.evaluate(runner, results) class DistEvalHook(_DistEvalHook): """Distributed EvalHook, with efficient test support. Args: by_epoch (bool): Determine perform evaluation by epoch or by iteration. If set to True, it will perform by epoch. Otherwise, by iteration. Default: False. efficient_test (bool): Whether save the results as local numpy files to save CPU memory during evaluation. Default: False. Returns: list: The prediction results. """ greater_keys = ['mIoU', 'mAcc', 'aAcc'] def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): super().__init__(*args, by_epoch=by_epoch, **kwargs) self.efficient_test = efficient_test def after_train_iter(self, runner): """After train epoch hook. Override default ``multi_gpu_test``. """ if self.by_epoch or not self.every_n_iters(runner, self.interval): return from annotator.mmpkg.mmseg.apis import multi_gpu_test runner.log_buffer.clear() results = multi_gpu_test( runner.model, self.dataloader, tmpdir=osp.join(runner.work_dir, '.eval_hook'), gpu_collect=self.gpu_collect, efficient_test=self.efficient_test) if runner.rank == 0: print('\n') self.evaluate(runner, results) def after_train_epoch(self, runner): """After train epoch hook. Override default ``multi_gpu_test``. """ if not self.by_epoch or not self.every_n_epochs(runner, self.interval): return from annotator.mmpkg.mmseg.apis import multi_gpu_test runner.log_buffer.clear() results = multi_gpu_test( runner.model, self.dataloader, tmpdir=osp.join(runner.work_dir, '.eval_hook'), gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') self.evaluate(runner, results) ================================================ FILE: annotator/mmpkg/mmseg/core/evaluation/metrics.py ================================================ from collections import OrderedDict import annotator.mmpkg.mmcv as mmcv import numpy as np import torch def f_score(precision, recall, beta=1): """calcuate the f-score value. Args: precision (float | torch.Tensor): The precision value. recall (float | torch.Tensor): The recall value. beta (int): Determines the weight of recall in the combined score. Default: False. Returns: [torch.tensor]: The f-score value. """ score = (1 + beta**2) * (precision * recall) / ( (beta**2 * precision) + recall) return score def intersect_and_union(pred_label, label, num_classes, ignore_index, label_map=dict(), reduce_zero_label=False): """Calculate intersection and Union. Args: pred_label (ndarray | str): Prediction segmentation map or predict result filename. label (ndarray | str): Ground truth segmentation map or label filename. num_classes (int): Number of categories. ignore_index (int): Index that will be ignored in evaluation. label_map (dict): Mapping old labels to new labels. The parameter will work only when label is str. Default: dict(). reduce_zero_label (bool): Whether ignore zero label. The parameter will work only when label is str. Default: False. Returns: torch.Tensor: The intersection of prediction and ground truth histogram on all classes. torch.Tensor: The union of prediction and ground truth histogram on all classes. torch.Tensor: The prediction histogram on all classes. torch.Tensor: The ground truth histogram on all classes. """ if isinstance(pred_label, str): pred_label = torch.from_numpy(np.load(pred_label)) else: pred_label = torch.from_numpy((pred_label)) if isinstance(label, str): label = torch.from_numpy( mmcv.imread(label, flag='unchanged', backend='pillow')) else: label = torch.from_numpy(label) if label_map is not None: for old_id, new_id in label_map.items(): label[label == old_id] = new_id if reduce_zero_label: label[label == 0] = 255 label = label - 1 label[label == 254] = 255 mask = (label != ignore_index) pred_label = pred_label[mask] label = label[mask] intersect = pred_label[pred_label == label] area_intersect = torch.histc( intersect.float(), bins=(num_classes), min=0, max=num_classes - 1) area_pred_label = torch.histc( pred_label.float(), bins=(num_classes), min=0, max=num_classes - 1) area_label = torch.histc( label.float(), bins=(num_classes), min=0, max=num_classes - 1) area_union = area_pred_label + area_label - area_intersect return area_intersect, area_union, area_pred_label, area_label def total_intersect_and_union(results, gt_seg_maps, num_classes, ignore_index, label_map=dict(), reduce_zero_label=False): """Calculate Total Intersection and Union. Args: results (list[ndarray] | list[str]): List of prediction segmentation maps or list of prediction result filenames. gt_seg_maps (list[ndarray] | list[str]): list of ground truth segmentation maps or list of label filenames. num_classes (int): Number of categories. ignore_index (int): Index that will be ignored in evaluation. label_map (dict): Mapping old labels to new labels. Default: dict(). reduce_zero_label (bool): Whether ignore zero label. Default: False. Returns: ndarray: The intersection of prediction and ground truth histogram on all classes. ndarray: The union of prediction and ground truth histogram on all classes. ndarray: The prediction histogram on all classes. ndarray: The ground truth histogram on all classes. """ num_imgs = len(results) assert len(gt_seg_maps) == num_imgs total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) for i in range(num_imgs): area_intersect, area_union, area_pred_label, area_label = \ intersect_and_union( results[i], gt_seg_maps[i], num_classes, ignore_index, label_map, reduce_zero_label) total_area_intersect += area_intersect total_area_union += area_union total_area_pred_label += area_pred_label total_area_label += area_label return total_area_intersect, total_area_union, total_area_pred_label, \ total_area_label def mean_iou(results, gt_seg_maps, num_classes, ignore_index, nan_to_num=None, label_map=dict(), reduce_zero_label=False): """Calculate Mean Intersection and Union (mIoU) Args: results (list[ndarray] | list[str]): List of prediction segmentation maps or list of prediction result filenames. gt_seg_maps (list[ndarray] | list[str]): list of ground truth segmentation maps or list of label filenames. num_classes (int): Number of categories. ignore_index (int): Index that will be ignored in evaluation. nan_to_num (int, optional): If specified, NaN values will be replaced by the numbers defined by the user. Default: None. label_map (dict): Mapping old labels to new labels. Default: dict(). reduce_zero_label (bool): Whether ignore zero label. Default: False. Returns: dict[str, float | ndarray]: float: Overall accuracy on all images. ndarray: Per category accuracy, shape (num_classes, ). ndarray: Per category IoU, shape (num_classes, ). """ iou_result = eval_metrics( results=results, gt_seg_maps=gt_seg_maps, num_classes=num_classes, ignore_index=ignore_index, metrics=['mIoU'], nan_to_num=nan_to_num, label_map=label_map, reduce_zero_label=reduce_zero_label) return iou_result def mean_dice(results, gt_seg_maps, num_classes, ignore_index, nan_to_num=None, label_map=dict(), reduce_zero_label=False): """Calculate Mean Dice (mDice) Args: results (list[ndarray] | list[str]): List of prediction segmentation maps or list of prediction result filenames. gt_seg_maps (list[ndarray] | list[str]): list of ground truth segmentation maps or list of label filenames. num_classes (int): Number of categories. ignore_index (int): Index that will be ignored in evaluation. nan_to_num (int, optional): If specified, NaN values will be replaced by the numbers defined by the user. Default: None. label_map (dict): Mapping old labels to new labels. Default: dict(). reduce_zero_label (bool): Whether ignore zero label. Default: False. Returns: dict[str, float | ndarray]: Default metrics. float: Overall accuracy on all images. ndarray: Per category accuracy, shape (num_classes, ). ndarray: Per category dice, shape (num_classes, ). """ dice_result = eval_metrics( results=results, gt_seg_maps=gt_seg_maps, num_classes=num_classes, ignore_index=ignore_index, metrics=['mDice'], nan_to_num=nan_to_num, label_map=label_map, reduce_zero_label=reduce_zero_label) return dice_result def mean_fscore(results, gt_seg_maps, num_classes, ignore_index, nan_to_num=None, label_map=dict(), reduce_zero_label=False, beta=1): """Calculate Mean Intersection and Union (mIoU) Args: results (list[ndarray] | list[str]): List of prediction segmentation maps or list of prediction result filenames. gt_seg_maps (list[ndarray] | list[str]): list of ground truth segmentation maps or list of label filenames. num_classes (int): Number of categories. ignore_index (int): Index that will be ignored in evaluation. nan_to_num (int, optional): If specified, NaN values will be replaced by the numbers defined by the user. Default: None. label_map (dict): Mapping old labels to new labels. Default: dict(). reduce_zero_label (bool): Whether ignore zero label. Default: False. beta (int): Determines the weight of recall in the combined score. Default: False. Returns: dict[str, float | ndarray]: Default metrics. float: Overall accuracy on all images. ndarray: Per category recall, shape (num_classes, ). ndarray: Per category precision, shape (num_classes, ). ndarray: Per category f-score, shape (num_classes, ). """ fscore_result = eval_metrics( results=results, gt_seg_maps=gt_seg_maps, num_classes=num_classes, ignore_index=ignore_index, metrics=['mFscore'], nan_to_num=nan_to_num, label_map=label_map, reduce_zero_label=reduce_zero_label, beta=beta) return fscore_result def eval_metrics(results, gt_seg_maps, num_classes, ignore_index, metrics=['mIoU'], nan_to_num=None, label_map=dict(), reduce_zero_label=False, beta=1): """Calculate evaluation metrics Args: results (list[ndarray] | list[str]): List of prediction segmentation maps or list of prediction result filenames. gt_seg_maps (list[ndarray] | list[str]): list of ground truth segmentation maps or list of label filenames. num_classes (int): Number of categories. ignore_index (int): Index that will be ignored in evaluation. metrics (list[str] | str): Metrics to be evaluated, 'mIoU' and 'mDice'. nan_to_num (int, optional): If specified, NaN values will be replaced by the numbers defined by the user. Default: None. label_map (dict): Mapping old labels to new labels. Default: dict(). reduce_zero_label (bool): Whether ignore zero label. Default: False. Returns: float: Overall accuracy on all images. ndarray: Per category accuracy, shape (num_classes, ). ndarray: Per category evaluation metrics, shape (num_classes, ). """ if isinstance(metrics, str): metrics = [metrics] allowed_metrics = ['mIoU', 'mDice', 'mFscore'] if not set(metrics).issubset(set(allowed_metrics)): raise KeyError('metrics {} is not supported'.format(metrics)) total_area_intersect, total_area_union, total_area_pred_label, \ total_area_label = total_intersect_and_union( results, gt_seg_maps, num_classes, ignore_index, label_map, reduce_zero_label) all_acc = total_area_intersect.sum() / total_area_label.sum() ret_metrics = OrderedDict({'aAcc': all_acc}) for metric in metrics: if metric == 'mIoU': iou = total_area_intersect / total_area_union acc = total_area_intersect / total_area_label ret_metrics['IoU'] = iou ret_metrics['Acc'] = acc elif metric == 'mDice': dice = 2 * total_area_intersect / ( total_area_pred_label + total_area_label) acc = total_area_intersect / total_area_label ret_metrics['Dice'] = dice ret_metrics['Acc'] = acc elif metric == 'mFscore': precision = total_area_intersect / total_area_pred_label recall = total_area_intersect / total_area_label f_value = torch.tensor( [f_score(x[0], x[1], beta) for x in zip(precision, recall)]) ret_metrics['Fscore'] = f_value ret_metrics['Precision'] = precision ret_metrics['Recall'] = recall ret_metrics = { metric: value.numpy() for metric, value in ret_metrics.items() } if nan_to_num is not None: ret_metrics = OrderedDict({ metric: np.nan_to_num(metric_value, nan=nan_to_num) for metric, metric_value in ret_metrics.items() }) return ret_metrics ================================================ FILE: annotator/mmpkg/mmseg/core/seg/__init__.py ================================================ from .builder import build_pixel_sampler from .sampler import BasePixelSampler, OHEMPixelSampler __all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler'] ================================================ FILE: annotator/mmpkg/mmseg/core/seg/builder.py ================================================ from annotator.mmpkg.mmcv.utils import Registry, build_from_cfg PIXEL_SAMPLERS = Registry('pixel sampler') def build_pixel_sampler(cfg, **default_args): """Build pixel sampler for segmentation map.""" return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) ================================================ FILE: annotator/mmpkg/mmseg/core/seg/sampler/__init__.py ================================================ from .base_pixel_sampler import BasePixelSampler from .ohem_pixel_sampler import OHEMPixelSampler __all__ = ['BasePixelSampler', 'OHEMPixelSampler'] ================================================ FILE: annotator/mmpkg/mmseg/core/seg/sampler/base_pixel_sampler.py ================================================ from abc import ABCMeta, abstractmethod class BasePixelSampler(metaclass=ABCMeta): """Base class of pixel sampler.""" def __init__(self, **kwargs): pass @abstractmethod def sample(self, seg_logit, seg_label): """Placeholder for sample function.""" ================================================ FILE: annotator/mmpkg/mmseg/core/seg/sampler/ohem_pixel_sampler.py ================================================ import torch import torch.nn.functional as F from ..builder import PIXEL_SAMPLERS from .base_pixel_sampler import BasePixelSampler @PIXEL_SAMPLERS.register_module() class OHEMPixelSampler(BasePixelSampler): """Online Hard Example Mining Sampler for segmentation. Args: context (nn.Module): The context of sampler, subclass of :obj:`BaseDecodeHead`. thresh (float, optional): The threshold for hard example selection. Below which, are prediction with low confidence. If not specified, the hard examples will be pixels of top ``min_kept`` loss. Default: None. min_kept (int, optional): The minimum number of predictions to keep. Default: 100000. """ def __init__(self, context, thresh=None, min_kept=100000): super(OHEMPixelSampler, self).__init__() self.context = context assert min_kept > 1 self.thresh = thresh self.min_kept = min_kept def sample(self, seg_logit, seg_label): """Sample pixels that have high loss or with low prediction confidence. Args: seg_logit (torch.Tensor): segmentation logits, shape (N, C, H, W) seg_label (torch.Tensor): segmentation label, shape (N, 1, H, W) Returns: torch.Tensor: segmentation weight, shape (N, H, W) """ with torch.no_grad(): assert seg_logit.shape[2:] == seg_label.shape[2:] assert seg_label.shape[1] == 1 seg_label = seg_label.squeeze(1).long() batch_kept = self.min_kept * seg_label.size(0) valid_mask = seg_label != self.context.ignore_index seg_weight = seg_logit.new_zeros(size=seg_label.size()) valid_seg_weight = seg_weight[valid_mask] if self.thresh is not None: seg_prob = F.softmax(seg_logit, dim=1) tmp_seg_label = seg_label.clone().unsqueeze(1) tmp_seg_label[tmp_seg_label == self.context.ignore_index] = 0 seg_prob = seg_prob.gather(1, tmp_seg_label).squeeze(1) sort_prob, sort_indices = seg_prob[valid_mask].sort() if sort_prob.numel() > 0: min_threshold = sort_prob[min(batch_kept, sort_prob.numel() - 1)] else: min_threshold = 0.0 threshold = max(min_threshold, self.thresh) valid_seg_weight[seg_prob[valid_mask] < threshold] = 1. else: losses = self.context.loss_decode( seg_logit, seg_label, weight=None, ignore_index=self.context.ignore_index, reduction_override='none') # faster than topk according to https://github.com/pytorch/pytorch/issues/22812 # noqa _, sort_indices = losses[valid_mask].sort(descending=True) valid_seg_weight[sort_indices[:batch_kept]] = 1. seg_weight[valid_mask] = valid_seg_weight return seg_weight ================================================ FILE: annotator/mmpkg/mmseg/core/utils/__init__.py ================================================ from .misc import add_prefix __all__ = ['add_prefix'] ================================================ FILE: annotator/mmpkg/mmseg/core/utils/misc.py ================================================ def add_prefix(inputs, prefix): """Add prefix for dict. Args: inputs (dict): The input dict with str keys. prefix (str): The prefix to add. Returns: dict: The dict with keys updated with ``prefix``. """ outputs = dict() for name, value in inputs.items(): outputs[f'{prefix}.{name}'] = value return outputs ================================================ FILE: annotator/mmpkg/mmseg/datasets/__init__.py ================================================ from .ade import ADE20KDataset from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset from .chase_db1 import ChaseDB1Dataset from .cityscapes import CityscapesDataset from .custom import CustomDataset from .dataset_wrappers import ConcatDataset, RepeatDataset from .drive import DRIVEDataset from .hrf import HRFDataset from .pascal_context import PascalContextDataset, PascalContextDataset59 from .stare import STAREDataset from .voc import PascalVOCDataset __all__ = [ 'CustomDataset', 'build_dataloader', 'ConcatDataset', 'RepeatDataset', 'DATASETS', 'build_dataset', 'PIPELINES', 'CityscapesDataset', 'PascalVOCDataset', 'ADE20KDataset', 'PascalContextDataset', 'PascalContextDataset59', 'ChaseDB1Dataset', 'DRIVEDataset', 'HRFDataset', 'STAREDataset' ] ================================================ FILE: annotator/mmpkg/mmseg/datasets/ade.py ================================================ from .builder import DATASETS from .custom import CustomDataset @DATASETS.register_module() class ADE20KDataset(CustomDataset): """ADE20K dataset. In segmentation map annotation for ADE20K, 0 stands for background, which is not included in 150 categories. ``reduce_zero_label`` is fixed to True. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is fixed to '.png'. """ CLASSES = ( 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ', 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth', 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car', 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path', 'stairs', 'runway', 'case', 'pool table', 'pillow', 'screen door', 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table', 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove', 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', 'chandelier', 'awning', 'streetlight', 'booth', 'television receiver', 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister', 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van', 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything', 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent', 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank', 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce', 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen', 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', 'clock', 'flag') PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], [102, 255, 0], [92, 0, 255]] def __init__(self, **kwargs): super(ADE20KDataset, self).__init__( img_suffix='.jpg', seg_map_suffix='.png', reduce_zero_label=True, **kwargs) ================================================ FILE: annotator/mmpkg/mmseg/datasets/builder.py ================================================ import copy import platform import random from functools import partial import numpy as np from annotator.mmpkg.mmcv.parallel import collate from annotator.mmpkg.mmcv.runner import get_dist_info from annotator.mmpkg.mmcv.utils import Registry, build_from_cfg from annotator.mmpkg.mmcv.utils.parrots_wrapper import DataLoader, PoolDataLoader from torch.utils.data import DistributedSampler if platform.system() != 'Windows': # https://github.com/pytorch/pytorch/issues/973 import resource rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) hard_limit = rlimit[1] soft_limit = min(4096, hard_limit) resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) DATASETS = Registry('dataset') PIPELINES = Registry('pipeline') def _concat_dataset(cfg, default_args=None): """Build :obj:`ConcatDataset by.""" from .dataset_wrappers import ConcatDataset img_dir = cfg['img_dir'] ann_dir = cfg.get('ann_dir', None) split = cfg.get('split', None) num_img_dir = len(img_dir) if isinstance(img_dir, (list, tuple)) else 1 if ann_dir is not None: num_ann_dir = len(ann_dir) if isinstance(ann_dir, (list, tuple)) else 1 else: num_ann_dir = 0 if split is not None: num_split = len(split) if isinstance(split, (list, tuple)) else 1 else: num_split = 0 if num_img_dir > 1: assert num_img_dir == num_ann_dir or num_ann_dir == 0 assert num_img_dir == num_split or num_split == 0 else: assert num_split == num_ann_dir or num_ann_dir <= 1 num_dset = max(num_split, num_img_dir) datasets = [] for i in range(num_dset): data_cfg = copy.deepcopy(cfg) if isinstance(img_dir, (list, tuple)): data_cfg['img_dir'] = img_dir[i] if isinstance(ann_dir, (list, tuple)): data_cfg['ann_dir'] = ann_dir[i] if isinstance(split, (list, tuple)): data_cfg['split'] = split[i] datasets.append(build_dataset(data_cfg, default_args)) return ConcatDataset(datasets) def build_dataset(cfg, default_args=None): """Build datasets.""" from .dataset_wrappers import ConcatDataset, RepeatDataset if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) elif cfg['type'] == 'RepeatDataset': dataset = RepeatDataset( build_dataset(cfg['dataset'], default_args), cfg['times']) elif isinstance(cfg.get('img_dir'), (list, tuple)) or isinstance( cfg.get('split', None), (list, tuple)): dataset = _concat_dataset(cfg, default_args) else: dataset = build_from_cfg(cfg, DATASETS, default_args) return dataset def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, seed=None, drop_last=False, pin_memory=True, dataloader_type='PoolDataLoader', **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. seed (int | None): Seed to be used. Default: None. drop_last (bool): Whether to drop the last incomplete batch in epoch. Default: False pin_memory (bool): Whether to use pin_memory in DataLoader. Default: True dataloader_type (str): Type of dataloader. Default: 'PoolDataLoader' kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. """ rank, world_size = get_dist_info() if dist: sampler = DistributedSampler( dataset, world_size, rank, shuffle=shuffle) shuffle = False batch_size = samples_per_gpu num_workers = workers_per_gpu else: sampler = None batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None assert dataloader_type in ( 'DataLoader', 'PoolDataLoader'), f'unsupported dataloader {dataloader_type}' if dataloader_type == 'PoolDataLoader': dataloader = PoolDataLoader elif dataloader_type == 'DataLoader': dataloader = DataLoader data_loader = dataloader( dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), pin_memory=pin_memory, shuffle=shuffle, worker_init_fn=init_fn, drop_last=drop_last, **kwargs) return data_loader def worker_init_fn(worker_id, num_workers, rank, seed): """Worker init func for dataloader. The seed of each worker equals to num_worker * rank + worker_id + user_seed Args: worker_id (int): Worker id. num_workers (int): Number of workers. rank (int): The rank of current process. seed (int): The random seed to use. """ worker_seed = num_workers * rank + worker_id + seed np.random.seed(worker_seed) random.seed(worker_seed) ================================================ FILE: annotator/mmpkg/mmseg/datasets/chase_db1.py ================================================ import os.path as osp from .builder import DATASETS from .custom import CustomDataset @DATASETS.register_module() class ChaseDB1Dataset(CustomDataset): """Chase_db1 dataset. In segmentation map annotation for Chase_db1, 0 stands for background, which is included in 2 categories. ``reduce_zero_label`` is fixed to False. The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to '_1stHO.png'. """ CLASSES = ('background', 'vessel') PALETTE = [[120, 120, 120], [6, 230, 230]] def __init__(self, **kwargs): super(ChaseDB1Dataset, self).__init__( img_suffix='.png', seg_map_suffix='_1stHO.png', reduce_zero_label=False, **kwargs) assert osp.exists(self.img_dir) ================================================ FILE: annotator/mmpkg/mmseg/datasets/cityscapes.py ================================================ import os.path as osp import tempfile import annotator.mmpkg.mmcv as mmcv import numpy as np from annotator.mmpkg.mmcv.utils import print_log from PIL import Image from .builder import DATASETS from .custom import CustomDataset @DATASETS.register_module() class CityscapesDataset(CustomDataset): """Cityscapes dataset. The ``img_suffix`` is fixed to '_leftImg8bit.png' and ``seg_map_suffix`` is fixed to '_gtFine_labelTrainIds.png' for Cityscapes dataset. """ CLASSES = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle') PALETTE = [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100], [0, 0, 230], [119, 11, 32]] def __init__(self, **kwargs): super(CityscapesDataset, self).__init__( img_suffix='_leftImg8bit.png', seg_map_suffix='_gtFine_labelTrainIds.png', **kwargs) @staticmethod def _convert_to_label_id(result): """Convert trainId to id for cityscapes.""" if isinstance(result, str): result = np.load(result) import cityscapesscripts.helpers.labels as CSLabels result_copy = result.copy() for trainId, label in CSLabels.trainId2label.items(): result_copy[result == trainId] = label.id return result_copy def results2img(self, results, imgfile_prefix, to_label_id): """Write the segmentation results to images. Args: results (list[list | tuple | ndarray]): Testing results of the dataset. imgfile_prefix (str): The filename prefix of the png files. If the prefix is "somepath/xxx", the png files will be named "somepath/xxx.png". to_label_id (bool): whether convert output to label_id for submission Returns: list[str: str]: result txt files which contains corresponding semantic segmentation images. """ mmcv.mkdir_or_exist(imgfile_prefix) result_files = [] prog_bar = mmcv.ProgressBar(len(self)) for idx in range(len(self)): result = results[idx] if to_label_id: result = self._convert_to_label_id(result) filename = self.img_infos[idx]['filename'] basename = osp.splitext(osp.basename(filename))[0] png_filename = osp.join(imgfile_prefix, f'{basename}.png') output = Image.fromarray(result.astype(np.uint8)).convert('P') import cityscapesscripts.helpers.labels as CSLabels palette = np.zeros((len(CSLabels.id2label), 3), dtype=np.uint8) for label_id, label in CSLabels.id2label.items(): palette[label_id] = label.color output.putpalette(palette) output.save(png_filename) result_files.append(png_filename) prog_bar.update() return result_files def format_results(self, results, imgfile_prefix=None, to_label_id=True): """Format the results into dir (standard format for Cityscapes evaluation). Args: results (list): Testing results of the dataset. imgfile_prefix (str | None): The prefix of images files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. to_label_id (bool): whether convert output to label_id for submission. Default: False Returns: tuple: (result_files, tmp_dir), result_files is a list containing the image paths, tmp_dir is the temporal directory created for saving json/png files when img_prefix is not specified. """ assert isinstance(results, list), 'results must be a list' assert len(results) == len(self), ( 'The length of results is not equal to the dataset len: ' f'{len(results)} != {len(self)}') if imgfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() imgfile_prefix = tmp_dir.name else: tmp_dir = None result_files = self.results2img(results, imgfile_prefix, to_label_id) return result_files, tmp_dir def evaluate(self, results, metric='mIoU', logger=None, imgfile_prefix=None, efficient_test=False): """Evaluation in Cityscapes/default protocol. Args: results (list): Testing results of the dataset. metric (str | list[str]): Metrics to be evaluated. logger (logging.Logger | None | str): Logger used for printing related information during evaluation. Default: None. imgfile_prefix (str | None): The prefix of output image file, for cityscapes evaluation only. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If results are evaluated with cityscapes protocol, it would be the prefix of output png files. The output files would be png images under folder "a/b/prefix/xxx.png", where "xxx" is the image name of cityscapes. If not specified, a temp file will be created for evaluation. Default: None. Returns: dict[str, float]: Cityscapes/default metrics. """ eval_results = dict() metrics = metric.copy() if isinstance(metric, list) else [metric] if 'cityscapes' in metrics: eval_results.update( self._evaluate_cityscapes(results, logger, imgfile_prefix)) metrics.remove('cityscapes') if len(metrics) > 0: eval_results.update( super(CityscapesDataset, self).evaluate(results, metrics, logger, efficient_test)) return eval_results def _evaluate_cityscapes(self, results, logger, imgfile_prefix): """Evaluation in Cityscapes protocol. Args: results (list): Testing results of the dataset. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. imgfile_prefix (str | None): The prefix of output image file Returns: dict[str: float]: Cityscapes evaluation results. """ try: import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as CSEval # noqa except ImportError: raise ImportError('Please run "pip install cityscapesscripts" to ' 'install cityscapesscripts first.') msg = 'Evaluating in Cityscapes style' if logger is None: msg = '\n' + msg print_log(msg, logger=logger) result_files, tmp_dir = self.format_results(results, imgfile_prefix) if tmp_dir is None: result_dir = imgfile_prefix else: result_dir = tmp_dir.name eval_results = dict() print_log(f'Evaluating results under {result_dir} ...', logger=logger) CSEval.args.evalInstLevelScore = True CSEval.args.predictionPath = osp.abspath(result_dir) CSEval.args.evalPixelAccuracy = True CSEval.args.JSONOutput = False seg_map_list = [] pred_list = [] # when evaluating with official cityscapesscripts, # **_gtFine_labelIds.png is used for seg_map in mmcv.scandir( self.ann_dir, 'gtFine_labelIds.png', recursive=True): seg_map_list.append(osp.join(self.ann_dir, seg_map)) pred_list.append(CSEval.getPrediction(CSEval.args, seg_map)) eval_results.update( CSEval.evaluateImgLists(pred_list, seg_map_list, CSEval.args)) if tmp_dir is not None: tmp_dir.cleanup() return eval_results ================================================ FILE: annotator/mmpkg/mmseg/datasets/custom.py ================================================ import os import os.path as osp from collections import OrderedDict from functools import reduce import annotator.mmpkg.mmcv as mmcv import numpy as np from annotator.mmpkg.mmcv.utils import print_log from torch.utils.data import Dataset from annotator.mmpkg.mmseg.core import eval_metrics from annotator.mmpkg.mmseg.utils import get_root_logger from .builder import DATASETS from .pipelines import Compose @DATASETS.register_module() class CustomDataset(Dataset): """Custom dataset for semantic segmentation. An example of file structure is as followed. .. code-block:: none ├── data │ ├── my_dataset │ │ ├── img_dir │ │ │ ├── train │ │ │ │ ├── xxx{img_suffix} │ │ │ │ ├── yyy{img_suffix} │ │ │ │ ├── zzz{img_suffix} │ │ │ ├── val │ │ ├── ann_dir │ │ │ ├── train │ │ │ │ ├── xxx{seg_map_suffix} │ │ │ │ ├── yyy{seg_map_suffix} │ │ │ │ ├── zzz{seg_map_suffix} │ │ │ ├── val The img/gt_semantic_seg pair of CustomDataset should be of the same except suffix. A valid img/gt_semantic_seg filename pair should be like ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (extension is also included in the suffix). If split is given, then ``xxx`` is specified in txt file. Otherwise, all files in ``img_dir/``and ``ann_dir`` will be loaded. Please refer to ``docs/tutorials/new_dataset.md`` for more details. Args: pipeline (list[dict]): Processing pipeline img_dir (str): Path to image directory img_suffix (str): Suffix of images. Default: '.jpg' ann_dir (str, optional): Path to annotation directory. Default: None seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' split (str, optional): Split txt file. If split is specified, only file with suffix in the splits will be loaded. Otherwise, all images in img_dir/ann_dir will be loaded. Default: None data_root (str, optional): Data root for img_dir/ann_dir. Default: None. test_mode (bool): If test_mode=True, gt wouldn't be loaded. ignore_index (int): The label index to be ignored. Default: 255 reduce_zero_label (bool): Whether to mark label zero as ignored. Default: False classes (str | Sequence[str], optional): Specify classes to load. If is None, ``cls.CLASSES`` will be used. Default: None. palette (Sequence[Sequence[int]]] | np.ndarray | None): The palette of segmentation map. If None is given, and self.PALETTE is None, random palette will be generated. Default: None """ CLASSES = None PALETTE = None def __init__(self, pipeline, img_dir, img_suffix='.jpg', ann_dir=None, seg_map_suffix='.png', split=None, data_root=None, test_mode=False, ignore_index=255, reduce_zero_label=False, classes=None, palette=None): self.pipeline = Compose(pipeline) self.img_dir = img_dir self.img_suffix = img_suffix self.ann_dir = ann_dir self.seg_map_suffix = seg_map_suffix self.split = split self.data_root = data_root self.test_mode = test_mode self.ignore_index = ignore_index self.reduce_zero_label = reduce_zero_label self.label_map = None self.CLASSES, self.PALETTE = self.get_classes_and_palette( classes, palette) # join paths if data_root is specified if self.data_root is not None: if not osp.isabs(self.img_dir): self.img_dir = osp.join(self.data_root, self.img_dir) if not (self.ann_dir is None or osp.isabs(self.ann_dir)): self.ann_dir = osp.join(self.data_root, self.ann_dir) if not (self.split is None or osp.isabs(self.split)): self.split = osp.join(self.data_root, self.split) # load annotations self.img_infos = self.load_annotations(self.img_dir, self.img_suffix, self.ann_dir, self.seg_map_suffix, self.split) def __len__(self): """Total number of samples of data.""" return len(self.img_infos) def load_annotations(self, img_dir, img_suffix, ann_dir, seg_map_suffix, split): """Load annotation from directory. Args: img_dir (str): Path to image directory img_suffix (str): Suffix of images. ann_dir (str|None): Path to annotation directory. seg_map_suffix (str|None): Suffix of segmentation maps. split (str|None): Split txt file. If split is specified, only file with suffix in the splits will be loaded. Otherwise, all images in img_dir/ann_dir will be loaded. Default: None Returns: list[dict]: All image info of dataset. """ img_infos = [] if split is not None: with open(split) as f: for line in f: img_name = line.strip() img_info = dict(filename=img_name + img_suffix) if ann_dir is not None: seg_map = img_name + seg_map_suffix img_info['ann'] = dict(seg_map=seg_map) img_infos.append(img_info) else: for img in mmcv.scandir(img_dir, img_suffix, recursive=True): img_info = dict(filename=img) if ann_dir is not None: seg_map = img.replace(img_suffix, seg_map_suffix) img_info['ann'] = dict(seg_map=seg_map) img_infos.append(img_info) print_log(f'Loaded {len(img_infos)} images', logger=get_root_logger()) return img_infos def get_ann_info(self, idx): """Get annotation by index. Args: idx (int): Index of data. Returns: dict: Annotation info of specified index. """ return self.img_infos[idx]['ann'] def pre_pipeline(self, results): """Prepare results dict for pipeline.""" results['seg_fields'] = [] results['img_prefix'] = self.img_dir results['seg_prefix'] = self.ann_dir if self.custom_classes: results['label_map'] = self.label_map def __getitem__(self, idx): """Get training/test data after pipeline. Args: idx (int): Index of data. Returns: dict: Training/test data (with annotation if `test_mode` is set False). """ if self.test_mode: return self.prepare_test_img(idx) else: return self.prepare_train_img(idx) def prepare_train_img(self, idx): """Get training data and annotations after pipeline. Args: idx (int): Index of data. Returns: dict: Training data and annotation after pipeline with new keys introduced by pipeline. """ img_info = self.img_infos[idx] ann_info = self.get_ann_info(idx) results = dict(img_info=img_info, ann_info=ann_info) self.pre_pipeline(results) return self.pipeline(results) def prepare_test_img(self, idx): """Get testing data after pipeline. Args: idx (int): Index of data. Returns: dict: Testing data after pipeline with new keys introduced by pipeline. """ img_info = self.img_infos[idx] results = dict(img_info=img_info) self.pre_pipeline(results) return self.pipeline(results) def format_results(self, results, **kwargs): """Place holder to format result to dataset specific output.""" def get_gt_seg_maps(self, efficient_test=False): """Get ground truth segmentation maps for evaluation.""" gt_seg_maps = [] for img_info in self.img_infos: seg_map = osp.join(self.ann_dir, img_info['ann']['seg_map']) if efficient_test: gt_seg_map = seg_map else: gt_seg_map = mmcv.imread( seg_map, flag='unchanged', backend='pillow') gt_seg_maps.append(gt_seg_map) return gt_seg_maps def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. Args: classes (Sequence[str] | str | None): If classes is None, use default CLASSES defined by builtin dataset. If classes is a string, take it as a file name. The file contains the name of classes where each line contains one class name. If classes is a tuple or list, override the CLASSES defined by the dataset. palette (Sequence[Sequence[int]]] | np.ndarray | None): The palette of segmentation map. If None is given, random palette will be generated. Default: None """ if classes is None: self.custom_classes = False return self.CLASSES, self.PALETTE self.custom_classes = True if isinstance(classes, str): # take it as a file path class_names = mmcv.list_from_file(classes) elif isinstance(classes, (tuple, list)): class_names = classes else: raise ValueError(f'Unsupported type {type(classes)} of classes.') if self.CLASSES: if not set(classes).issubset(self.CLASSES): raise ValueError('classes is not a subset of CLASSES.') # dictionary, its keys are the old label ids and its values # are the new label ids. # used for changing pixel labels in load_annotations. self.label_map = {} for i, c in enumerate(self.CLASSES): if c not in class_names: self.label_map[i] = -1 else: self.label_map[i] = classes.index(c) palette = self.get_palette_for_custom_classes(class_names, palette) return class_names, palette def get_palette_for_custom_classes(self, class_names, palette=None): if self.label_map is not None: # return subset of palette palette = [] for old_id, new_id in sorted( self.label_map.items(), key=lambda x: x[1]): if new_id != -1: palette.append(self.PALETTE[old_id]) palette = type(self.PALETTE)(palette) elif palette is None: if self.PALETTE is None: palette = np.random.randint(0, 255, size=(len(class_names), 3)) else: palette = self.PALETTE return palette def evaluate(self, results, metric='mIoU', logger=None, efficient_test=False, **kwargs): """Evaluate the dataset. Args: results (list): Testing results of the dataset. metric (str | list[str]): Metrics to be evaluated. 'mIoU', 'mDice' and 'mFscore' are supported. logger (logging.Logger | None | str): Logger used for printing related information during evaluation. Default: None. Returns: dict[str, float]: Default metrics. """ if isinstance(metric, str): metric = [metric] allowed_metrics = ['mIoU', 'mDice', 'mFscore'] if not set(metric).issubset(set(allowed_metrics)): raise KeyError('metric {} is not supported'.format(metric)) eval_results = {} gt_seg_maps = self.get_gt_seg_maps(efficient_test) if self.CLASSES is None: num_classes = len( reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps])) else: num_classes = len(self.CLASSES) ret_metrics = eval_metrics( results, gt_seg_maps, num_classes, self.ignore_index, metric, label_map=self.label_map, reduce_zero_label=self.reduce_zero_label) if self.CLASSES is None: class_names = tuple(range(num_classes)) else: class_names = self.CLASSES # summary table ret_metrics_summary = OrderedDict({ ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2) for ret_metric, ret_metric_value in ret_metrics.items() }) # each class table ret_metrics.pop('aAcc', None) ret_metrics_class = OrderedDict({ ret_metric: np.round(ret_metric_value * 100, 2) for ret_metric, ret_metric_value in ret_metrics.items() }) ret_metrics_class.update({'Class': class_names}) ret_metrics_class.move_to_end('Class', last=False) try: from prettytable import PrettyTable # for logger class_table_data = PrettyTable() for key, val in ret_metrics_class.items(): class_table_data.add_column(key, val) summary_table_data = PrettyTable() for key, val in ret_metrics_summary.items(): if key == 'aAcc': summary_table_data.add_column(key, [val]) else: summary_table_data.add_column('m' + key, [val]) print_log('per class results:', logger) print_log('\n' + class_table_data.get_string(), logger=logger) print_log('Summary:', logger) print_log('\n' + summary_table_data.get_string(), logger=logger) except ImportError: # prettytable is not installed pass # each metric dict for key, value in ret_metrics_summary.items(): if key == 'aAcc': eval_results[key] = value / 100.0 else: eval_results['m' + key] = value / 100.0 ret_metrics_class.pop('Class', None) for key, value in ret_metrics_class.items(): eval_results.update({ key + '.' + str(name): value[idx] / 100.0 for idx, name in enumerate(class_names) }) if mmcv.is_list_of(results, str): for file_name in results: os.remove(file_name) return eval_results ================================================ FILE: annotator/mmpkg/mmseg/datasets/dataset_wrappers.py ================================================ from torch.utils.data.dataset import ConcatDataset as _ConcatDataset from .builder import DATASETS @DATASETS.register_module() class ConcatDataset(_ConcatDataset): """A wrapper of concatenated dataset. Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but concat the group flag for image aspect ratio. Args: datasets (list[:obj:`Dataset`]): A list of datasets. """ def __init__(self, datasets): super(ConcatDataset, self).__init__(datasets) self.CLASSES = datasets[0].CLASSES self.PALETTE = datasets[0].PALETTE @DATASETS.register_module() class RepeatDataset(object): """A wrapper of repeated dataset. The length of repeated dataset will be `times` larger than the original dataset. This is useful when the data loading time is long but the dataset is small. Using RepeatDataset can reduce the data loading time between epochs. Args: dataset (:obj:`Dataset`): The dataset to be repeated. times (int): Repeat times. """ def __init__(self, dataset, times): self.dataset = dataset self.times = times self.CLASSES = dataset.CLASSES self.PALETTE = dataset.PALETTE self._ori_len = len(self.dataset) def __getitem__(self, idx): """Get item from original dataset.""" return self.dataset[idx % self._ori_len] def __len__(self): """The length is multiplied by ``times``""" return self.times * self._ori_len ================================================ FILE: annotator/mmpkg/mmseg/datasets/drive.py ================================================ import os.path as osp from .builder import DATASETS from .custom import CustomDataset @DATASETS.register_module() class DRIVEDataset(CustomDataset): """DRIVE dataset. In segmentation map annotation for DRIVE, 0 stands for background, which is included in 2 categories. ``reduce_zero_label`` is fixed to False. The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to '_manual1.png'. """ CLASSES = ('background', 'vessel') PALETTE = [[120, 120, 120], [6, 230, 230]] def __init__(self, **kwargs): super(DRIVEDataset, self).__init__( img_suffix='.png', seg_map_suffix='_manual1.png', reduce_zero_label=False, **kwargs) assert osp.exists(self.img_dir) ================================================ FILE: annotator/mmpkg/mmseg/datasets/hrf.py ================================================ import os.path as osp from .builder import DATASETS from .custom import CustomDataset @DATASETS.register_module() class HRFDataset(CustomDataset): """HRF dataset. In segmentation map annotation for HRF, 0 stands for background, which is included in 2 categories. ``reduce_zero_label`` is fixed to False. The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. """ CLASSES = ('background', 'vessel') PALETTE = [[120, 120, 120], [6, 230, 230]] def __init__(self, **kwargs): super(HRFDataset, self).__init__( img_suffix='.png', seg_map_suffix='.png', reduce_zero_label=False, **kwargs) assert osp.exists(self.img_dir) ================================================ FILE: annotator/mmpkg/mmseg/datasets/pascal_context.py ================================================ import os.path as osp from .builder import DATASETS from .custom import CustomDataset @DATASETS.register_module() class PascalContextDataset(CustomDataset): """PascalContext dataset. In segmentation map annotation for PascalContext, 0 stands for background, which is included in 60 categories. ``reduce_zero_label`` is fixed to False. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is fixed to '.png'. Args: split (str): Split txt file for PascalContext. """ CLASSES = ('background', 'aeroplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle', 'bird', 'boat', 'book', 'bottle', 'building', 'bus', 'cabinet', 'car', 'cat', 'ceiling', 'chair', 'cloth', 'computer', 'cow', 'cup', 'curtain', 'dog', 'door', 'fence', 'floor', 'flower', 'food', 'grass', 'ground', 'horse', 'keyboard', 'light', 'motorbike', 'mountain', 'mouse', 'person', 'plate', 'platform', 'pottedplant', 'road', 'rock', 'sheep', 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa', 'table', 'track', 'train', 'tree', 'truck', 'tvmonitor', 'wall', 'water', 'window', 'wood') PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]] def __init__(self, split, **kwargs): super(PascalContextDataset, self).__init__( img_suffix='.jpg', seg_map_suffix='.png', split=split, reduce_zero_label=False, **kwargs) assert osp.exists(self.img_dir) and self.split is not None @DATASETS.register_module() class PascalContextDataset59(CustomDataset): """PascalContext dataset. In segmentation map annotation for PascalContext, 0 stands for background, which is included in 60 categories. ``reduce_zero_label`` is fixed to False. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is fixed to '.png'. Args: split (str): Split txt file for PascalContext. """ CLASSES = ('aeroplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle', 'bird', 'boat', 'book', 'bottle', 'building', 'bus', 'cabinet', 'car', 'cat', 'ceiling', 'chair', 'cloth', 'computer', 'cow', 'cup', 'curtain', 'dog', 'door', 'fence', 'floor', 'flower', 'food', 'grass', 'ground', 'horse', 'keyboard', 'light', 'motorbike', 'mountain', 'mouse', 'person', 'plate', 'platform', 'pottedplant', 'road', 'rock', 'sheep', 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa', 'table', 'track', 'train', 'tree', 'truck', 'tvmonitor', 'wall', 'water', 'window', 'wood') PALETTE = [[180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]] def __init__(self, split, **kwargs): super(PascalContextDataset59, self).__init__( img_suffix='.jpg', seg_map_suffix='.png', split=split, reduce_zero_label=True, **kwargs) assert osp.exists(self.img_dir) and self.split is not None ================================================ FILE: annotator/mmpkg/mmseg/datasets/pipelines/__init__.py ================================================ from .compose import Compose from .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor, Transpose, to_tensor) from .loading import LoadAnnotations, LoadImageFromFile from .test_time_aug import MultiScaleFlipAug from .transforms import (CLAHE, AdjustGamma, Normalize, Pad, PhotoMetricDistortion, RandomCrop, RandomFlip, RandomRotate, Rerange, Resize, RGB2Gray, SegRescale) __all__ = [ 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', 'Transpose', 'Collect', 'LoadAnnotations', 'LoadImageFromFile', 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate', 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray' ] ================================================ FILE: annotator/mmpkg/mmseg/datasets/pipelines/compose.py ================================================ import collections from annotator.mmpkg.mmcv.utils import build_from_cfg from ..builder import PIPELINES @PIPELINES.register_module() class Compose(object): """Compose multiple transforms sequentially. Args: transforms (Sequence[dict | callable]): Sequence of transform object or config dict to be composed. """ def __init__(self, transforms): assert isinstance(transforms, collections.abc.Sequence) self.transforms = [] for transform in transforms: if isinstance(transform, dict): transform = build_from_cfg(transform, PIPELINES) self.transforms.append(transform) elif callable(transform): self.transforms.append(transform) else: raise TypeError('transform must be callable or a dict') def __call__(self, data): """Call function to apply transforms sequentially. Args: data (dict): A result dict contains the data to transform. Returns: dict: Transformed data. """ for t in self.transforms: data = t(data) if data is None: return None return data def __repr__(self): format_string = self.__class__.__name__ + '(' for t in self.transforms: format_string += '\n' format_string += f' {t}' format_string += '\n)' return format_string ================================================ FILE: annotator/mmpkg/mmseg/datasets/pipelines/formating.py ================================================ from collections.abc import Sequence import annotator.mmpkg.mmcv as mmcv import numpy as np import torch from annotator.mmpkg.mmcv.parallel import DataContainer as DC from ..builder import PIPELINES def to_tensor(data): """Convert objects of various python types to :obj:`torch.Tensor`. Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, :class:`Sequence`, :class:`int` and :class:`float`. Args: data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to be converted. """ if isinstance(data, torch.Tensor): return data elif isinstance(data, np.ndarray): return torch.from_numpy(data) elif isinstance(data, Sequence) and not mmcv.is_str(data): return torch.tensor(data) elif isinstance(data, int): return torch.LongTensor([data]) elif isinstance(data, float): return torch.FloatTensor([data]) else: raise TypeError(f'type {type(data)} cannot be converted to tensor.') @PIPELINES.register_module() class ToTensor(object): """Convert some results to :obj:`torch.Tensor` by given keys. Args: keys (Sequence[str]): Keys that need to be converted to Tensor. """ def __init__(self, keys): self.keys = keys def __call__(self, results): """Call function to convert data in results to :obj:`torch.Tensor`. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data converted to :obj:`torch.Tensor`. """ for key in self.keys: results[key] = to_tensor(results[key]) return results def __repr__(self): return self.__class__.__name__ + f'(keys={self.keys})' @PIPELINES.register_module() class ImageToTensor(object): """Convert image to :obj:`torch.Tensor` by given keys. The dimension order of input image is (H, W, C). The pipeline will convert it to (C, H, W). If only 2 dimension (H, W) is given, the output would be (1, H, W). Args: keys (Sequence[str]): Key of images to be converted to Tensor. """ def __init__(self, keys): self.keys = keys def __call__(self, results): """Call function to convert image in results to :obj:`torch.Tensor` and transpose the channel order. Args: results (dict): Result dict contains the image data to convert. Returns: dict: The result dict contains the image converted to :obj:`torch.Tensor` and transposed to (C, H, W) order. """ for key in self.keys: img = results[key] if len(img.shape) < 3: img = np.expand_dims(img, -1) results[key] = to_tensor(img.transpose(2, 0, 1)) return results def __repr__(self): return self.__class__.__name__ + f'(keys={self.keys})' @PIPELINES.register_module() class Transpose(object): """Transpose some results by given keys. Args: keys (Sequence[str]): Keys of results to be transposed. order (Sequence[int]): Order of transpose. """ def __init__(self, keys, order): self.keys = keys self.order = order def __call__(self, results): """Call function to convert image in results to :obj:`torch.Tensor` and transpose the channel order. Args: results (dict): Result dict contains the image data to convert. Returns: dict: The result dict contains the image converted to :obj:`torch.Tensor` and transposed to (C, H, W) order. """ for key in self.keys: results[key] = results[key].transpose(self.order) return results def __repr__(self): return self.__class__.__name__ + \ f'(keys={self.keys}, order={self.order})' @PIPELINES.register_module() class ToDataContainer(object): """Convert results to :obj:`mmcv.DataContainer` by given fields. Args: fields (Sequence[dict]): Each field is a dict like ``dict(key='xxx', **kwargs)``. The ``key`` in result will be converted to :obj:`mmcv.DataContainer` with ``**kwargs``. Default: ``(dict(key='img', stack=True), dict(key='gt_semantic_seg'))``. """ def __init__(self, fields=(dict(key='img', stack=True), dict(key='gt_semantic_seg'))): self.fields = fields def __call__(self, results): """Call function to convert data in results to :obj:`mmcv.DataContainer`. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data converted to :obj:`mmcv.DataContainer`. """ for field in self.fields: field = field.copy() key = field.pop('key') results[key] = DC(results[key], **field) return results def __repr__(self): return self.__class__.__name__ + f'(fields={self.fields})' @PIPELINES.register_module() class DefaultFormatBundle(object): """Default formatting bundle. It simplifies the pipeline of formatting common fields, including "img" and "gt_semantic_seg". These fields are formatted as follows. - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, (3)to DataContainer (stack=True) """ def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ if 'img' in results: img = results['img'] if len(img.shape) < 3: img = np.expand_dims(img, -1) img = np.ascontiguousarray(img.transpose(2, 0, 1)) results['img'] = DC(to_tensor(img), stack=True) if 'gt_semantic_seg' in results: # convert to long results['gt_semantic_seg'] = DC( to_tensor(results['gt_semantic_seg'][None, ...].astype(np.int64)), stack=True) return results def __repr__(self): return self.__class__.__name__ @PIPELINES.register_module() class Collect(object): """Collect data from the loader relevant to the specific task. This is usually the last stage of the data loader pipeline. Typically keys is set to some subset of "img", "gt_semantic_seg". The "img_meta" item is always populated. The contents of the "img_meta" dictionary depends on "meta_keys". By default this includes: - "img_shape": shape of the image input to the network as a tuple (h, w, c). Note that images may be zero padded on the bottom/right if the batch tensor is larger than this shape. - "scale_factor": a float indicating the preprocessing scale - "flip": a boolean indicating if image flip transform was used - "filename": path to the image file - "ori_shape": original shape of the image as a tuple (h, w, c) - "pad_shape": image shape after padding - "img_norm_cfg": a dict of normalization information: - mean - per channel mean subtraction - std - per channel std divisor - to_rgb - bool indicating if bgr was converted to rgb Args: keys (Sequence[str]): Keys of results to be collected in ``data``. meta_keys (Sequence[str], optional): Meta keys to be converted to ``mmcv.DataContainer`` and collected in ``data[img_metas]``. Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg')`` """ def __init__(self, keys, meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'flip_direction', 'img_norm_cfg')): self.keys = keys self.meta_keys = meta_keys def __call__(self, results): """Call function to collect keys in results. The keys in ``meta_keys`` will be converted to :obj:mmcv.DataContainer. Args: results (dict): Result dict contains the data to collect. Returns: dict: The result dict contains the following keys - keys in``self.keys`` - ``img_metas`` """ data = {} img_meta = {} for key in self.meta_keys: img_meta[key] = results[key] data['img_metas'] = DC(img_meta, cpu_only=True) for key in self.keys: data[key] = results[key] return data def __repr__(self): return self.__class__.__name__ + \ f'(keys={self.keys}, meta_keys={self.meta_keys})' ================================================ FILE: annotator/mmpkg/mmseg/datasets/pipelines/loading.py ================================================ import os.path as osp import annotator.mmpkg.mmcv as mmcv import numpy as np from ..builder import PIPELINES @PIPELINES.register_module() class LoadImageFromFile(object): """Load an image from file. Required keys are "img_prefix" and "img_info" (a dict that must contain the key "filename"). Added or updated keys are "filename", "img", "img_shape", "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). Args: to_float32 (bool): Whether to convert the loaded image to a float32 numpy array. If set to False, the loaded image is an uint8 array. Defaults to False. color_type (str): The flag argument for :func:`mmcv.imfrombytes`. Defaults to 'color'. file_client_args (dict): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Defaults to ``dict(backend='disk')``. imdecode_backend (str): Backend for :func:`mmcv.imdecode`. Default: 'cv2' """ def __init__(self, to_float32=False, color_type='color', file_client_args=dict(backend='disk'), imdecode_backend='cv2'): self.to_float32 = to_float32 self.color_type = color_type self.file_client_args = file_client_args.copy() self.file_client = None self.imdecode_backend = imdecode_backend def __call__(self, results): """Call functions to load image and get image meta information. Args: results (dict): Result dict from :obj:`mmseg.CustomDataset`. Returns: dict: The dict contains loaded image and meta information. """ if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) if results.get('img_prefix') is not None: filename = osp.join(results['img_prefix'], results['img_info']['filename']) else: filename = results['img_info']['filename'] img_bytes = self.file_client.get(filename) img = mmcv.imfrombytes( img_bytes, flag=self.color_type, backend=self.imdecode_backend) if self.to_float32: img = img.astype(np.float32) results['filename'] = filename results['ori_filename'] = results['img_info']['filename'] results['img'] = img results['img_shape'] = img.shape results['ori_shape'] = img.shape # Set initial values for default meta_keys results['pad_shape'] = img.shape results['scale_factor'] = 1.0 num_channels = 1 if len(img.shape) < 3 else img.shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(to_float32={self.to_float32},' repr_str += f"color_type='{self.color_type}'," repr_str += f"imdecode_backend='{self.imdecode_backend}')" return repr_str @PIPELINES.register_module() class LoadAnnotations(object): """Load annotations for semantic segmentation. Args: reduce_zero_label (bool): Whether reduce all label value by 1. Usually used for datasets where 0 is background label. Default: False. file_client_args (dict): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Defaults to ``dict(backend='disk')``. imdecode_backend (str): Backend for :func:`mmcv.imdecode`. Default: 'pillow' """ def __init__(self, reduce_zero_label=False, file_client_args=dict(backend='disk'), imdecode_backend='pillow'): self.reduce_zero_label = reduce_zero_label self.file_client_args = file_client_args.copy() self.file_client = None self.imdecode_backend = imdecode_backend def __call__(self, results): """Call function to load multiple types annotations. Args: results (dict): Result dict from :obj:`mmseg.CustomDataset`. Returns: dict: The dict contains loaded semantic segmentation annotations. """ if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) if results.get('seg_prefix', None) is not None: filename = osp.join(results['seg_prefix'], results['ann_info']['seg_map']) else: filename = results['ann_info']['seg_map'] img_bytes = self.file_client.get(filename) gt_semantic_seg = mmcv.imfrombytes( img_bytes, flag='unchanged', backend=self.imdecode_backend).squeeze().astype(np.uint8) # modify if custom classes if results.get('label_map', None) is not None: for old_id, new_id in results['label_map'].items(): gt_semantic_seg[gt_semantic_seg == old_id] = new_id # reduce zero_label if self.reduce_zero_label: # avoid using underflow conversion gt_semantic_seg[gt_semantic_seg == 0] = 255 gt_semantic_seg = gt_semantic_seg - 1 gt_semantic_seg[gt_semantic_seg == 254] = 255 results['gt_semantic_seg'] = gt_semantic_seg results['seg_fields'].append('gt_semantic_seg') return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(reduce_zero_label={self.reduce_zero_label},' repr_str += f"imdecode_backend='{self.imdecode_backend}')" return repr_str ================================================ FILE: annotator/mmpkg/mmseg/datasets/pipelines/test_time_aug.py ================================================ import warnings import annotator.mmpkg.mmcv as mmcv from ..builder import PIPELINES from .compose import Compose @PIPELINES.register_module() class MultiScaleFlipAug(object): """Test-time augmentation with multiple scales and flipping. An example configuration is as followed: .. code-block:: img_scale=(2048, 1024), img_ratios=[0.5, 1.0], flip=True, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ] After MultiScaleFLipAug with above configuration, the results are wrapped into lists of the same length as followed: .. code-block:: dict( img=[...], img_shape=[...], scale=[(1024, 512), (1024, 512), (2048, 1024), (2048, 1024)] flip=[False, True, False, True] ... ) Args: transforms (list[dict]): Transforms to apply in each augmentation. img_scale (None | tuple | list[tuple]): Images scales for resizing. img_ratios (float | list[float]): Image ratios for resizing flip (bool): Whether apply flip augmentation. Default: False. flip_direction (str | list[str]): Flip augmentation directions, options are "horizontal" and "vertical". If flip_direction is list, multiple flip augmentations will be applied. It has no effect when flip == False. Default: "horizontal". """ def __init__(self, transforms, img_scale, img_ratios=None, flip=False, flip_direction='horizontal'): self.transforms = Compose(transforms) if img_ratios is not None: img_ratios = img_ratios if isinstance(img_ratios, list) else [img_ratios] assert mmcv.is_list_of(img_ratios, float) if img_scale is None: # mode 1: given img_scale=None and a range of image ratio self.img_scale = None assert mmcv.is_list_of(img_ratios, float) elif isinstance(img_scale, tuple) and mmcv.is_list_of( img_ratios, float): assert len(img_scale) == 2 # mode 2: given a scale and a range of image ratio self.img_scale = [(int(img_scale[0] * ratio), int(img_scale[1] * ratio)) for ratio in img_ratios] else: # mode 3: given multiple scales self.img_scale = img_scale if isinstance(img_scale, list) else [img_scale] assert mmcv.is_list_of(self.img_scale, tuple) or self.img_scale is None self.flip = flip self.img_ratios = img_ratios self.flip_direction = flip_direction if isinstance( flip_direction, list) else [flip_direction] assert mmcv.is_list_of(self.flip_direction, str) if not self.flip and self.flip_direction != ['horizontal']: warnings.warn( 'flip_direction has no effect when flip is set to False') if (self.flip and not any([t['type'] == 'RandomFlip' for t in transforms])): warnings.warn( 'flip has no effect when RandomFlip is not in transforms') def __call__(self, results): """Call function to apply test time augment transforms on results. Args: results (dict): Result dict contains the data to transform. Returns: dict[str: list]: The augmented data, where each value is wrapped into a list. """ aug_data = [] if self.img_scale is None and mmcv.is_list_of(self.img_ratios, float): h, w = results['img'].shape[:2] img_scale = [(int(w * ratio), int(h * ratio)) for ratio in self.img_ratios] else: img_scale = self.img_scale flip_aug = [False, True] if self.flip else [False] for scale in img_scale: for flip in flip_aug: for direction in self.flip_direction: _results = results.copy() _results['scale'] = scale _results['flip'] = flip _results['flip_direction'] = direction data = self.transforms(_results) aug_data.append(data) # list of dict to dict of list aug_data_dict = {key: [] for key in aug_data[0]} for data in aug_data: for key, val in data.items(): aug_data_dict[key].append(val) return aug_data_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(transforms={self.transforms}, ' repr_str += f'img_scale={self.img_scale}, flip={self.flip})' repr_str += f'flip_direction={self.flip_direction}' return repr_str ================================================ FILE: annotator/mmpkg/mmseg/datasets/pipelines/transforms.py ================================================ import annotator.mmpkg.mmcv as mmcv import numpy as np from annotator.mmpkg.mmcv.utils import deprecated_api_warning, is_tuple_of from numpy import random from ..builder import PIPELINES @PIPELINES.register_module() class Resize(object): """Resize images & seg. This transform resizes the input image to some scale. If the input dict contains the key "scale", then the scale in the input dict is used, otherwise the specified scale in the init method is used. ``img_scale`` can be None, a tuple (single-scale) or a list of tuple (multi-scale). There are 4 multiscale modes: - ``ratio_range is not None``: 1. When img_scale is None, img_scale is the shape of image in results (img_scale = results['img'].shape[:2]) and the image is resized based on the original size. (mode 1) 2. When img_scale is a tuple (single-scale), randomly sample a ratio from the ratio range and multiply it with the image scale. (mode 2) - ``ratio_range is None and multiscale_mode == "range"``: randomly sample a scale from the a range. (mode 3) - ``ratio_range is None and multiscale_mode == "value"``: randomly sample a scale from multiple scales. (mode 4) Args: img_scale (tuple or list[tuple]): Images scales for resizing. multiscale_mode (str): Either "range" or "value". ratio_range (tuple[float]): (min_ratio, max_ratio) keep_ratio (bool): Whether to keep the aspect ratio when resizing the image. """ def __init__(self, img_scale=None, multiscale_mode='range', ratio_range=None, keep_ratio=True): if img_scale is None: self.img_scale = None else: if isinstance(img_scale, list): self.img_scale = img_scale else: self.img_scale = [img_scale] assert mmcv.is_list_of(self.img_scale, tuple) if ratio_range is not None: # mode 1: given img_scale=None and a range of image ratio # mode 2: given a scale and a range of image ratio assert self.img_scale is None or len(self.img_scale) == 1 else: # mode 3 and 4: given multiple scales or a range of scales assert multiscale_mode in ['value', 'range'] self.multiscale_mode = multiscale_mode self.ratio_range = ratio_range self.keep_ratio = keep_ratio @staticmethod def random_select(img_scales): """Randomly select an img_scale from given candidates. Args: img_scales (list[tuple]): Images scales for selection. Returns: (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, where ``img_scale`` is the selected image scale and ``scale_idx`` is the selected index in the given candidates. """ assert mmcv.is_list_of(img_scales, tuple) scale_idx = np.random.randint(len(img_scales)) img_scale = img_scales[scale_idx] return img_scale, scale_idx @staticmethod def random_sample(img_scales): """Randomly sample an img_scale when ``multiscale_mode=='range'``. Args: img_scales (list[tuple]): Images scale range for sampling. There must be two tuples in img_scales, which specify the lower and upper bound of image scales. Returns: (tuple, None): Returns a tuple ``(img_scale, None)``, where ``img_scale`` is sampled scale and None is just a placeholder to be consistent with :func:`random_select`. """ assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 img_scale_long = [max(s) for s in img_scales] img_scale_short = [min(s) for s in img_scales] long_edge = np.random.randint( min(img_scale_long), max(img_scale_long) + 1) short_edge = np.random.randint( min(img_scale_short), max(img_scale_short) + 1) img_scale = (long_edge, short_edge) return img_scale, None @staticmethod def random_sample_ratio(img_scale, ratio_range): """Randomly sample an img_scale when ``ratio_range`` is specified. A ratio will be randomly sampled from the range specified by ``ratio_range``. Then it would be multiplied with ``img_scale`` to generate sampled scale. Args: img_scale (tuple): Images scale base to multiply with ratio. ratio_range (tuple[float]): The minimum and maximum ratio to scale the ``img_scale``. Returns: (tuple, None): Returns a tuple ``(scale, None)``, where ``scale`` is sampled ratio multiplied with ``img_scale`` and None is just a placeholder to be consistent with :func:`random_select`. """ assert isinstance(img_scale, tuple) and len(img_scale) == 2 min_ratio, max_ratio = ratio_range assert min_ratio <= max_ratio ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) return scale, None def _random_scale(self, results): """Randomly sample an img_scale according to ``ratio_range`` and ``multiscale_mode``. If ``ratio_range`` is specified, a ratio will be sampled and be multiplied with ``img_scale``. If multiple scales are specified by ``img_scale``, a scale will be sampled according to ``multiscale_mode``. Otherwise, single scale will be used. Args: results (dict): Result dict from :obj:`dataset`. Returns: dict: Two new keys 'scale` and 'scale_idx` are added into ``results``, which would be used by subsequent pipelines. """ if self.ratio_range is not None: if self.img_scale is None: h, w = results['img'].shape[:2] scale, scale_idx = self.random_sample_ratio((w, h), self.ratio_range) else: scale, scale_idx = self.random_sample_ratio( self.img_scale[0], self.ratio_range) elif len(self.img_scale) == 1: scale, scale_idx = self.img_scale[0], 0 elif self.multiscale_mode == 'range': scale, scale_idx = self.random_sample(self.img_scale) elif self.multiscale_mode == 'value': scale, scale_idx = self.random_select(self.img_scale) else: raise NotImplementedError results['scale'] = scale results['scale_idx'] = scale_idx def _resize_img(self, results): """Resize images with ``results['scale']``.""" if self.keep_ratio: img, scale_factor = mmcv.imrescale( results['img'], results['scale'], return_scale=True) # the w_scale and h_scale has minor difference # a real fix should be done in the mmcv.imrescale in the future new_h, new_w = img.shape[:2] h, w = results['img'].shape[:2] w_scale = new_w / w h_scale = new_h / h else: img, w_scale, h_scale = mmcv.imresize( results['img'], results['scale'], return_scale=True) scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], dtype=np.float32) results['img'] = img results['img_shape'] = img.shape results['pad_shape'] = img.shape # in case that there is no padding results['scale_factor'] = scale_factor results['keep_ratio'] = self.keep_ratio def _resize_seg(self, results): """Resize semantic segmentation map with ``results['scale']``.""" for key in results.get('seg_fields', []): if self.keep_ratio: gt_seg = mmcv.imrescale( results[key], results['scale'], interpolation='nearest') else: gt_seg = mmcv.imresize( results[key], results['scale'], interpolation='nearest') results[key] = gt_seg def __call__(self, results): """Call function to resize images, bounding boxes, masks, semantic segmentation map. Args: results (dict): Result dict from loading pipeline. Returns: dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', 'keep_ratio' keys are added into result dict. """ if 'scale' not in results: self._random_scale(results) self._resize_img(results) self._resize_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += (f'(img_scale={self.img_scale}, ' f'multiscale_mode={self.multiscale_mode}, ' f'ratio_range={self.ratio_range}, ' f'keep_ratio={self.keep_ratio})') return repr_str @PIPELINES.register_module() class RandomFlip(object): """Flip the image & seg. If the input dict contains the key "flip", then the flag will be used, otherwise it will be randomly decided by a ratio specified in the init method. Args: prob (float, optional): The flipping probability. Default: None. direction(str, optional): The flipping direction. Options are 'horizontal' and 'vertical'. Default: 'horizontal'. """ @deprecated_api_warning({'flip_ratio': 'prob'}, cls_name='RandomFlip') def __init__(self, prob=None, direction='horizontal'): self.prob = prob self.direction = direction if prob is not None: assert prob >= 0 and prob <= 1 assert direction in ['horizontal', 'vertical'] def __call__(self, results): """Call function to flip bounding boxes, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Flipped results, 'flip', 'flip_direction' keys are added into result dict. """ if 'flip' not in results: flip = True if np.random.rand() < self.prob else False results['flip'] = flip if 'flip_direction' not in results: results['flip_direction'] = self.direction if results['flip']: # flip image results['img'] = mmcv.imflip( results['img'], direction=results['flip_direction']) # flip segs for key in results.get('seg_fields', []): # use copy() to make numpy stride positive results[key] = mmcv.imflip( results[key], direction=results['flip_direction']).copy() return results def __repr__(self): return self.__class__.__name__ + f'(prob={self.prob})' @PIPELINES.register_module() class Pad(object): """Pad the image & mask. There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", Args: size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_val (float, optional): Padding value. Default: 0. seg_pad_val (float, optional): Padding value of segmentation map. Default: 255. """ def __init__(self, size=None, size_divisor=None, pad_val=0, seg_pad_val=255): self.size = size self.size_divisor = size_divisor self.pad_val = pad_val self.seg_pad_val = seg_pad_val # only one of size and size_divisor should be valid assert size is not None or size_divisor is not None assert size is None or size_divisor is None def _pad_img(self, results): """Pad images according to ``self.size``.""" if self.size is not None: padded_img = mmcv.impad( results['img'], shape=self.size, pad_val=self.pad_val) elif self.size_divisor is not None: padded_img = mmcv.impad_to_multiple( results['img'], self.size_divisor, pad_val=self.pad_val) results['img'] = padded_img results['pad_shape'] = padded_img.shape results['pad_fixed_size'] = self.size results['pad_size_divisor'] = self.size_divisor def _pad_seg(self, results): """Pad masks according to ``results['pad_shape']``.""" for key in results.get('seg_fields', []): results[key] = mmcv.impad( results[key], shape=results['pad_shape'][:2], pad_val=self.seg_pad_val) def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. """ self._pad_img(results) self._pad_seg(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, size_divisor={self.size_divisor}, ' \ f'pad_val={self.pad_val})' return repr_str @PIPELINES.register_module() class Normalize(object): """Normalize the image. Added key is "img_norm_cfg". Args: mean (sequence): Mean values of 3 channels. std (sequence): Std values of 3 channels. to_rgb (bool): Whether to convert the image from BGR to RGB, default is true. """ def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): """Call function to normalize images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Normalized results, 'img_norm_cfg' key is added into result dict. """ results['img'] = mmcv.imnormalize(results['img'], self.mean, self.std, self.to_rgb) results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb=' \ f'{self.to_rgb})' return repr_str @PIPELINES.register_module() class Rerange(object): """Rerange the image pixel value. Args: min_value (float or int): Minimum value of the reranged image. Default: 0. max_value (float or int): Maximum value of the reranged image. Default: 255. """ def __init__(self, min_value=0, max_value=255): assert isinstance(min_value, float) or isinstance(min_value, int) assert isinstance(max_value, float) or isinstance(max_value, int) assert min_value < max_value self.min_value = min_value self.max_value = max_value def __call__(self, results): """Call function to rerange images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Reranged results. """ img = results['img'] img_min_value = np.min(img) img_max_value = np.max(img) assert img_min_value < img_max_value # rerange to [0, 1] img = (img - img_min_value) / (img_max_value - img_min_value) # rerange to [min_value, max_value] img = img * (self.max_value - self.min_value) + self.min_value results['img'] = img return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(min_value={self.min_value}, max_value={self.max_value})' return repr_str @PIPELINES.register_module() class CLAHE(object): """Use CLAHE method to process the image. See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. Graphics Gems, 1994:474-485.` for more information. Args: clip_limit (float): Threshold for contrast limiting. Default: 40.0. tile_grid_size (tuple[int]): Size of grid for histogram equalization. Input image will be divided into equally sized rectangular tiles. It defines the number of tiles in row and column. Default: (8, 8). """ def __init__(self, clip_limit=40.0, tile_grid_size=(8, 8)): assert isinstance(clip_limit, (float, int)) self.clip_limit = clip_limit assert is_tuple_of(tile_grid_size, int) assert len(tile_grid_size) == 2 self.tile_grid_size = tile_grid_size def __call__(self, results): """Call function to Use CLAHE method process images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Processed results. """ for i in range(results['img'].shape[2]): results['img'][:, :, i] = mmcv.clahe( np.array(results['img'][:, :, i], dtype=np.uint8), self.clip_limit, self.tile_grid_size) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(clip_limit={self.clip_limit}, '\ f'tile_grid_size={self.tile_grid_size})' return repr_str @PIPELINES.register_module() class RandomCrop(object): """Random crop the image & seg. Args: crop_size (tuple): Expected size after cropping, (h, w). cat_max_ratio (float): The maximum ratio that single category could occupy. """ def __init__(self, crop_size, cat_max_ratio=1., ignore_index=255): assert crop_size[0] > 0 and crop_size[1] > 0 self.crop_size = crop_size self.cat_max_ratio = cat_max_ratio self.ignore_index = ignore_index def get_crop_bbox(self, img): """Randomly get a crop bounding box.""" margin_h = max(img.shape[0] - self.crop_size[0], 0) margin_w = max(img.shape[1] - self.crop_size[1], 0) offset_h = np.random.randint(0, margin_h + 1) offset_w = np.random.randint(0, margin_w + 1) crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] return crop_y1, crop_y2, crop_x1, crop_x2 def crop(self, img, crop_bbox): """Crop from ``img``""" crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] return img def __call__(self, results): """Call function to randomly crop images, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Randomly cropped results, 'img_shape' key in result dict is updated according to crop size. """ img = results['img'] crop_bbox = self.get_crop_bbox(img) if self.cat_max_ratio < 1.: # Repeat 10 times for _ in range(10): seg_temp = self.crop(results['gt_semantic_seg'], crop_bbox) labels, cnt = np.unique(seg_temp, return_counts=True) cnt = cnt[labels != self.ignore_index] if len(cnt) > 1 and np.max(cnt) / np.sum( cnt) < self.cat_max_ratio: break crop_bbox = self.get_crop_bbox(img) # crop the image img = self.crop(img, crop_bbox) img_shape = img.shape results['img'] = img results['img_shape'] = img_shape # crop semantic seg for key in results.get('seg_fields', []): results[key] = self.crop(results[key], crop_bbox) return results def __repr__(self): return self.__class__.__name__ + f'(crop_size={self.crop_size})' @PIPELINES.register_module() class RandomRotate(object): """Rotate the image & seg. Args: prob (float): The rotation probability. degree (float, tuple[float]): Range of degrees to select from. If degree is a number instead of tuple like (min, max), the range of degree will be (``-degree``, ``+degree``) pad_val (float, optional): Padding value of image. Default: 0. seg_pad_val (float, optional): Padding value of segmentation map. Default: 255. center (tuple[float], optional): Center point (w, h) of the rotation in the source image. If not specified, the center of the image will be used. Default: None. auto_bound (bool): Whether to adjust the image size to cover the whole rotated image. Default: False """ def __init__(self, prob, degree, pad_val=0, seg_pad_val=255, center=None, auto_bound=False): self.prob = prob assert prob >= 0 and prob <= 1 if isinstance(degree, (float, int)): assert degree > 0, f'degree {degree} should be positive' self.degree = (-degree, degree) else: self.degree = degree assert len(self.degree) == 2, f'degree {self.degree} should be a ' \ f'tuple of (min, max)' self.pal_val = pad_val self.seg_pad_val = seg_pad_val self.center = center self.auto_bound = auto_bound def __call__(self, results): """Call function to rotate image, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Rotated results. """ rotate = True if np.random.rand() < self.prob else False degree = np.random.uniform(min(*self.degree), max(*self.degree)) if rotate: # rotate image results['img'] = mmcv.imrotate( results['img'], angle=degree, border_value=self.pal_val, center=self.center, auto_bound=self.auto_bound) # rotate segs for key in results.get('seg_fields', []): results[key] = mmcv.imrotate( results[key], angle=degree, border_value=self.seg_pad_val, center=self.center, auto_bound=self.auto_bound, interpolation='nearest') return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(prob={self.prob}, ' \ f'degree={self.degree}, ' \ f'pad_val={self.pal_val}, ' \ f'seg_pad_val={self.seg_pad_val}, ' \ f'center={self.center}, ' \ f'auto_bound={self.auto_bound})' return repr_str @PIPELINES.register_module() class RGB2Gray(object): """Convert RGB image to grayscale image. This transform calculate the weighted mean of input image channels with ``weights`` and then expand the channels to ``out_channels``. When ``out_channels`` is None, the number of output channels is the same as input channels. Args: out_channels (int): Expected number of output channels after transforming. Default: None. weights (tuple[float]): The weights to calculate the weighted mean. Default: (0.299, 0.587, 0.114). """ def __init__(self, out_channels=None, weights=(0.299, 0.587, 0.114)): assert out_channels is None or out_channels > 0 self.out_channels = out_channels assert isinstance(weights, tuple) for item in weights: assert isinstance(item, (float, int)) self.weights = weights def __call__(self, results): """Call function to convert RGB image to grayscale image. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with grayscale image. """ img = results['img'] assert len(img.shape) == 3 assert img.shape[2] == len(self.weights) weights = np.array(self.weights).reshape((1, 1, -1)) img = (img * weights).sum(2, keepdims=True) if self.out_channels is None: img = img.repeat(weights.shape[2], axis=2) else: img = img.repeat(self.out_channels, axis=2) results['img'] = img results['img_shape'] = img.shape return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(out_channels={self.out_channels}, ' \ f'weights={self.weights})' return repr_str @PIPELINES.register_module() class AdjustGamma(object): """Using gamma correction to process the image. Args: gamma (float or int): Gamma value used in gamma correction. Default: 1.0. """ def __init__(self, gamma=1.0): assert isinstance(gamma, float) or isinstance(gamma, int) assert gamma > 0 self.gamma = gamma inv_gamma = 1.0 / gamma self.table = np.array([(i / 255.0)**inv_gamma * 255 for i in np.arange(256)]).astype('uint8') def __call__(self, results): """Call function to process the image with gamma correction. Args: results (dict): Result dict from loading pipeline. Returns: dict: Processed results. """ results['img'] = mmcv.lut_transform( np.array(results['img'], dtype=np.uint8), self.table) return results def __repr__(self): return self.__class__.__name__ + f'(gamma={self.gamma})' @PIPELINES.register_module() class SegRescale(object): """Rescale semantic segmentation maps. Args: scale_factor (float): The scale factor of the final output. """ def __init__(self, scale_factor=1): self.scale_factor = scale_factor def __call__(self, results): """Call function to scale the semantic segmentation map. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with semantic segmentation map scaled. """ for key in results.get('seg_fields', []): if self.scale_factor != 1: results[key] = mmcv.imrescale( results[key], self.scale_factor, interpolation='nearest') return results def __repr__(self): return self.__class__.__name__ + f'(scale_factor={self.scale_factor})' @PIPELINES.register_module() class PhotoMetricDistortion(object): """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def convert(self, img, alpha=1, beta=0): """Multiple with alpha and add beat with clip.""" img = img.astype(np.float32) * alpha + beta img = np.clip(img, 0, 255) return img.astype(np.uint8) def brightness(self, img): """Brightness distortion.""" if random.randint(2): return self.convert( img, beta=random.uniform(-self.brightness_delta, self.brightness_delta)) return img def contrast(self, img): """Contrast distortion.""" if random.randint(2): return self.convert( img, alpha=random.uniform(self.contrast_lower, self.contrast_upper)) return img def saturation(self, img): """Saturation distortion.""" if random.randint(2): img = mmcv.bgr2hsv(img) img[:, :, 1] = self.convert( img[:, :, 1], alpha=random.uniform(self.saturation_lower, self.saturation_upper)) img = mmcv.hsv2bgr(img) return img def hue(self, img): """Hue distortion.""" if random.randint(2): img = mmcv.bgr2hsv(img) img[:, :, 0] = (img[:, :, 0].astype(int) + random.randint(-self.hue_delta, self.hue_delta)) % 180 img = mmcv.hsv2bgr(img) return img def __call__(self, results): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. """ img = results['img'] # random brightness img = self.brightness(img) # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = random.randint(2) if mode == 1: img = self.contrast(img) # random saturation img = self.saturation(img) # random hue img = self.hue(img) # random contrast if mode == 0: img = self.contrast(img) results['img'] = img return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += (f'(brightness_delta={self.brightness_delta}, ' f'contrast_range=({self.contrast_lower}, ' f'{self.contrast_upper}), ' f'saturation_range=({self.saturation_lower}, ' f'{self.saturation_upper}), ' f'hue_delta={self.hue_delta})') return repr_str ================================================ FILE: annotator/mmpkg/mmseg/datasets/stare.py ================================================ import os.path as osp from .builder import DATASETS from .custom import CustomDataset @DATASETS.register_module() class STAREDataset(CustomDataset): """STARE dataset. In segmentation map annotation for STARE, 0 stands for background, which is included in 2 categories. ``reduce_zero_label`` is fixed to False. The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to '.ah.png'. """ CLASSES = ('background', 'vessel') PALETTE = [[120, 120, 120], [6, 230, 230]] def __init__(self, **kwargs): super(STAREDataset, self).__init__( img_suffix='.png', seg_map_suffix='.ah.png', reduce_zero_label=False, **kwargs) assert osp.exists(self.img_dir) ================================================ FILE: annotator/mmpkg/mmseg/datasets/voc.py ================================================ import os.path as osp from .builder import DATASETS from .custom import CustomDataset @DATASETS.register_module() class PascalVOCDataset(CustomDataset): """Pascal VOC dataset. Args: split (str): Split txt file for Pascal VOC. """ CLASSES = ('background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor') PALETTE = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] def __init__(self, split, **kwargs): super(PascalVOCDataset, self).__init__( img_suffix='.jpg', seg_map_suffix='.png', split=split, **kwargs) assert osp.exists(self.img_dir) and self.split is not None ================================================ FILE: annotator/mmpkg/mmseg/models/__init__.py ================================================ from .backbones import * # noqa: F401,F403 from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, build_head, build_loss, build_segmentor) from .decode_heads import * # noqa: F401,F403 from .losses import * # noqa: F401,F403 from .necks import * # noqa: F401,F403 from .segmentors import * # noqa: F401,F403 __all__ = [ 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', 'build_head', 'build_loss', 'build_segmentor' ] ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/__init__.py ================================================ from .cgnet import CGNet # from .fast_scnn import FastSCNN from .hrnet import HRNet from .mobilenet_v2 import MobileNetV2 from .mobilenet_v3 import MobileNetV3 from .resnest import ResNeSt from .resnet import ResNet, ResNetV1c, ResNetV1d from .resnext import ResNeXt from .unet import UNet from .vit import VisionTransformer __all__ = [ 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', 'VisionTransformer' ] ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/cgnet.py ================================================ import torch import torch.nn as nn import torch.utils.checkpoint as cp from annotator.mmpkg.mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, constant_init, kaiming_init) from annotator.mmpkg.mmcv.runner import load_checkpoint from annotator.mmpkg.mmcv.utils.parrots_wrapper import _BatchNorm from annotator.mmpkg.mmseg.utils import get_root_logger from ..builder import BACKBONES class GlobalContextExtractor(nn.Module): """Global Context Extractor for CGNet. This class is employed to refine the joint feature of both local feature and surrounding context. Args: channel (int): Number of input feature channels. reduction (int): Reductions for global context extractor. Default: 16. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ def __init__(self, channel, reduction=16, with_cp=False): super(GlobalContextExtractor, self).__init__() self.channel = channel self.reduction = reduction assert reduction >= 1 and channel >= reduction self.with_cp = with_cp self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Sequential( nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), nn.Linear(channel // reduction, channel), nn.Sigmoid()) def forward(self, x): def _inner_forward(x): num_batch, num_channel = x.size()[:2] y = self.avg_pool(x).view(num_batch, num_channel) y = self.fc(y).view(num_batch, num_channel, 1, 1) return x * y if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) return out class ContextGuidedBlock(nn.Module): """Context Guided Block for CGNet. This class consists of four components: local feature extractor, surrounding feature extractor, joint feature extractor and global context extractor. Args: in_channels (int): Number of input feature channels. out_channels (int): Number of output feature channels. dilation (int): Dilation rate for surrounding context extractor. Default: 2. reduction (int): Reduction for global context extractor. Default: 16. skip_connect (bool): Add input to output or not. Default: True. downsample (bool): Downsample the input to 1/2 or not. Default: False. conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN', requires_grad=True). act_cfg (dict): Config dict for activation layer. Default: dict(type='PReLU'). with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ def __init__(self, in_channels, out_channels, dilation=2, reduction=16, skip_connect=True, downsample=False, conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True), act_cfg=dict(type='PReLU'), with_cp=False): super(ContextGuidedBlock, self).__init__() self.with_cp = with_cp self.downsample = downsample channels = out_channels if downsample else out_channels // 2 if 'type' in act_cfg and act_cfg['type'] == 'PReLU': act_cfg['num_parameters'] = channels kernel_size = 3 if downsample else 1 stride = 2 if downsample else 1 padding = (kernel_size - 1) // 2 self.conv1x1 = ConvModule( in_channels, channels, kernel_size, stride, padding, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) self.f_loc = build_conv_layer( conv_cfg, channels, channels, kernel_size=3, padding=1, groups=channels, bias=False) self.f_sur = build_conv_layer( conv_cfg, channels, channels, kernel_size=3, padding=dilation, groups=channels, dilation=dilation, bias=False) self.bn = build_norm_layer(norm_cfg, 2 * channels)[1] self.activate = nn.PReLU(2 * channels) if downsample: self.bottleneck = build_conv_layer( conv_cfg, 2 * channels, out_channels, kernel_size=1, bias=False) self.skip_connect = skip_connect and not downsample self.f_glo = GlobalContextExtractor(out_channels, reduction, with_cp) def forward(self, x): def _inner_forward(x): out = self.conv1x1(x) loc = self.f_loc(out) sur = self.f_sur(out) joi_feat = torch.cat([loc, sur], 1) # the joint feature joi_feat = self.bn(joi_feat) joi_feat = self.activate(joi_feat) if self.downsample: joi_feat = self.bottleneck(joi_feat) # channel = out_channels # f_glo is employed to refine the joint feature out = self.f_glo(joi_feat) if self.skip_connect: return x + out else: return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) return out class InputInjection(nn.Module): """Downsampling module for CGNet.""" def __init__(self, num_downsampling): super(InputInjection, self).__init__() self.pool = nn.ModuleList() for i in range(num_downsampling): self.pool.append(nn.AvgPool2d(3, stride=2, padding=1)) def forward(self, x): for pool in self.pool: x = pool(x) return x @BACKBONES.register_module() class CGNet(nn.Module): """CGNet backbone. A Light-weight Context Guided Network for Semantic Segmentation arXiv: https://arxiv.org/abs/1811.08201 Args: in_channels (int): Number of input image channels. Normally 3. num_channels (tuple[int]): Numbers of feature channels at each stages. Default: (32, 64, 128). num_blocks (tuple[int]): Numbers of CG blocks at stage 1 and stage 2. Default: (3, 21). dilations (tuple[int]): Dilation rate for surrounding context extractors at stage 1 and stage 2. Default: (2, 4). reductions (tuple[int]): Reductions for global context extractors at stage 1 and stage 2. Default: (8, 16). conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN', requires_grad=True). act_cfg (dict): Config dict for activation layer. Default: dict(type='PReLU'). norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ def __init__(self, in_channels=3, num_channels=(32, 64, 128), num_blocks=(3, 21), dilations=(2, 4), reductions=(8, 16), conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True), act_cfg=dict(type='PReLU'), norm_eval=False, with_cp=False): super(CGNet, self).__init__() self.in_channels = in_channels self.num_channels = num_channels assert isinstance(self.num_channels, tuple) and len( self.num_channels) == 3 self.num_blocks = num_blocks assert isinstance(self.num_blocks, tuple) and len(self.num_blocks) == 2 self.dilations = dilations assert isinstance(self.dilations, tuple) and len(self.dilations) == 2 self.reductions = reductions assert isinstance(self.reductions, tuple) and len(self.reductions) == 2 self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg if 'type' in self.act_cfg and self.act_cfg['type'] == 'PReLU': self.act_cfg['num_parameters'] = num_channels[0] self.norm_eval = norm_eval self.with_cp = with_cp cur_channels = in_channels self.stem = nn.ModuleList() for i in range(3): self.stem.append( ConvModule( cur_channels, num_channels[0], 3, 2 if i == 0 else 1, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) cur_channels = num_channels[0] self.inject_2x = InputInjection(1) # down-sample for Input, factor=2 self.inject_4x = InputInjection(2) # down-sample for Input, factor=4 cur_channels += in_channels self.norm_prelu_0 = nn.Sequential( build_norm_layer(norm_cfg, cur_channels)[1], nn.PReLU(cur_channels)) # stage 1 self.level1 = nn.ModuleList() for i in range(num_blocks[0]): self.level1.append( ContextGuidedBlock( cur_channels if i == 0 else num_channels[1], num_channels[1], dilations[0], reductions[0], downsample=(i == 0), conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, with_cp=with_cp)) # CG block cur_channels = 2 * num_channels[1] + in_channels self.norm_prelu_1 = nn.Sequential( build_norm_layer(norm_cfg, cur_channels)[1], nn.PReLU(cur_channels)) # stage 2 self.level2 = nn.ModuleList() for i in range(num_blocks[1]): self.level2.append( ContextGuidedBlock( cur_channels if i == 0 else num_channels[2], num_channels[2], dilations[1], reductions[1], downsample=(i == 0), conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, with_cp=with_cp)) # CG block cur_channels = 2 * num_channels[2] self.norm_prelu_2 = nn.Sequential( build_norm_layer(norm_cfg, cur_channels)[1], nn.PReLU(cur_channels)) def forward(self, x): output = [] # stage 0 inp_2x = self.inject_2x(x) inp_4x = self.inject_4x(x) for layer in self.stem: x = layer(x) x = self.norm_prelu_0(torch.cat([x, inp_2x], 1)) output.append(x) # stage 1 for i, layer in enumerate(self.level1): x = layer(x) if i == 0: down1 = x x = self.norm_prelu_1(torch.cat([x, down1, inp_4x], 1)) output.append(x) # stage 2 for i, layer in enumerate(self.level2): x = layer(x) if i == 0: down2 = x x = self.norm_prelu_2(torch.cat([down2, x], 1)) output.append(x) return output def init_weights(self, pretrained=None): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ if isinstance(pretrained, str): logger = get_root_logger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, (nn.Conv2d, nn.Linear)): kaiming_init(m) elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) elif isinstance(m, nn.PReLU): constant_init(m, 0) else: raise TypeError('pretrained must be a str or None') def train(self, mode=True): """Convert the model into training mode will keeping the normalization layer freezed.""" super(CGNet, self).train(mode) if mode and self.norm_eval: for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): m.eval() ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/fast_scnn.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, constant_init, kaiming_init) from torch.nn.modules.batchnorm import _BatchNorm from annotator.mmpkg.mmseg.models.decode_heads.psp_head import PPM from annotator.mmpkg.mmseg.ops import resize from ..builder import BACKBONES from ..utils.inverted_residual import InvertedResidual class LearningToDownsample(nn.Module): """Learning to downsample module. Args: in_channels (int): Number of input channels. dw_channels (tuple[int]): Number of output channels of the first and the second depthwise conv (dwconv) layers. out_channels (int): Number of output channels of the whole 'learning to downsample' module. conv_cfg (dict | None): Config of conv layers. Default: None norm_cfg (dict | None): Config of norm layers. Default: dict(type='BN') act_cfg (dict): Config of activation layers. Default: dict(type='ReLU') """ def __init__(self, in_channels, dw_channels, out_channels, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU')): super(LearningToDownsample, self).__init__() self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg dw_channels1 = dw_channels[0] dw_channels2 = dw_channels[1] self.conv = ConvModule( in_channels, dw_channels1, 3, stride=2, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.dsconv1 = DepthwiseSeparableConvModule( dw_channels1, dw_channels2, kernel_size=3, stride=2, padding=1, norm_cfg=self.norm_cfg) self.dsconv2 = DepthwiseSeparableConvModule( dw_channels2, out_channels, kernel_size=3, stride=2, padding=1, norm_cfg=self.norm_cfg) def forward(self, x): x = self.conv(x) x = self.dsconv1(x) x = self.dsconv2(x) return x class GlobalFeatureExtractor(nn.Module): """Global feature extractor module. Args: in_channels (int): Number of input channels of the GFE module. Default: 64 block_channels (tuple[int]): Tuple of ints. Each int specifies the number of output channels of each Inverted Residual module. Default: (64, 96, 128) out_channels(int): Number of output channels of the GFE module. Default: 128 expand_ratio (int): Adjusts number of channels of the hidden layer in InvertedResidual by this amount. Default: 6 num_blocks (tuple[int]): Tuple of ints. Each int specifies the number of times each Inverted Residual module is repeated. The repeated Inverted Residual modules are called a 'group'. Default: (3, 3, 3) strides (tuple[int]): Tuple of ints. Each int specifies the downsampling factor of each 'group'. Default: (2, 2, 1) pool_scales (tuple[int]): Tuple of ints. Each int specifies the parameter required in 'global average pooling' within PPM. Default: (1, 2, 3, 6) conv_cfg (dict | None): Config of conv layers. Default: None norm_cfg (dict | None): Config of norm layers. Default: dict(type='BN') act_cfg (dict): Config of activation layers. Default: dict(type='ReLU') align_corners (bool): align_corners argument of F.interpolate. Default: False """ def __init__(self, in_channels=64, block_channels=(64, 96, 128), out_channels=128, expand_ratio=6, num_blocks=(3, 3, 3), strides=(2, 2, 1), pool_scales=(1, 2, 3, 6), conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), align_corners=False): super(GlobalFeatureExtractor, self).__init__() self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg assert len(block_channels) == len(num_blocks) == 3 self.bottleneck1 = self._make_layer(in_channels, block_channels[0], num_blocks[0], strides[0], expand_ratio) self.bottleneck2 = self._make_layer(block_channels[0], block_channels[1], num_blocks[1], strides[1], expand_ratio) self.bottleneck3 = self._make_layer(block_channels[1], block_channels[2], num_blocks[2], strides[2], expand_ratio) self.ppm = PPM( pool_scales, block_channels[2], block_channels[2] // 4, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, align_corners=align_corners) self.out = ConvModule( block_channels[2] * 2, out_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def _make_layer(self, in_channels, out_channels, blocks, stride=1, expand_ratio=6): layers = [ InvertedResidual( in_channels, out_channels, stride, expand_ratio, norm_cfg=self.norm_cfg) ] for i in range(1, blocks): layers.append( InvertedResidual( out_channels, out_channels, 1, expand_ratio, norm_cfg=self.norm_cfg)) return nn.Sequential(*layers) def forward(self, x): x = self.bottleneck1(x) x = self.bottleneck2(x) x = self.bottleneck3(x) x = torch.cat([x, *self.ppm(x)], dim=1) x = self.out(x) return x class FeatureFusionModule(nn.Module): """Feature fusion module. Args: higher_in_channels (int): Number of input channels of the higher-resolution branch. lower_in_channels (int): Number of input channels of the lower-resolution branch. out_channels (int): Number of output channels. conv_cfg (dict | None): Config of conv layers. Default: None norm_cfg (dict | None): Config of norm layers. Default: dict(type='BN') act_cfg (dict): Config of activation layers. Default: dict(type='ReLU') align_corners (bool): align_corners argument of F.interpolate. Default: False """ def __init__(self, higher_in_channels, lower_in_channels, out_channels, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), align_corners=False): super(FeatureFusionModule, self).__init__() self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.align_corners = align_corners self.dwconv = ConvModule( lower_in_channels, out_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.conv_lower_res = ConvModule( out_channels, out_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=None) self.conv_higher_res = ConvModule( higher_in_channels, out_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=None) self.relu = nn.ReLU(True) def forward(self, higher_res_feature, lower_res_feature): lower_res_feature = resize( lower_res_feature, size=higher_res_feature.size()[2:], mode='bilinear', align_corners=self.align_corners) lower_res_feature = self.dwconv(lower_res_feature) lower_res_feature = self.conv_lower_res(lower_res_feature) higher_res_feature = self.conv_higher_res(higher_res_feature) out = higher_res_feature + lower_res_feature return self.relu(out) @BACKBONES.register_module() class FastSCNN(nn.Module): """Fast-SCNN Backbone. Args: in_channels (int): Number of input image channels. Default: 3. downsample_dw_channels (tuple[int]): Number of output channels after the first conv layer & the second conv layer in Learning-To-Downsample (LTD) module. Default: (32, 48). global_in_channels (int): Number of input channels of Global Feature Extractor(GFE). Equal to number of output channels of LTD. Default: 64. global_block_channels (tuple[int]): Tuple of integers that describe the output channels for each of the MobileNet-v2 bottleneck residual blocks in GFE. Default: (64, 96, 128). global_block_strides (tuple[int]): Tuple of integers that describe the strides (downsampling factors) for each of the MobileNet-v2 bottleneck residual blocks in GFE. Default: (2, 2, 1). global_out_channels (int): Number of output channels of GFE. Default: 128. higher_in_channels (int): Number of input channels of the higher resolution branch in FFM. Equal to global_in_channels. Default: 64. lower_in_channels (int): Number of input channels of the lower resolution branch in FFM. Equal to global_out_channels. Default: 128. fusion_out_channels (int): Number of output channels of FFM. Default: 128. out_indices (tuple): Tuple of indices of list [higher_res_features, lower_res_features, fusion_output]. Often set to (0,1,2) to enable aux. heads. Default: (0, 1, 2). conv_cfg (dict | None): Config of conv layers. Default: None norm_cfg (dict | None): Config of norm layers. Default: dict(type='BN') act_cfg (dict): Config of activation layers. Default: dict(type='ReLU') align_corners (bool): align_corners argument of F.interpolate. Default: False """ def __init__(self, in_channels=3, downsample_dw_channels=(32, 48), global_in_channels=64, global_block_channels=(64, 96, 128), global_block_strides=(2, 2, 1), global_out_channels=128, higher_in_channels=64, lower_in_channels=128, fusion_out_channels=128, out_indices=(0, 1, 2), conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), align_corners=False): super(FastSCNN, self).__init__() if global_in_channels != higher_in_channels: raise AssertionError('Global Input Channels must be the same \ with Higher Input Channels!') elif global_out_channels != lower_in_channels: raise AssertionError('Global Output Channels must be the same \ with Lower Input Channels!') self.in_channels = in_channels self.downsample_dw_channels1 = downsample_dw_channels[0] self.downsample_dw_channels2 = downsample_dw_channels[1] self.global_in_channels = global_in_channels self.global_block_channels = global_block_channels self.global_block_strides = global_block_strides self.global_out_channels = global_out_channels self.higher_in_channels = higher_in_channels self.lower_in_channels = lower_in_channels self.fusion_out_channels = fusion_out_channels self.out_indices = out_indices self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.align_corners = align_corners self.learning_to_downsample = LearningToDownsample( in_channels, downsample_dw_channels, global_in_channels, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.global_feature_extractor = GlobalFeatureExtractor( global_in_channels, global_block_channels, global_out_channels, strides=self.global_block_strides, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, align_corners=self.align_corners) self.feature_fusion = FeatureFusionModule( higher_in_channels, lower_in_channels, fusion_out_channels, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, align_corners=self.align_corners) def init_weights(self, pretrained=None): for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) def forward(self, x): higher_res_features = self.learning_to_downsample(x) lower_res_features = self.global_feature_extractor(higher_res_features) fusion_output = self.feature_fusion(higher_res_features, lower_res_features) outs = [higher_res_features, lower_res_features, fusion_output] outs = [outs[i] for i in self.out_indices] return tuple(outs) ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/hrnet.py ================================================ import torch.nn as nn from annotator.mmpkg.mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, kaiming_init) from annotator.mmpkg.mmcv.runner import load_checkpoint from annotator.mmpkg.mmcv.utils.parrots_wrapper import _BatchNorm from annotator.mmpkg.mmseg.ops import Upsample, resize from annotator.mmpkg.mmseg.utils import get_root_logger from ..builder import BACKBONES from .resnet import BasicBlock, Bottleneck class HRModule(nn.Module): """High-Resolution Module for HRNet. In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange is in this module. """ def __init__(self, num_branches, blocks, num_blocks, in_channels, num_channels, multiscale_output=True, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True)): super(HRModule, self).__init__() self._check_branches(num_branches, num_blocks, in_channels, num_channels) self.in_channels = in_channels self.num_branches = num_branches self.multiscale_output = multiscale_output self.norm_cfg = norm_cfg self.conv_cfg = conv_cfg self.with_cp = with_cp self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels) self.fuse_layers = self._make_fuse_layers() self.relu = nn.ReLU(inplace=False) def _check_branches(self, num_branches, num_blocks, in_channels, num_channels): """Check branches configuration.""" if num_branches != len(num_blocks): error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_BLOCKS(' \ f'{len(num_blocks)})' raise ValueError(error_msg) if num_branches != len(num_channels): error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_CHANNELS(' \ f'{len(num_channels)})' raise ValueError(error_msg) if num_branches != len(in_channels): error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_INCHANNELS(' \ f'{len(in_channels)})' raise ValueError(error_msg) def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1): """Build one branch.""" downsample = None if stride != 1 or \ self.in_channels[branch_index] != \ num_channels[branch_index] * block.expansion: downsample = nn.Sequential( build_conv_layer( self.conv_cfg, self.in_channels[branch_index], num_channels[branch_index] * block.expansion, kernel_size=1, stride=stride, bias=False), build_norm_layer(self.norm_cfg, num_channels[branch_index] * block.expansion)[1]) layers = [] layers.append( block( self.in_channels[branch_index], num_channels[branch_index], stride, downsample=downsample, with_cp=self.with_cp, norm_cfg=self.norm_cfg, conv_cfg=self.conv_cfg)) self.in_channels[branch_index] = \ num_channels[branch_index] * block.expansion for i in range(1, num_blocks[branch_index]): layers.append( block( self.in_channels[branch_index], num_channels[branch_index], with_cp=self.with_cp, norm_cfg=self.norm_cfg, conv_cfg=self.conv_cfg)) return nn.Sequential(*layers) def _make_branches(self, num_branches, block, num_blocks, num_channels): """Build multiple branch.""" branches = [] for i in range(num_branches): branches.append( self._make_one_branch(i, block, num_blocks, num_channels)) return nn.ModuleList(branches) def _make_fuse_layers(self): """Build fuse layer.""" if self.num_branches == 1: return None num_branches = self.num_branches in_channels = self.in_channels fuse_layers = [] num_out_branches = num_branches if self.multiscale_output else 1 for i in range(num_out_branches): fuse_layer = [] for j in range(num_branches): if j > i: fuse_layer.append( nn.Sequential( build_conv_layer( self.conv_cfg, in_channels[j], in_channels[i], kernel_size=1, stride=1, padding=0, bias=False), build_norm_layer(self.norm_cfg, in_channels[i])[1], # we set align_corners=False for HRNet Upsample( scale_factor=2**(j - i), mode='bilinear', align_corners=False))) elif j == i: fuse_layer.append(None) else: conv_downsamples = [] for k in range(i - j): if k == i - j - 1: conv_downsamples.append( nn.Sequential( build_conv_layer( self.conv_cfg, in_channels[j], in_channels[i], kernel_size=3, stride=2, padding=1, bias=False), build_norm_layer(self.norm_cfg, in_channels[i])[1])) else: conv_downsamples.append( nn.Sequential( build_conv_layer( self.conv_cfg, in_channels[j], in_channels[j], kernel_size=3, stride=2, padding=1, bias=False), build_norm_layer(self.norm_cfg, in_channels[j])[1], nn.ReLU(inplace=False))) fuse_layer.append(nn.Sequential(*conv_downsamples)) fuse_layers.append(nn.ModuleList(fuse_layer)) return nn.ModuleList(fuse_layers) def forward(self, x): """Forward function.""" if self.num_branches == 1: return [self.branches[0](x[0])] for i in range(self.num_branches): x[i] = self.branches[i](x[i]) x_fuse = [] for i in range(len(self.fuse_layers)): y = 0 for j in range(self.num_branches): if i == j: y += x[j] elif j > i: y = y + resize( self.fuse_layers[i][j](x[j]), size=x[i].shape[2:], mode='bilinear', align_corners=False) else: y += self.fuse_layers[i][j](x[j]) x_fuse.append(self.relu(y)) return x_fuse @BACKBONES.register_module() class HRNet(nn.Module): """HRNet backbone. High-Resolution Representations for Labeling Pixels and Regions arXiv: https://arxiv.org/abs/1904.04514 Args: extra (dict): detailed configuration for each stage of HRNet. in_channels (int): Number of input image channels. Normally 3. conv_cfg (dict): dictionary to construct and config conv layer. norm_cfg (dict): dictionary to construct and config norm layer. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. zero_init_residual (bool): whether to use zero init for last norm layer in resblocks to let them behave as identity. Example: >>> from annotator.mmpkg.mmseg.models import HRNet >>> import torch >>> extra = dict( >>> stage1=dict( >>> num_modules=1, >>> num_branches=1, >>> block='BOTTLENECK', >>> num_blocks=(4, ), >>> num_channels=(64, )), >>> stage2=dict( >>> num_modules=1, >>> num_branches=2, >>> block='BASIC', >>> num_blocks=(4, 4), >>> num_channels=(32, 64)), >>> stage3=dict( >>> num_modules=4, >>> num_branches=3, >>> block='BASIC', >>> num_blocks=(4, 4, 4), >>> num_channels=(32, 64, 128)), >>> stage4=dict( >>> num_modules=3, >>> num_branches=4, >>> block='BASIC', >>> num_blocks=(4, 4, 4, 4), >>> num_channels=(32, 64, 128, 256))) >>> self = HRNet(extra, in_channels=1) >>> self.eval() >>> inputs = torch.rand(1, 1, 32, 32) >>> level_outputs = self.forward(inputs) >>> for level_out in level_outputs: ... print(tuple(level_out.shape)) (1, 32, 8, 8) (1, 64, 4, 4) (1, 128, 2, 2) (1, 256, 1, 1) """ blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} def __init__(self, extra, in_channels=3, conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=False, with_cp=False, zero_init_residual=False): super(HRNet, self).__init__() self.extra = extra self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.norm_eval = norm_eval self.with_cp = with_cp self.zero_init_residual = zero_init_residual # stem net self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) self.conv1 = build_conv_layer( self.conv_cfg, in_channels, 64, kernel_size=3, stride=2, padding=1, bias=False) self.add_module(self.norm1_name, norm1) self.conv2 = build_conv_layer( self.conv_cfg, 64, 64, kernel_size=3, stride=2, padding=1, bias=False) self.add_module(self.norm2_name, norm2) self.relu = nn.ReLU(inplace=True) # stage 1 self.stage1_cfg = self.extra['stage1'] num_channels = self.stage1_cfg['num_channels'][0] block_type = self.stage1_cfg['block'] num_blocks = self.stage1_cfg['num_blocks'][0] block = self.blocks_dict[block_type] stage1_out_channels = num_channels * block.expansion self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) # stage 2 self.stage2_cfg = self.extra['stage2'] num_channels = self.stage2_cfg['num_channels'] block_type = self.stage2_cfg['block'] block = self.blocks_dict[block_type] num_channels = [channel * block.expansion for channel in num_channels] self.transition1 = self._make_transition_layer([stage1_out_channels], num_channels) self.stage2, pre_stage_channels = self._make_stage( self.stage2_cfg, num_channels) # stage 3 self.stage3_cfg = self.extra['stage3'] num_channels = self.stage3_cfg['num_channels'] block_type = self.stage3_cfg['block'] block = self.blocks_dict[block_type] num_channels = [channel * block.expansion for channel in num_channels] self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels) self.stage3, pre_stage_channels = self._make_stage( self.stage3_cfg, num_channels) # stage 4 self.stage4_cfg = self.extra['stage4'] num_channels = self.stage4_cfg['num_channels'] block_type = self.stage4_cfg['block'] block = self.blocks_dict[block_type] num_channels = [channel * block.expansion for channel in num_channels] self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels) self.stage4, pre_stage_channels = self._make_stage( self.stage4_cfg, num_channels) @property def norm1(self): """nn.Module: the normalization layer named "norm1" """ return getattr(self, self.norm1_name) @property def norm2(self): """nn.Module: the normalization layer named "norm2" """ return getattr(self, self.norm2_name) def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer): """Make transition layer.""" num_branches_cur = len(num_channels_cur_layer) num_branches_pre = len(num_channels_pre_layer) transition_layers = [] for i in range(num_branches_cur): if i < num_branches_pre: if num_channels_cur_layer[i] != num_channels_pre_layer[i]: transition_layers.append( nn.Sequential( build_conv_layer( self.conv_cfg, num_channels_pre_layer[i], num_channels_cur_layer[i], kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(self.norm_cfg, num_channels_cur_layer[i])[1], nn.ReLU(inplace=True))) else: transition_layers.append(None) else: conv_downsamples = [] for j in range(i + 1 - num_branches_pre): in_channels = num_channels_pre_layer[-1] out_channels = num_channels_cur_layer[i] \ if j == i - num_branches_pre else in_channels conv_downsamples.append( nn.Sequential( build_conv_layer( self.conv_cfg, in_channels, out_channels, kernel_size=3, stride=2, padding=1, bias=False), build_norm_layer(self.norm_cfg, out_channels)[1], nn.ReLU(inplace=True))) transition_layers.append(nn.Sequential(*conv_downsamples)) return nn.ModuleList(transition_layers) def _make_layer(self, block, inplanes, planes, blocks, stride=1): """Make each layer.""" downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( build_conv_layer( self.conv_cfg, inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) layers = [] layers.append( block( inplanes, planes, stride, downsample=downsample, with_cp=self.with_cp, norm_cfg=self.norm_cfg, conv_cfg=self.conv_cfg)) inplanes = planes * block.expansion for i in range(1, blocks): layers.append( block( inplanes, planes, with_cp=self.with_cp, norm_cfg=self.norm_cfg, conv_cfg=self.conv_cfg)) return nn.Sequential(*layers) def _make_stage(self, layer_config, in_channels, multiscale_output=True): """Make each stage.""" num_modules = layer_config['num_modules'] num_branches = layer_config['num_branches'] num_blocks = layer_config['num_blocks'] num_channels = layer_config['num_channels'] block = self.blocks_dict[layer_config['block']] hr_modules = [] for i in range(num_modules): # multi_scale_output is only used for the last module if not multiscale_output and i == num_modules - 1: reset_multiscale_output = False else: reset_multiscale_output = True hr_modules.append( HRModule( num_branches, block, num_blocks, in_channels, num_channels, reset_multiscale_output, with_cp=self.with_cp, norm_cfg=self.norm_cfg, conv_cfg=self.conv_cfg)) return nn.Sequential(*hr_modules), in_channels def init_weights(self, pretrained=None): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ if isinstance(pretrained, str): logger = get_root_logger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) if self.zero_init_residual: for m in self.modules(): if isinstance(m, Bottleneck): constant_init(m.norm3, 0) elif isinstance(m, BasicBlock): constant_init(m.norm2, 0) else: raise TypeError('pretrained must be a str or None') def forward(self, x): """Forward function.""" x = self.conv1(x) x = self.norm1(x) x = self.relu(x) x = self.conv2(x) x = self.norm2(x) x = self.relu(x) x = self.layer1(x) x_list = [] for i in range(self.stage2_cfg['num_branches']): if self.transition1[i] is not None: x_list.append(self.transition1[i](x)) else: x_list.append(x) y_list = self.stage2(x_list) x_list = [] for i in range(self.stage3_cfg['num_branches']): if self.transition2[i] is not None: x_list.append(self.transition2[i](y_list[-1])) else: x_list.append(y_list[i]) y_list = self.stage3(x_list) x_list = [] for i in range(self.stage4_cfg['num_branches']): if self.transition3[i] is not None: x_list.append(self.transition3[i](y_list[-1])) else: x_list.append(y_list[i]) y_list = self.stage4(x_list) return y_list def train(self, mode=True): """Convert the model into training mode will keeping the normalization layer freezed.""" super(HRNet, self).train(mode) if mode and self.norm_eval: for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): m.eval() ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/mobilenet_v2.py ================================================ import logging import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule, constant_init, kaiming_init from annotator.mmpkg.mmcv.runner import load_checkpoint from torch.nn.modules.batchnorm import _BatchNorm from ..builder import BACKBONES from ..utils import InvertedResidual, make_divisible @BACKBONES.register_module() class MobileNetV2(nn.Module): """MobileNetV2 backbone. Args: widen_factor (float): Width multiplier, multiply number of channels in each layer by this amount. Default: 1.0. strides (Sequence[int], optional): Strides of the first block of each layer. If not specified, default config in ``arch_setting`` will be used. dilations (Sequence[int]): Dilation of each layer. out_indices (None or Sequence[int]): Output from which stages. Default: (7, ). frozen_stages (int): Stages to be frozen (all param fixed). Default: -1, which means not freezing any parameters. conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU6'). norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ # Parameters to build layers. 3 parameters are needed to construct a # layer, from left to right: expand_ratio, channel, num_blocks. arch_settings = [[1, 16, 1], [6, 24, 2], [6, 32, 3], [6, 64, 4], [6, 96, 3], [6, 160, 3], [6, 320, 1]] def __init__(self, widen_factor=1., strides=(1, 2, 2, 2, 1, 2, 1), dilations=(1, 1, 1, 1, 1, 1, 1), out_indices=(1, 2, 4, 6), frozen_stages=-1, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU6'), norm_eval=False, with_cp=False): super(MobileNetV2, self).__init__() self.widen_factor = widen_factor self.strides = strides self.dilations = dilations assert len(strides) == len(dilations) == len(self.arch_settings) self.out_indices = out_indices for index in out_indices: if index not in range(0, 7): raise ValueError('the item in out_indices must in ' f'range(0, 8). But received {index}') if frozen_stages not in range(-1, 7): raise ValueError('frozen_stages must be in range(-1, 7). ' f'But received {frozen_stages}') self.out_indices = out_indices self.frozen_stages = frozen_stages self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.norm_eval = norm_eval self.with_cp = with_cp self.in_channels = make_divisible(32 * widen_factor, 8) self.conv1 = ConvModule( in_channels=3, out_channels=self.in_channels, kernel_size=3, stride=2, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.layers = [] for i, layer_cfg in enumerate(self.arch_settings): expand_ratio, channel, num_blocks = layer_cfg stride = self.strides[i] dilation = self.dilations[i] out_channels = make_divisible(channel * widen_factor, 8) inverted_res_layer = self.make_layer( out_channels=out_channels, num_blocks=num_blocks, stride=stride, dilation=dilation, expand_ratio=expand_ratio) layer_name = f'layer{i + 1}' self.add_module(layer_name, inverted_res_layer) self.layers.append(layer_name) def make_layer(self, out_channels, num_blocks, stride, dilation, expand_ratio): """Stack InvertedResidual blocks to build a layer for MobileNetV2. Args: out_channels (int): out_channels of block. num_blocks (int): Number of blocks. stride (int): Stride of the first block. dilation (int): Dilation of the first block. expand_ratio (int): Expand the number of channels of the hidden layer in InvertedResidual by this ratio. """ layers = [] for i in range(num_blocks): layers.append( InvertedResidual( self.in_channels, out_channels, stride if i == 0 else 1, expand_ratio=expand_ratio, dilation=dilation if i == 0 else 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, with_cp=self.with_cp)) self.in_channels = out_channels return nn.Sequential(*layers) def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) else: raise TypeError('pretrained must be a str or None') def forward(self, x): x = self.conv1(x) outs = [] for i, layer_name in enumerate(self.layers): layer = getattr(self, layer_name) x = layer(x) if i in self.out_indices: outs.append(x) if len(outs) == 1: return outs[0] else: return tuple(outs) def _freeze_stages(self): if self.frozen_stages >= 0: for param in self.conv1.parameters(): param.requires_grad = False for i in range(1, self.frozen_stages + 1): layer = getattr(self, f'layer{i}') layer.eval() for param in layer.parameters(): param.requires_grad = False def train(self, mode=True): super(MobileNetV2, self).train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): if isinstance(m, _BatchNorm): m.eval() ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/mobilenet_v3.py ================================================ import logging import annotator.mmpkg.mmcv as mmcv import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule, constant_init, kaiming_init from annotator.mmpkg.mmcv.cnn.bricks import Conv2dAdaptivePadding from annotator.mmpkg.mmcv.runner import load_checkpoint from torch.nn.modules.batchnorm import _BatchNorm from ..builder import BACKBONES from ..utils import InvertedResidualV3 as InvertedResidual @BACKBONES.register_module() class MobileNetV3(nn.Module): """MobileNetV3 backbone. This backbone is the improved implementation of `Searching for MobileNetV3 `_. Args: arch (str): Architecture of mobilnetv3, from {'small', 'large'}. Default: 'small'. conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). out_indices (tuple[int]): Output from which layer. Default: (0, 1, 12). frozen_stages (int): Stages to be frozen (all param fixed). Default: -1, which means not freezing any parameters. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ # Parameters to build each block: # [kernel size, mid channels, out channels, with_se, act type, stride] arch_settings = { 'small': [[3, 16, 16, True, 'ReLU', 2], # block0 layer1 os=4 [3, 72, 24, False, 'ReLU', 2], # block1 layer2 os=8 [3, 88, 24, False, 'ReLU', 1], [5, 96, 40, True, 'HSwish', 2], # block2 layer4 os=16 [5, 240, 40, True, 'HSwish', 1], [5, 240, 40, True, 'HSwish', 1], [5, 120, 48, True, 'HSwish', 1], # block3 layer7 os=16 [5, 144, 48, True, 'HSwish', 1], [5, 288, 96, True, 'HSwish', 2], # block4 layer9 os=32 [5, 576, 96, True, 'HSwish', 1], [5, 576, 96, True, 'HSwish', 1]], 'large': [[3, 16, 16, False, 'ReLU', 1], # block0 layer1 os=2 [3, 64, 24, False, 'ReLU', 2], # block1 layer2 os=4 [3, 72, 24, False, 'ReLU', 1], [5, 72, 40, True, 'ReLU', 2], # block2 layer4 os=8 [5, 120, 40, True, 'ReLU', 1], [5, 120, 40, True, 'ReLU', 1], [3, 240, 80, False, 'HSwish', 2], # block3 layer7 os=16 [3, 200, 80, False, 'HSwish', 1], [3, 184, 80, False, 'HSwish', 1], [3, 184, 80, False, 'HSwish', 1], [3, 480, 112, True, 'HSwish', 1], # block4 layer11 os=16 [3, 672, 112, True, 'HSwish', 1], [5, 672, 160, True, 'HSwish', 2], # block5 layer13 os=32 [5, 960, 160, True, 'HSwish', 1], [5, 960, 160, True, 'HSwish', 1]] } # yapf: disable def __init__(self, arch='small', conv_cfg=None, norm_cfg=dict(type='BN'), out_indices=(0, 1, 12), frozen_stages=-1, reduction_factor=1, norm_eval=False, with_cp=False): super(MobileNetV3, self).__init__() assert arch in self.arch_settings assert isinstance(reduction_factor, int) and reduction_factor > 0 assert mmcv.is_tuple_of(out_indices, int) for index in out_indices: if index not in range(0, len(self.arch_settings[arch]) + 2): raise ValueError( 'the item in out_indices must in ' f'range(0, {len(self.arch_settings[arch])+2}). ' f'But received {index}') if frozen_stages not in range(-1, len(self.arch_settings[arch]) + 2): raise ValueError('frozen_stages must be in range(-1, ' f'{len(self.arch_settings[arch])+2}). ' f'But received {frozen_stages}') self.arch = arch self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.out_indices = out_indices self.frozen_stages = frozen_stages self.reduction_factor = reduction_factor self.norm_eval = norm_eval self.with_cp = with_cp self.layers = self._make_layer() def _make_layer(self): layers = [] # build the first layer (layer0) in_channels = 16 layer = ConvModule( in_channels=3, out_channels=in_channels, kernel_size=3, stride=2, padding=1, conv_cfg=dict(type='Conv2dAdaptivePadding'), norm_cfg=self.norm_cfg, act_cfg=dict(type='HSwish')) self.add_module('layer0', layer) layers.append('layer0') layer_setting = self.arch_settings[self.arch] for i, params in enumerate(layer_setting): (kernel_size, mid_channels, out_channels, with_se, act, stride) = params if self.arch == 'large' and i >= 12 or self.arch == 'small' and \ i >= 8: mid_channels = mid_channels // self.reduction_factor out_channels = out_channels // self.reduction_factor if with_se: se_cfg = dict( channels=mid_channels, ratio=4, act_cfg=(dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, divisor=6.0))) else: se_cfg = None layer = InvertedResidual( in_channels=in_channels, out_channels=out_channels, mid_channels=mid_channels, kernel_size=kernel_size, stride=stride, se_cfg=se_cfg, with_expand_conv=(in_channels != mid_channels), conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=dict(type=act), with_cp=self.with_cp) in_channels = out_channels layer_name = 'layer{}'.format(i + 1) self.add_module(layer_name, layer) layers.append(layer_name) # build the last layer # block5 layer12 os=32 for small model # block6 layer16 os=32 for large model layer = ConvModule( in_channels=in_channels, out_channels=576 if self.arch == 'small' else 960, kernel_size=1, stride=1, dilation=4, padding=0, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=dict(type='HSwish')) layer_name = 'layer{}'.format(len(layer_setting) + 1) self.add_module(layer_name, layer) layers.append(layer_name) # next, convert backbone MobileNetV3 to a semantic segmentation version if self.arch == 'small': self.layer4.depthwise_conv.conv.stride = (1, 1) self.layer9.depthwise_conv.conv.stride = (1, 1) for i in range(4, len(layers)): layer = getattr(self, layers[i]) if isinstance(layer, InvertedResidual): modified_module = layer.depthwise_conv.conv else: modified_module = layer.conv if i < 9: modified_module.dilation = (2, 2) pad = 2 else: modified_module.dilation = (4, 4) pad = 4 if not isinstance(modified_module, Conv2dAdaptivePadding): # Adjust padding pad *= (modified_module.kernel_size[0] - 1) // 2 modified_module.padding = (pad, pad) else: self.layer7.depthwise_conv.conv.stride = (1, 1) self.layer13.depthwise_conv.conv.stride = (1, 1) for i in range(7, len(layers)): layer = getattr(self, layers[i]) if isinstance(layer, InvertedResidual): modified_module = layer.depthwise_conv.conv else: modified_module = layer.conv if i < 13: modified_module.dilation = (2, 2) pad = 2 else: modified_module.dilation = (4, 4) pad = 4 if not isinstance(modified_module, Conv2dAdaptivePadding): # Adjust padding pad *= (modified_module.kernel_size[0] - 1) // 2 modified_module.padding = (pad, pad) return layers def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = logging.getLogger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, nn.BatchNorm2d): constant_init(m, 1) else: raise TypeError('pretrained must be a str or None') def forward(self, x): outs = [] for i, layer_name in enumerate(self.layers): layer = getattr(self, layer_name) x = layer(x) if i in self.out_indices: outs.append(x) return outs def _freeze_stages(self): for i in range(self.frozen_stages + 1): layer = getattr(self, f'layer{i}') layer.eval() for param in layer.parameters(): param.requires_grad = False def train(self, mode=True): super(MobileNetV3, self).train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): if isinstance(m, _BatchNorm): m.eval() ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/resnest.py ================================================ import math import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as cp from annotator.mmpkg.mmcv.cnn import build_conv_layer, build_norm_layer from ..builder import BACKBONES from ..utils import ResLayer from .resnet import Bottleneck as _Bottleneck from .resnet import ResNetV1d class RSoftmax(nn.Module): """Radix Softmax module in ``SplitAttentionConv2d``. Args: radix (int): Radix of input. groups (int): Groups of input. """ def __init__(self, radix, groups): super().__init__() self.radix = radix self.groups = groups def forward(self, x): batch = x.size(0) if self.radix > 1: x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) x = F.softmax(x, dim=1) x = x.reshape(batch, -1) else: x = torch.sigmoid(x) return x class SplitAttentionConv2d(nn.Module): """Split-Attention Conv2d in ResNeSt. Args: in_channels (int): Same as nn.Conv2d. out_channels (int): Same as nn.Conv2d. kernel_size (int | tuple[int]): Same as nn.Conv2d. stride (int | tuple[int]): Same as nn.Conv2d. padding (int | tuple[int]): Same as nn.Conv2d. dilation (int | tuple[int]): Same as nn.Conv2d. groups (int): Same as nn.Conv2d. radix (int): Radix of SpltAtConv2d. Default: 2 reduction_factor (int): Reduction factor of inter_channels. Default: 4. conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: None. dcn (dict): Config dict for DCN. Default: None. """ def __init__(self, in_channels, channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, radix=2, reduction_factor=4, conv_cfg=None, norm_cfg=dict(type='BN'), dcn=None): super(SplitAttentionConv2d, self).__init__() inter_channels = max(in_channels * radix // reduction_factor, 32) self.radix = radix self.groups = groups self.channels = channels self.with_dcn = dcn is not None self.dcn = dcn fallback_on_stride = False if self.with_dcn: fallback_on_stride = self.dcn.pop('fallback_on_stride', False) if self.with_dcn and not fallback_on_stride: assert conv_cfg is None, 'conv_cfg must be None for DCN' conv_cfg = dcn self.conv = build_conv_layer( conv_cfg, in_channels, channels * radix, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups * radix, bias=False) self.norm0_name, norm0 = build_norm_layer( norm_cfg, channels * radix, postfix=0) self.add_module(self.norm0_name, norm0) self.relu = nn.ReLU(inplace=True) self.fc1 = build_conv_layer( None, channels, inter_channels, 1, groups=self.groups) self.norm1_name, norm1 = build_norm_layer( norm_cfg, inter_channels, postfix=1) self.add_module(self.norm1_name, norm1) self.fc2 = build_conv_layer( None, inter_channels, channels * radix, 1, groups=self.groups) self.rsoftmax = RSoftmax(radix, groups) @property def norm0(self): """nn.Module: the normalization layer named "norm0" """ return getattr(self, self.norm0_name) @property def norm1(self): """nn.Module: the normalization layer named "norm1" """ return getattr(self, self.norm1_name) def forward(self, x): x = self.conv(x) x = self.norm0(x) x = self.relu(x) batch, rchannel = x.shape[:2] batch = x.size(0) if self.radix > 1: splits = x.view(batch, self.radix, -1, *x.shape[2:]) gap = splits.sum(dim=1) else: gap = x gap = F.adaptive_avg_pool2d(gap, 1) gap = self.fc1(gap) gap = self.norm1(gap) gap = self.relu(gap) atten = self.fc2(gap) atten = self.rsoftmax(atten).view(batch, -1, 1, 1) if self.radix > 1: attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) out = torch.sum(attens * splits, dim=1) else: out = atten * x return out.contiguous() class Bottleneck(_Bottleneck): """Bottleneck block for ResNeSt. Args: inplane (int): Input planes of this block. planes (int): Middle planes of this block. groups (int): Groups of conv2. width_per_group (int): Width per group of conv2. 64x4d indicates ``groups=64, width_per_group=4`` and 32x8d indicates ``groups=32, width_per_group=8``. radix (int): Radix of SpltAtConv2d. Default: 2 reduction_factor (int): Reduction factor of inter_channels in SplitAttentionConv2d. Default: 4. avg_down_stride (bool): Whether to use average pool for stride in Bottleneck. Default: True. kwargs (dict): Key word arguments for base class. """ expansion = 4 def __init__(self, inplanes, planes, groups=1, base_width=4, base_channels=64, radix=2, reduction_factor=4, avg_down_stride=True, **kwargs): """Bottleneck block for ResNeSt.""" super(Bottleneck, self).__init__(inplanes, planes, **kwargs) if groups == 1: width = self.planes else: width = math.floor(self.planes * (base_width / base_channels)) * groups self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 self.norm1_name, norm1 = build_norm_layer( self.norm_cfg, width, postfix=1) self.norm3_name, norm3 = build_norm_layer( self.norm_cfg, self.planes * self.expansion, postfix=3) self.conv1 = build_conv_layer( self.conv_cfg, self.inplanes, width, kernel_size=1, stride=self.conv1_stride, bias=False) self.add_module(self.norm1_name, norm1) self.with_modulated_dcn = False self.conv2 = SplitAttentionConv2d( width, width, kernel_size=3, stride=1 if self.avg_down_stride else self.conv2_stride, padding=self.dilation, dilation=self.dilation, groups=groups, radix=radix, reduction_factor=reduction_factor, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, dcn=self.dcn) delattr(self, self.norm2_name) if self.avg_down_stride: self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) self.conv3 = build_conv_layer( self.conv_cfg, width, self.planes * self.expansion, kernel_size=1, bias=False) self.add_module(self.norm3_name, norm3) def forward(self, x): def _inner_forward(x): identity = x out = self.conv1(x) out = self.norm1(out) out = self.relu(out) if self.with_plugins: out = self.forward_plugin(out, self.after_conv1_plugin_names) out = self.conv2(out) if self.avg_down_stride: out = self.avd_layer(out) if self.with_plugins: out = self.forward_plugin(out, self.after_conv2_plugin_names) out = self.conv3(out) out = self.norm3(out) if self.with_plugins: out = self.forward_plugin(out, self.after_conv3_plugin_names) if self.downsample is not None: identity = self.downsample(x) out += identity return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) out = self.relu(out) return out @BACKBONES.register_module() class ResNeSt(ResNetV1d): """ResNeSt backbone. Args: groups (int): Number of groups of Bottleneck. Default: 1 base_width (int): Base width of Bottleneck. Default: 4 radix (int): Radix of SpltAtConv2d. Default: 2 reduction_factor (int): Reduction factor of inter_channels in SplitAttentionConv2d. Default: 4. avg_down_stride (bool): Whether to use average pool for stride in Bottleneck. Default: True. kwargs (dict): Keyword arguments for ResNet. """ arch_settings = { 50: (Bottleneck, (3, 4, 6, 3)), 101: (Bottleneck, (3, 4, 23, 3)), 152: (Bottleneck, (3, 8, 36, 3)), 200: (Bottleneck, (3, 24, 36, 3)) } def __init__(self, groups=1, base_width=4, radix=2, reduction_factor=4, avg_down_stride=True, **kwargs): self.groups = groups self.base_width = base_width self.radix = radix self.reduction_factor = reduction_factor self.avg_down_stride = avg_down_stride super(ResNeSt, self).__init__(**kwargs) def make_res_layer(self, **kwargs): """Pack all blocks in a stage into a ``ResLayer``.""" return ResLayer( groups=self.groups, base_width=self.base_width, base_channels=self.base_channels, radix=self.radix, reduction_factor=self.reduction_factor, avg_down_stride=self.avg_down_stride, **kwargs) ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/resnet.py ================================================ import torch.nn as nn import torch.utils.checkpoint as cp from annotator.mmpkg.mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer, constant_init, kaiming_init) from annotator.mmpkg.mmcv.runner import load_checkpoint from annotator.mmpkg.mmcv.utils.parrots_wrapper import _BatchNorm from annotator.mmpkg.mmseg.utils import get_root_logger from ..builder import BACKBONES from ..utils import ResLayer class BasicBlock(nn.Module): """Basic block for ResNet.""" expansion = 1 def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, style='pytorch', with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), dcn=None, plugins=None): super(BasicBlock, self).__init__() assert dcn is None, 'Not implemented yet.' assert plugins is None, 'Not implemented yet.' self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) self.conv1 = build_conv_layer( conv_cfg, inplanes, planes, 3, stride=stride, padding=dilation, dilation=dilation, bias=False) self.add_module(self.norm1_name, norm1) self.conv2 = build_conv_layer( conv_cfg, planes, planes, 3, padding=1, bias=False) self.add_module(self.norm2_name, norm2) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride self.dilation = dilation self.with_cp = with_cp @property def norm1(self): """nn.Module: normalization layer after the first convolution layer""" return getattr(self, self.norm1_name) @property def norm2(self): """nn.Module: normalization layer after the second convolution layer""" return getattr(self, self.norm2_name) def forward(self, x): """Forward function.""" def _inner_forward(x): identity = x out = self.conv1(x) out = self.norm1(out) out = self.relu(out) out = self.conv2(out) out = self.norm2(out) if self.downsample is not None: identity = self.downsample(x) out += identity return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) out = self.relu(out) return out class Bottleneck(nn.Module): """Bottleneck block for ResNet. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. """ expansion = 4 def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, style='pytorch', with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), dcn=None, plugins=None): super(Bottleneck, self).__init__() assert style in ['pytorch', 'caffe'] assert dcn is None or isinstance(dcn, dict) assert plugins is None or isinstance(plugins, list) if plugins is not None: allowed_position = ['after_conv1', 'after_conv2', 'after_conv3'] assert all(p['position'] in allowed_position for p in plugins) self.inplanes = inplanes self.planes = planes self.stride = stride self.dilation = dilation self.style = style self.with_cp = with_cp self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.dcn = dcn self.with_dcn = dcn is not None self.plugins = plugins self.with_plugins = plugins is not None if self.with_plugins: # collect plugins for conv1/conv2/conv3 self.after_conv1_plugins = [ plugin['cfg'] for plugin in plugins if plugin['position'] == 'after_conv1' ] self.after_conv2_plugins = [ plugin['cfg'] for plugin in plugins if plugin['position'] == 'after_conv2' ] self.after_conv3_plugins = [ plugin['cfg'] for plugin in plugins if plugin['position'] == 'after_conv3' ] if self.style == 'pytorch': self.conv1_stride = 1 self.conv2_stride = stride else: self.conv1_stride = stride self.conv2_stride = 1 self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) self.norm3_name, norm3 = build_norm_layer( norm_cfg, planes * self.expansion, postfix=3) self.conv1 = build_conv_layer( conv_cfg, inplanes, planes, kernel_size=1, stride=self.conv1_stride, bias=False) self.add_module(self.norm1_name, norm1) fallback_on_stride = False if self.with_dcn: fallback_on_stride = dcn.pop('fallback_on_stride', False) if not self.with_dcn or fallback_on_stride: self.conv2 = build_conv_layer( conv_cfg, planes, planes, kernel_size=3, stride=self.conv2_stride, padding=dilation, dilation=dilation, bias=False) else: assert self.conv_cfg is None, 'conv_cfg must be None for DCN' self.conv2 = build_conv_layer( dcn, planes, planes, kernel_size=3, stride=self.conv2_stride, padding=dilation, dilation=dilation, bias=False) self.add_module(self.norm2_name, norm2) self.conv3 = build_conv_layer( conv_cfg, planes, planes * self.expansion, kernel_size=1, bias=False) self.add_module(self.norm3_name, norm3) self.relu = nn.ReLU(inplace=True) self.downsample = downsample if self.with_plugins: self.after_conv1_plugin_names = self.make_block_plugins( planes, self.after_conv1_plugins) self.after_conv2_plugin_names = self.make_block_plugins( planes, self.after_conv2_plugins) self.after_conv3_plugin_names = self.make_block_plugins( planes * self.expansion, self.after_conv3_plugins) def make_block_plugins(self, in_channels, plugins): """make plugins for block. Args: in_channels (int): Input channels of plugin. plugins (list[dict]): List of plugins cfg to build. Returns: list[str]: List of the names of plugin. """ assert isinstance(plugins, list) plugin_names = [] for plugin in plugins: plugin = plugin.copy() name, layer = build_plugin_layer( plugin, in_channels=in_channels, postfix=plugin.pop('postfix', '')) assert not hasattr(self, name), f'duplicate plugin {name}' self.add_module(name, layer) plugin_names.append(name) return plugin_names def forward_plugin(self, x, plugin_names): """Forward function for plugins.""" out = x for name in plugin_names: out = getattr(self, name)(x) return out @property def norm1(self): """nn.Module: normalization layer after the first convolution layer""" return getattr(self, self.norm1_name) @property def norm2(self): """nn.Module: normalization layer after the second convolution layer""" return getattr(self, self.norm2_name) @property def norm3(self): """nn.Module: normalization layer after the third convolution layer""" return getattr(self, self.norm3_name) def forward(self, x): """Forward function.""" def _inner_forward(x): identity = x out = self.conv1(x) out = self.norm1(out) out = self.relu(out) if self.with_plugins: out = self.forward_plugin(out, self.after_conv1_plugin_names) out = self.conv2(out) out = self.norm2(out) out = self.relu(out) if self.with_plugins: out = self.forward_plugin(out, self.after_conv2_plugin_names) out = self.conv3(out) out = self.norm3(out) if self.with_plugins: out = self.forward_plugin(out, self.after_conv3_plugin_names) if self.downsample is not None: identity = self.downsample(x) out += identity return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) out = self.relu(out) return out @BACKBONES.register_module() class ResNet(nn.Module): """ResNet backbone. Args: depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. in_channels (int): Number of input image channels. Default" 3. stem_channels (int): Number of stem channels. Default: 64. base_channels (int): Number of base channels of res layer. Default: 64. num_stages (int): Resnet stages, normally 4. strides (Sequence[int]): Strides of the first block of each stage. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two layer is the 3x3 conv layer, otherwise the stride-two layer is the first 1x1 conv layer. deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv avg_down (bool): Use AvgPool instead of stride conv when downsampling in the bottleneck. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. norm_cfg (dict): Dictionary to construct and config norm layer. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. plugins (list[dict]): List of plugins for stages, each dict contains: - cfg (dict, required): Cfg dict to build plugin. - position (str, required): Position inside block to insert plugin, options: 'after_conv1', 'after_conv2', 'after_conv3'. - stages (tuple[bool], optional): Stages to apply plugin, length should be same as 'num_stages' multi_grid (Sequence[int]|None): Multi grid dilation rates of last stage. Default: None contract_dilation (bool): Whether contract first dilation of each layer Default: False with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. zero_init_residual (bool): Whether to use zero init for last norm layer in resblocks to let them behave as identity. Example: >>> from annotator.mmpkg.mmseg.models import ResNet >>> import torch >>> self = ResNet(depth=18) >>> self.eval() >>> inputs = torch.rand(1, 3, 32, 32) >>> level_outputs = self.forward(inputs) >>> for level_out in level_outputs: ... print(tuple(level_out.shape)) (1, 64, 8, 8) (1, 128, 4, 4) (1, 256, 2, 2) (1, 512, 1, 1) """ arch_settings = { 18: (BasicBlock, (2, 2, 2, 2)), 34: (BasicBlock, (3, 4, 6, 3)), 50: (Bottleneck, (3, 4, 6, 3)), 101: (Bottleneck, (3, 4, 23, 3)), 152: (Bottleneck, (3, 8, 36, 3)) } def __init__(self, depth, in_channels=3, stem_channels=64, base_channels=64, num_stages=4, strides=(1, 2, 2, 2), dilations=(1, 1, 1, 1), out_indices=(0, 1, 2, 3), style='pytorch', deep_stem=False, avg_down=False, frozen_stages=-1, conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=False, dcn=None, stage_with_dcn=(False, False, False, False), plugins=None, multi_grid=None, contract_dilation=False, with_cp=False, zero_init_residual=True): super(ResNet, self).__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') self.depth = depth self.stem_channels = stem_channels self.base_channels = base_channels self.num_stages = num_stages assert num_stages >= 1 and num_stages <= 4 self.strides = strides self.dilations = dilations assert len(strides) == len(dilations) == num_stages self.out_indices = out_indices assert max(out_indices) < num_stages self.style = style self.deep_stem = deep_stem self.avg_down = avg_down self.frozen_stages = frozen_stages self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.with_cp = with_cp self.norm_eval = norm_eval self.dcn = dcn self.stage_with_dcn = stage_with_dcn if dcn is not None: assert len(stage_with_dcn) == num_stages self.plugins = plugins self.multi_grid = multi_grid self.contract_dilation = contract_dilation self.zero_init_residual = zero_init_residual self.block, stage_blocks = self.arch_settings[depth] self.stage_blocks = stage_blocks[:num_stages] self.inplanes = stem_channels self._make_stem_layer(in_channels, stem_channels) self.res_layers = [] for i, num_blocks in enumerate(self.stage_blocks): stride = strides[i] dilation = dilations[i] dcn = self.dcn if self.stage_with_dcn[i] else None if plugins is not None: stage_plugins = self.make_stage_plugins(plugins, i) else: stage_plugins = None # multi grid is applied to last layer only stage_multi_grid = multi_grid if i == len( self.stage_blocks) - 1 else None planes = base_channels * 2**i res_layer = self.make_res_layer( block=self.block, inplanes=self.inplanes, planes=planes, num_blocks=num_blocks, stride=stride, dilation=dilation, style=self.style, avg_down=self.avg_down, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, dcn=dcn, plugins=stage_plugins, multi_grid=stage_multi_grid, contract_dilation=contract_dilation) self.inplanes = planes * self.block.expansion layer_name = f'layer{i+1}' self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) self._freeze_stages() self.feat_dim = self.block.expansion * base_channels * 2**( len(self.stage_blocks) - 1) def make_stage_plugins(self, plugins, stage_idx): """make plugins for ResNet 'stage_idx'th stage . Currently we support to insert 'context_block', 'empirical_attention_block', 'nonlocal_block' into the backbone like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of Bottleneck. An example of plugins format could be : >>> plugins=[ ... dict(cfg=dict(type='xxx', arg1='xxx'), ... stages=(False, True, True, True), ... position='after_conv2'), ... dict(cfg=dict(type='yyy'), ... stages=(True, True, True, True), ... position='after_conv3'), ... dict(cfg=dict(type='zzz', postfix='1'), ... stages=(True, True, True, True), ... position='after_conv3'), ... dict(cfg=dict(type='zzz', postfix='2'), ... stages=(True, True, True, True), ... position='after_conv3') ... ] >>> self = ResNet(depth=18) >>> stage_plugins = self.make_stage_plugins(plugins, 0) >>> assert len(stage_plugins) == 3 Suppose 'stage_idx=0', the structure of blocks in the stage would be: conv1-> conv2->conv3->yyy->zzz1->zzz2 Suppose 'stage_idx=1', the structure of blocks in the stage would be: conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 If stages is missing, the plugin would be applied to all stages. Args: plugins (list[dict]): List of plugins cfg to build. The postfix is required if multiple same type plugins are inserted. stage_idx (int): Index of stage to build Returns: list[dict]: Plugins for current stage """ stage_plugins = [] for plugin in plugins: plugin = plugin.copy() stages = plugin.pop('stages', None) assert stages is None or len(stages) == self.num_stages # whether to insert plugin into current stage if stages is None or stages[stage_idx]: stage_plugins.append(plugin) return stage_plugins def make_res_layer(self, **kwargs): """Pack all blocks in a stage into a ``ResLayer``.""" return ResLayer(**kwargs) @property def norm1(self): """nn.Module: the normalization layer named "norm1" """ return getattr(self, self.norm1_name) def _make_stem_layer(self, in_channels, stem_channels): """Make stem layer for ResNet.""" if self.deep_stem: self.stem = nn.Sequential( build_conv_layer( self.conv_cfg, in_channels, stem_channels // 2, kernel_size=3, stride=2, padding=1, bias=False), build_norm_layer(self.norm_cfg, stem_channels // 2)[1], nn.ReLU(inplace=True), build_conv_layer( self.conv_cfg, stem_channels // 2, stem_channels // 2, kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(self.norm_cfg, stem_channels // 2)[1], nn.ReLU(inplace=True), build_conv_layer( self.conv_cfg, stem_channels // 2, stem_channels, kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(self.norm_cfg, stem_channels)[1], nn.ReLU(inplace=True)) else: self.conv1 = build_conv_layer( self.conv_cfg, in_channels, stem_channels, kernel_size=7, stride=2, padding=3, bias=False) self.norm1_name, norm1 = build_norm_layer( self.norm_cfg, stem_channels, postfix=1) self.add_module(self.norm1_name, norm1) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) def _freeze_stages(self): """Freeze stages param and norm stats.""" if self.frozen_stages >= 0: if self.deep_stem: self.stem.eval() for param in self.stem.parameters(): param.requires_grad = False else: self.norm1.eval() for m in [self.conv1, self.norm1]: for param in m.parameters(): param.requires_grad = False for i in range(1, self.frozen_stages + 1): m = getattr(self, f'layer{i}') m.eval() for param in m.parameters(): param.requires_grad = False def init_weights(self, pretrained=None): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ if isinstance(pretrained, str): logger = get_root_logger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) if self.dcn is not None: for m in self.modules(): if isinstance(m, Bottleneck) and hasattr( m, 'conv2_offset'): constant_init(m.conv2_offset, 0) if self.zero_init_residual: for m in self.modules(): if isinstance(m, Bottleneck): constant_init(m.norm3, 0) elif isinstance(m, BasicBlock): constant_init(m.norm2, 0) else: raise TypeError('pretrained must be a str or None') def forward(self, x): """Forward function.""" if self.deep_stem: x = self.stem(x) else: x = self.conv1(x) x = self.norm1(x) x = self.relu(x) x = self.maxpool(x) outs = [] for i, layer_name in enumerate(self.res_layers): res_layer = getattr(self, layer_name) x = res_layer(x) if i in self.out_indices: outs.append(x) return tuple(outs) def train(self, mode=True): """Convert the model into training mode while keep normalization layer freezed.""" super(ResNet, self).train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): m.eval() @BACKBONES.register_module() class ResNetV1c(ResNet): """ResNetV1c variant described in [1]_. Compared with default ResNet(ResNetV1b), ResNetV1c replaces the 7x7 conv in the input stem with three 3x3 convs. References: .. [1] https://arxiv.org/pdf/1812.01187.pdf """ def __init__(self, **kwargs): super(ResNetV1c, self).__init__( deep_stem=True, avg_down=False, **kwargs) @BACKBONES.register_module() class ResNetV1d(ResNet): """ResNetV1d variant described in [1]_. Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in the input stem with three 3x3 convs. And in the downsampling block, a 2x2 avg_pool with stride 2 is added before conv, whose stride is changed to 1. """ def __init__(self, **kwargs): super(ResNetV1d, self).__init__( deep_stem=True, avg_down=True, **kwargs) ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/resnext.py ================================================ import math from annotator.mmpkg.mmcv.cnn import build_conv_layer, build_norm_layer from ..builder import BACKBONES from ..utils import ResLayer from .resnet import Bottleneck as _Bottleneck from .resnet import ResNet class Bottleneck(_Bottleneck): """Bottleneck block for ResNeXt. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. """ def __init__(self, inplanes, planes, groups=1, base_width=4, base_channels=64, **kwargs): super(Bottleneck, self).__init__(inplanes, planes, **kwargs) if groups == 1: width = self.planes else: width = math.floor(self.planes * (base_width / base_channels)) * groups self.norm1_name, norm1 = build_norm_layer( self.norm_cfg, width, postfix=1) self.norm2_name, norm2 = build_norm_layer( self.norm_cfg, width, postfix=2) self.norm3_name, norm3 = build_norm_layer( self.norm_cfg, self.planes * self.expansion, postfix=3) self.conv1 = build_conv_layer( self.conv_cfg, self.inplanes, width, kernel_size=1, stride=self.conv1_stride, bias=False) self.add_module(self.norm1_name, norm1) fallback_on_stride = False self.with_modulated_dcn = False if self.with_dcn: fallback_on_stride = self.dcn.pop('fallback_on_stride', False) if not self.with_dcn or fallback_on_stride: self.conv2 = build_conv_layer( self.conv_cfg, width, width, kernel_size=3, stride=self.conv2_stride, padding=self.dilation, dilation=self.dilation, groups=groups, bias=False) else: assert self.conv_cfg is None, 'conv_cfg must be None for DCN' self.conv2 = build_conv_layer( self.dcn, width, width, kernel_size=3, stride=self.conv2_stride, padding=self.dilation, dilation=self.dilation, groups=groups, bias=False) self.add_module(self.norm2_name, norm2) self.conv3 = build_conv_layer( self.conv_cfg, width, self.planes * self.expansion, kernel_size=1, bias=False) self.add_module(self.norm3_name, norm3) @BACKBONES.register_module() class ResNeXt(ResNet): """ResNeXt backbone. Args: depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. in_channels (int): Number of input image channels. Normally 3. num_stages (int): Resnet stages, normally 4. groups (int): Group of resnext. base_width (int): Base width of resnext. strides (Sequence[int]): Strides of the first block of each stage. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two layer is the 3x3 conv layer, otherwise the stride-two layer is the first 1x1 conv layer. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. norm_cfg (dict): dictionary to construct and config norm layer. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. zero_init_residual (bool): whether to use zero init for last norm layer in resblocks to let them behave as identity. Example: >>> from annotator.mmpkg.mmseg.models import ResNeXt >>> import torch >>> self = ResNeXt(depth=50) >>> self.eval() >>> inputs = torch.rand(1, 3, 32, 32) >>> level_outputs = self.forward(inputs) >>> for level_out in level_outputs: ... print(tuple(level_out.shape)) (1, 256, 8, 8) (1, 512, 4, 4) (1, 1024, 2, 2) (1, 2048, 1, 1) """ arch_settings = { 50: (Bottleneck, (3, 4, 6, 3)), 101: (Bottleneck, (3, 4, 23, 3)), 152: (Bottleneck, (3, 8, 36, 3)) } def __init__(self, groups=1, base_width=4, **kwargs): self.groups = groups self.base_width = base_width super(ResNeXt, self).__init__(**kwargs) def make_res_layer(self, **kwargs): """Pack all blocks in a stage into a ``ResLayer``""" return ResLayer( groups=self.groups, base_width=self.base_width, base_channels=self.base_channels, **kwargs) ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/unet.py ================================================ import torch.nn as nn import torch.utils.checkpoint as cp from annotator.mmpkg.mmcv.cnn import (UPSAMPLE_LAYERS, ConvModule, build_activation_layer, build_norm_layer, constant_init, kaiming_init) from annotator.mmpkg.mmcv.runner import load_checkpoint from annotator.mmpkg.mmcv.utils.parrots_wrapper import _BatchNorm from annotator.mmpkg.mmseg.utils import get_root_logger from ..builder import BACKBONES from ..utils import UpConvBlock class BasicConvBlock(nn.Module): """Basic convolutional block for UNet. This module consists of several plain convolutional layers. Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. num_convs (int): Number of convolutional layers. Default: 2. stride (int): Whether use stride convolution to downsample the input feature map. If stride=2, it only uses stride convolution in the first convolutional layer to downsample the input feature map. Options are 1 or 2. Default: 1. dilation (int): Whether use dilated convolution to expand the receptive field. Set dilation rate of each convolutional layer and the dilation rate of the first convolutional layer is always 1. Default: 1. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. conv_cfg (dict | None): Config dict for convolution layer. Default: None. norm_cfg (dict | None): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict | None): Config dict for activation layer in ConvModule. Default: dict(type='ReLU'). dcn (bool): Use deformable convolution in convolutional layer or not. Default: None. plugins (dict): plugins for convolutional layers. Default: None. """ def __init__(self, in_channels, out_channels, num_convs=2, stride=1, dilation=1, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), dcn=None, plugins=None): super(BasicConvBlock, self).__init__() assert dcn is None, 'Not implemented yet.' assert plugins is None, 'Not implemented yet.' self.with_cp = with_cp convs = [] for i in range(num_convs): convs.append( ConvModule( in_channels=in_channels if i == 0 else out_channels, out_channels=out_channels, kernel_size=3, stride=stride if i == 0 else 1, dilation=1 if i == 0 else dilation, padding=1 if i == 0 else dilation, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) self.convs = nn.Sequential(*convs) def forward(self, x): """Forward function.""" if self.with_cp and x.requires_grad: out = cp.checkpoint(self.convs, x) else: out = self.convs(x) return out @UPSAMPLE_LAYERS.register_module() class DeconvModule(nn.Module): """Deconvolution upsample module in decoder for UNet (2X upsample). This module uses deconvolution to upsample feature map in the decoder of UNet. Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. norm_cfg (dict | None): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict | None): Config dict for activation layer in ConvModule. Default: dict(type='ReLU'). kernel_size (int): Kernel size of the convolutional layer. Default: 4. """ def __init__(self, in_channels, out_channels, with_cp=False, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), *, kernel_size=4, scale_factor=2): super(DeconvModule, self).__init__() assert (kernel_size - scale_factor >= 0) and\ (kernel_size - scale_factor) % 2 == 0,\ f'kernel_size should be greater than or equal to scale_factor '\ f'and (kernel_size - scale_factor) should be even numbers, '\ f'while the kernel size is {kernel_size} and scale_factor is '\ f'{scale_factor}.' stride = scale_factor padding = (kernel_size - scale_factor) // 2 self.with_cp = with_cp deconv = nn.ConvTranspose2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) norm_name, norm = build_norm_layer(norm_cfg, out_channels) activate = build_activation_layer(act_cfg) self.deconv_upsamping = nn.Sequential(deconv, norm, activate) def forward(self, x): """Forward function.""" if self.with_cp and x.requires_grad: out = cp.checkpoint(self.deconv_upsamping, x) else: out = self.deconv_upsamping(x) return out @UPSAMPLE_LAYERS.register_module() class InterpConv(nn.Module): """Interpolation upsample module in decoder for UNet. This module uses interpolation to upsample feature map in the decoder of UNet. It consists of one interpolation upsample layer and one convolutional layer. It can be one interpolation upsample layer followed by one convolutional layer (conv_first=False) or one convolutional layer followed by one interpolation upsample layer (conv_first=True). Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. norm_cfg (dict | None): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict | None): Config dict for activation layer in ConvModule. Default: dict(type='ReLU'). conv_cfg (dict | None): Config dict for convolution layer. Default: None. conv_first (bool): Whether convolutional layer or interpolation upsample layer first. Default: False. It means interpolation upsample layer followed by one convolutional layer. kernel_size (int): Kernel size of the convolutional layer. Default: 1. stride (int): Stride of the convolutional layer. Default: 1. padding (int): Padding of the convolutional layer. Default: 1. upsample_cfg (dict): Interpolation config of the upsample layer. Default: dict( scale_factor=2, mode='bilinear', align_corners=False). """ def __init__(self, in_channels, out_channels, with_cp=False, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), *, conv_cfg=None, conv_first=False, kernel_size=1, stride=1, padding=0, upsample_cfg=dict( scale_factor=2, mode='bilinear', align_corners=False)): super(InterpConv, self).__init__() self.with_cp = with_cp conv = ConvModule( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) upsample = nn.Upsample(**upsample_cfg) if conv_first: self.interp_upsample = nn.Sequential(conv, upsample) else: self.interp_upsample = nn.Sequential(upsample, conv) def forward(self, x): """Forward function.""" if self.with_cp and x.requires_grad: out = cp.checkpoint(self.interp_upsample, x) else: out = self.interp_upsample(x) return out @BACKBONES.register_module() class UNet(nn.Module): """UNet backbone. U-Net: Convolutional Networks for Biomedical Image Segmentation. https://arxiv.org/pdf/1505.04597.pdf Args: in_channels (int): Number of input image channels. Default" 3. base_channels (int): Number of base channels of each stage. The output channels of the first stage. Default: 64. num_stages (int): Number of stages in encoder, normally 5. Default: 5. strides (Sequence[int 1 | 2]): Strides of each stage in encoder. len(strides) is equal to num_stages. Normally the stride of the first stage in encoder is 1. If strides[i]=2, it uses stride convolution to downsample in the correspondence encoder stage. Default: (1, 1, 1, 1, 1). enc_num_convs (Sequence[int]): Number of convolutional layers in the convolution block of the correspondence encoder stage. Default: (2, 2, 2, 2, 2). dec_num_convs (Sequence[int]): Number of convolutional layers in the convolution block of the correspondence decoder stage. Default: (2, 2, 2, 2). downsamples (Sequence[int]): Whether use MaxPool to downsample the feature map after the first stage of encoder (stages: [1, num_stages)). If the correspondence encoder stage use stride convolution (strides[i]=2), it will never use MaxPool to downsample, even downsamples[i-1]=True. Default: (True, True, True, True). enc_dilations (Sequence[int]): Dilation rate of each stage in encoder. Default: (1, 1, 1, 1, 1). dec_dilations (Sequence[int]): Dilation rate of each stage in decoder. Default: (1, 1, 1, 1). with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. conv_cfg (dict | None): Config dict for convolution layer. Default: None. norm_cfg (dict | None): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict | None): Config dict for activation layer in ConvModule. Default: dict(type='ReLU'). upsample_cfg (dict): The upsample config of the upsample module in decoder. Default: dict(type='InterpConv'). norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. dcn (bool): Use deformable convolution in convolutional layer or not. Default: None. plugins (dict): plugins for convolutional layers. Default: None. Notice: The input image size should be divisible by the whole downsample rate of the encoder. More detail of the whole downsample rate can be found in UNet._check_input_divisible. """ def __init__(self, in_channels=3, base_channels=64, num_stages=5, strides=(1, 1, 1, 1, 1), enc_num_convs=(2, 2, 2, 2, 2), dec_num_convs=(2, 2, 2, 2), downsamples=(True, True, True, True), enc_dilations=(1, 1, 1, 1, 1), dec_dilations=(1, 1, 1, 1), with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), upsample_cfg=dict(type='InterpConv'), norm_eval=False, dcn=None, plugins=None): super(UNet, self).__init__() assert dcn is None, 'Not implemented yet.' assert plugins is None, 'Not implemented yet.' assert len(strides) == num_stages, \ 'The length of strides should be equal to num_stages, '\ f'while the strides is {strides}, the length of '\ f'strides is {len(strides)}, and the num_stages is '\ f'{num_stages}.' assert len(enc_num_convs) == num_stages, \ 'The length of enc_num_convs should be equal to num_stages, '\ f'while the enc_num_convs is {enc_num_convs}, the length of '\ f'enc_num_convs is {len(enc_num_convs)}, and the num_stages is '\ f'{num_stages}.' assert len(dec_num_convs) == (num_stages-1), \ 'The length of dec_num_convs should be equal to (num_stages-1), '\ f'while the dec_num_convs is {dec_num_convs}, the length of '\ f'dec_num_convs is {len(dec_num_convs)}, and the num_stages is '\ f'{num_stages}.' assert len(downsamples) == (num_stages-1), \ 'The length of downsamples should be equal to (num_stages-1), '\ f'while the downsamples is {downsamples}, the length of '\ f'downsamples is {len(downsamples)}, and the num_stages is '\ f'{num_stages}.' assert len(enc_dilations) == num_stages, \ 'The length of enc_dilations should be equal to num_stages, '\ f'while the enc_dilations is {enc_dilations}, the length of '\ f'enc_dilations is {len(enc_dilations)}, and the num_stages is '\ f'{num_stages}.' assert len(dec_dilations) == (num_stages-1), \ 'The length of dec_dilations should be equal to (num_stages-1), '\ f'while the dec_dilations is {dec_dilations}, the length of '\ f'dec_dilations is {len(dec_dilations)}, and the num_stages is '\ f'{num_stages}.' self.num_stages = num_stages self.strides = strides self.downsamples = downsamples self.norm_eval = norm_eval self.base_channels = base_channels self.encoder = nn.ModuleList() self.decoder = nn.ModuleList() for i in range(num_stages): enc_conv_block = [] if i != 0: if strides[i] == 1 and downsamples[i - 1]: enc_conv_block.append(nn.MaxPool2d(kernel_size=2)) upsample = (strides[i] != 1 or downsamples[i - 1]) self.decoder.append( UpConvBlock( conv_block=BasicConvBlock, in_channels=base_channels * 2**i, skip_channels=base_channels * 2**(i - 1), out_channels=base_channels * 2**(i - 1), num_convs=dec_num_convs[i - 1], stride=1, dilation=dec_dilations[i - 1], with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, upsample_cfg=upsample_cfg if upsample else None, dcn=None, plugins=None)) enc_conv_block.append( BasicConvBlock( in_channels=in_channels, out_channels=base_channels * 2**i, num_convs=enc_num_convs[i], stride=strides[i], dilation=enc_dilations[i], with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, dcn=None, plugins=None)) self.encoder.append((nn.Sequential(*enc_conv_block))) in_channels = base_channels * 2**i def forward(self, x): self._check_input_divisible(x) enc_outs = [] for enc in self.encoder: x = enc(x) enc_outs.append(x) dec_outs = [x] for i in reversed(range(len(self.decoder))): x = self.decoder[i](enc_outs[i], x) dec_outs.append(x) return dec_outs def train(self, mode=True): """Convert the model into training mode while keep normalization layer freezed.""" super(UNet, self).train(mode) if mode and self.norm_eval: for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): m.eval() def _check_input_divisible(self, x): h, w = x.shape[-2:] whole_downsample_rate = 1 for i in range(1, self.num_stages): if self.strides[i] == 2 or self.downsamples[i - 1]: whole_downsample_rate *= 2 assert (h % whole_downsample_rate == 0) \ and (w % whole_downsample_rate == 0),\ f'The input image size {(h, w)} should be divisible by the whole '\ f'downsample rate {whole_downsample_rate}, when num_stages is '\ f'{self.num_stages}, strides is {self.strides}, and downsamples '\ f'is {self.downsamples}.' def init_weights(self, pretrained=None): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ if isinstance(pretrained, str): logger = get_root_logger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) else: raise TypeError('pretrained must be a str or None') ================================================ FILE: annotator/mmpkg/mmseg/models/backbones/vit.py ================================================ """Modified from https://github.com/rwightman/pytorch-image- models/blob/master/timm/models/vision_transformer.py.""" import math import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as cp from annotator.mmpkg.mmcv.cnn import (Conv2d, Linear, build_activation_layer, build_norm_layer, constant_init, kaiming_init, normal_init) from annotator.mmpkg.mmcv.runner import _load_checkpoint from annotator.mmpkg.mmcv.utils.parrots_wrapper import _BatchNorm from annotator.mmpkg.mmseg.utils import get_root_logger from ..builder import BACKBONES from ..utils import DropPath, trunc_normal_ class Mlp(nn.Module): """MLP layer for Encoder block. Args: in_features(int): Input dimension for the first fully connected layer. hidden_features(int): Output dimension for the first fully connected layer. out_features(int): Output dementsion for the second fully connected layer. act_cfg(dict): Config dict for activation layer. Default: dict(type='GELU'). drop(float): Drop rate for the dropout layer. Dropout rate has to be between 0 and 1. Default: 0. """ def __init__(self, in_features, hidden_features=None, out_features=None, act_cfg=dict(type='GELU'), drop=0.): super(Mlp, self).__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = Linear(in_features, hidden_features) self.act = build_activation_layer(act_cfg) self.fc2 = Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x class Attention(nn.Module): """Attention layer for Encoder block. Args: dim (int): Dimension for the input vector. num_heads (int): Number of parallel attention heads. qkv_bias (bool): Enable bias for qkv if True. Default: False. qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. attn_drop (float): Drop rate for attention output weights. Default: 0. proj_drop (float): Drop rate for output weights. Default: 0. """ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super(Attention, self).__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x): b, n, c = x.shape qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q @ k.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(b, n, c) x = self.proj(x) x = self.proj_drop(x) return x class Block(nn.Module): """Implements encoder block with residual connection. Args: dim (int): The feature dimension. num_heads (int): Number of parallel attention heads. mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. drop (float): Drop rate for mlp output weights. Default: 0. attn_drop (float): Drop rate for attention output weights. Default: 0. proj_drop (float): Drop rate for attn layer output weights. Default: 0. drop_path (float): Drop rate for paths of model. Default: 0. act_cfg (dict): Config dict for activation layer. Default: dict(type='GELU'). norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN', requires_grad=True). with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ def __init__(self, dim, num_heads, mlp_ratio=4, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., proj_drop=0., drop_path=0., act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN', eps=1e-6), with_cp=False): super(Block, self).__init__() self.with_cp = with_cp _, self.norm1 = build_norm_layer(norm_cfg, dim) self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, attn_drop, proj_drop) self.drop_path = DropPath( drop_path) if drop_path > 0. else nn.Identity() _, self.norm2 = build_norm_layer(norm_cfg, dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_cfg=act_cfg, drop=drop) def forward(self, x): def _inner_forward(x): out = x + self.drop_path(self.attn(self.norm1(x))) out = out + self.drop_path(self.mlp(self.norm2(out))) return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) return out class PatchEmbed(nn.Module): """Image to Patch Embedding. Args: img_size (int | tuple): Input image size. default: 224. patch_size (int): Width and height for a patch. default: 16. in_channels (int): Input channels for images. Default: 3. embed_dim (int): The embedding dimension. Default: 768. """ def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768): super(PatchEmbed, self).__init__() if isinstance(img_size, int): self.img_size = (img_size, img_size) elif isinstance(img_size, tuple): self.img_size = img_size else: raise TypeError('img_size must be type of int or tuple') h, w = self.img_size self.patch_size = (patch_size, patch_size) self.num_patches = (h // patch_size) * (w // patch_size) self.proj = Conv2d( in_channels, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): return self.proj(x).flatten(2).transpose(1, 2) @BACKBONES.register_module() class VisionTransformer(nn.Module): """Vision transformer backbone. A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 Args: img_size (tuple): input image size. Default: (224, 224). patch_size (int, tuple): patch size. Default: 16. in_channels (int): number of input channels. Default: 3. embed_dim (int): embedding dimension. Default: 768. depth (int): depth of transformer. Default: 12. num_heads (int): number of attention heads. Default: 12. mlp_ratio (int): ratio of mlp hidden dim to embedding dim. Default: 4. out_indices (list | tuple | int): Output from which stages. Default: -1. qkv_bias (bool): enable bias for qkv if True. Default: True. qk_scale (float): override default qk scale of head_dim ** -0.5 if set. drop_rate (float): dropout rate. Default: 0. attn_drop_rate (float): attention dropout rate. Default: 0. drop_path_rate (float): Rate of DropPath. Default: 0. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN', eps=1e-6, requires_grad=True). act_cfg (dict): Config dict for activation layer. Default: dict(type='GELU'). norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. final_norm (bool): Whether to add a additional layer to normalize final feature map. Default: False. interpolate_mode (str): Select the interpolate mode for position embeding vector resize. Default: bicubic. with_cls_token (bool): If concatenating class token into image tokens as transformer input. Default: True. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ def __init__(self, img_size=(224, 224), patch_size=16, in_channels=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, out_indices=11, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_cfg=dict(type='LN', eps=1e-6, requires_grad=True), act_cfg=dict(type='GELU'), norm_eval=False, final_norm=False, with_cls_token=True, interpolate_mode='bicubic', with_cp=False): super(VisionTransformer, self).__init__() self.img_size = img_size self.patch_size = patch_size self.features = self.embed_dim = embed_dim self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim) self.with_cls_token = with_cls_token self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) self.pos_embed = nn.Parameter( torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim)) self.pos_drop = nn.Dropout(p=drop_rate) if isinstance(out_indices, int): self.out_indices = [out_indices] elif isinstance(out_indices, list) or isinstance(out_indices, tuple): self.out_indices = out_indices else: raise TypeError('out_indices must be type of int, list or tuple') dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) ] # stochastic depth decay rule self.blocks = nn.ModuleList([ Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=dpr[i], attn_drop=attn_drop_rate, act_cfg=act_cfg, norm_cfg=norm_cfg, with_cp=with_cp) for i in range(depth) ]) self.interpolate_mode = interpolate_mode self.final_norm = final_norm if final_norm: _, self.norm = build_norm_layer(norm_cfg, embed_dim) self.norm_eval = norm_eval self.with_cp = with_cp def init_weights(self, pretrained=None): if isinstance(pretrained, str): logger = get_root_logger() checkpoint = _load_checkpoint(pretrained, logger=logger) if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] else: state_dict = checkpoint if 'pos_embed' in state_dict.keys(): if self.pos_embed.shape != state_dict['pos_embed'].shape: logger.info(msg=f'Resize the pos_embed shape from \ {state_dict["pos_embed"].shape} to {self.pos_embed.shape}') h, w = self.img_size pos_size = int( math.sqrt(state_dict['pos_embed'].shape[1] - 1)) state_dict['pos_embed'] = self.resize_pos_embed( state_dict['pos_embed'], (h, w), (pos_size, pos_size), self.patch_size, self.interpolate_mode) self.load_state_dict(state_dict, False) elif pretrained is None: # We only implement the 'jax_impl' initialization implemented at # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 trunc_normal_(self.pos_embed, std=.02) trunc_normal_(self.cls_token, std=.02) for n, m in self.named_modules(): if isinstance(m, Linear): trunc_normal_(m.weight, std=.02) if m.bias is not None: if 'mlp' in n: normal_init(m.bias, std=1e-6) else: constant_init(m.bias, 0) elif isinstance(m, Conv2d): kaiming_init(m.weight, mode='fan_in') if m.bias is not None: constant_init(m.bias, 0) elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): constant_init(m.bias, 0) constant_init(m.weight, 1.0) else: raise TypeError('pretrained must be a str or None') def _pos_embeding(self, img, patched_img, pos_embed): """Positiong embeding method. Resize the pos_embed, if the input image size doesn't match the training size. Args: img (torch.Tensor): The inference image tensor, the shape must be [B, C, H, W]. patched_img (torch.Tensor): The patched image, it should be shape of [B, L1, C]. pos_embed (torch.Tensor): The pos_embed weighs, it should be shape of [B, L2, c]. Return: torch.Tensor: The pos encoded image feature. """ assert patched_img.ndim == 3 and pos_embed.ndim == 3, \ 'the shapes of patched_img and pos_embed must be [B, L, C]' x_len, pos_len = patched_img.shape[1], pos_embed.shape[1] if x_len != pos_len: if pos_len == (self.img_size[0] // self.patch_size) * ( self.img_size[1] // self.patch_size) + 1: pos_h = self.img_size[0] // self.patch_size pos_w = self.img_size[1] // self.patch_size else: raise ValueError( 'Unexpected shape of pos_embed, got {}.'.format( pos_embed.shape)) pos_embed = self.resize_pos_embed(pos_embed, img.shape[2:], (pos_h, pos_w), self.patch_size, self.interpolate_mode) return self.pos_drop(patched_img + pos_embed) @staticmethod def resize_pos_embed(pos_embed, input_shpae, pos_shape, patch_size, mode): """Resize pos_embed weights. Resize pos_embed using bicubic interpolate method. Args: pos_embed (torch.Tensor): pos_embed weights. input_shpae (tuple): Tuple for (input_h, intput_w). pos_shape (tuple): Tuple for (pos_h, pos_w). patch_size (int): Patch size. Return: torch.Tensor: The resized pos_embed of shape [B, L_new, C] """ assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' input_h, input_w = input_shpae pos_h, pos_w = pos_shape cls_token_weight = pos_embed[:, 0] pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] pos_embed_weight = pos_embed_weight.reshape( 1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2) pos_embed_weight = F.interpolate( pos_embed_weight, size=[input_h // patch_size, input_w // patch_size], align_corners=False, mode=mode) cls_token_weight = cls_token_weight.unsqueeze(1) pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2) pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1) return pos_embed def forward(self, inputs): B = inputs.shape[0] x = self.patch_embed(inputs) cls_tokens = self.cls_token.expand(B, -1, -1) x = torch.cat((cls_tokens, x), dim=1) x = self._pos_embeding(inputs, x, self.pos_embed) if not self.with_cls_token: # Remove class token for transformer input x = x[:, 1:] outs = [] for i, blk in enumerate(self.blocks): x = blk(x) if i == len(self.blocks) - 1: if self.final_norm: x = self.norm(x) if i in self.out_indices: if self.with_cls_token: # Remove class token and reshape token for decoder head out = x[:, 1:] else: out = x B, _, C = out.shape out = out.reshape(B, inputs.shape[2] // self.patch_size, inputs.shape[3] // self.patch_size, C).permute(0, 3, 1, 2) outs.append(out) return tuple(outs) def train(self, mode=True): super(VisionTransformer, self).train(mode) if mode and self.norm_eval: for m in self.modules(): if isinstance(m, nn.LayerNorm): m.eval() ================================================ FILE: annotator/mmpkg/mmseg/models/builder.py ================================================ import warnings from annotator.mmpkg.mmcv.cnn import MODELS as MMCV_MODELS from annotator.mmpkg.mmcv.utils import Registry MODELS = Registry('models', parent=MMCV_MODELS) BACKBONES = MODELS NECKS = MODELS HEADS = MODELS LOSSES = MODELS SEGMENTORS = MODELS def build_backbone(cfg): """Build backbone.""" return BACKBONES.build(cfg) def build_neck(cfg): """Build neck.""" return NECKS.build(cfg) def build_head(cfg): """Build head.""" return HEADS.build(cfg) def build_loss(cfg): """Build loss.""" return LOSSES.build(cfg) def build_segmentor(cfg, train_cfg=None, test_cfg=None): """Build segmentor.""" if train_cfg is not None or test_cfg is not None: warnings.warn( 'train_cfg and test_cfg is deprecated, ' 'please specify them in model', UserWarning) assert cfg.get('train_cfg') is None or train_cfg is None, \ 'train_cfg specified in both outer field and model field ' assert cfg.get('test_cfg') is None or test_cfg is None, \ 'test_cfg specified in both outer field and model field ' return SEGMENTORS.build( cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/__init__.py ================================================ from .ann_head import ANNHead from .apc_head import APCHead from .aspp_head import ASPPHead from .cc_head import CCHead from .da_head import DAHead from .dm_head import DMHead from .dnl_head import DNLHead from .ema_head import EMAHead from .enc_head import EncHead from .fcn_head import FCNHead from .fpn_head import FPNHead from .gc_head import GCHead from .lraspp_head import LRASPPHead from .nl_head import NLHead from .ocr_head import OCRHead # from .point_head import PointHead from .psa_head import PSAHead from .psp_head import PSPHead from .sep_aspp_head import DepthwiseSeparableASPPHead from .sep_fcn_head import DepthwiseSeparableFCNHead from .uper_head import UPerHead __all__ = [ 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', 'APCHead', 'DMHead', 'LRASPPHead' ] ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/ann_head.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule from ..builder import HEADS from ..utils import SelfAttentionBlock as _SelfAttentionBlock from .decode_head import BaseDecodeHead class PPMConcat(nn.ModuleList): """Pyramid Pooling Module that only concat the features of each layer. Args: pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module. """ def __init__(self, pool_scales=(1, 3, 6, 8)): super(PPMConcat, self).__init__( [nn.AdaptiveAvgPool2d(pool_scale) for pool_scale in pool_scales]) def forward(self, feats): """Forward function.""" ppm_outs = [] for ppm in self: ppm_out = ppm(feats) ppm_outs.append(ppm_out.view(*feats.shape[:2], -1)) concat_outs = torch.cat(ppm_outs, dim=2) return concat_outs class SelfAttentionBlock(_SelfAttentionBlock): """Make a ANN used SelfAttentionBlock. Args: low_in_channels (int): Input channels of lower level feature, which is the key feature for self-attention. high_in_channels (int): Input channels of higher level feature, which is the query feature for self-attention. channels (int): Output channels of key/query transform. out_channels (int): Output channels. share_key_query (bool): Whether share projection weight between key and query projection. query_scale (int): The scale of query feature map. key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module of key feature. conv_cfg (dict|None): Config of conv layers. norm_cfg (dict|None): Config of norm layers. act_cfg (dict|None): Config of activation layers. """ def __init__(self, low_in_channels, high_in_channels, channels, out_channels, share_key_query, query_scale, key_pool_scales, conv_cfg, norm_cfg, act_cfg): key_psp = PPMConcat(key_pool_scales) if query_scale > 1: query_downsample = nn.MaxPool2d(kernel_size=query_scale) else: query_downsample = None super(SelfAttentionBlock, self).__init__( key_in_channels=low_in_channels, query_in_channels=high_in_channels, channels=channels, out_channels=out_channels, share_key_query=share_key_query, query_downsample=query_downsample, key_downsample=key_psp, key_query_num_convs=1, key_query_norm=True, value_out_num_convs=1, value_out_norm=False, matmul_norm=True, with_out=True, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) class AFNB(nn.Module): """Asymmetric Fusion Non-local Block(AFNB) Args: low_in_channels (int): Input channels of lower level feature, which is the key feature for self-attention. high_in_channels (int): Input channels of higher level feature, which is the query feature for self-attention. channels (int): Output channels of key/query transform. out_channels (int): Output channels. and query projection. query_scales (tuple[int]): The scales of query feature map. Default: (1,) key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module of key feature. conv_cfg (dict|None): Config of conv layers. norm_cfg (dict|None): Config of norm layers. act_cfg (dict|None): Config of activation layers. """ def __init__(self, low_in_channels, high_in_channels, channels, out_channels, query_scales, key_pool_scales, conv_cfg, norm_cfg, act_cfg): super(AFNB, self).__init__() self.stages = nn.ModuleList() for query_scale in query_scales: self.stages.append( SelfAttentionBlock( low_in_channels=low_in_channels, high_in_channels=high_in_channels, channels=channels, out_channels=out_channels, share_key_query=False, query_scale=query_scale, key_pool_scales=key_pool_scales, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) self.bottleneck = ConvModule( out_channels + high_in_channels, out_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None) def forward(self, low_feats, high_feats): """Forward function.""" priors = [stage(high_feats, low_feats) for stage in self.stages] context = torch.stack(priors, dim=0).sum(dim=0) output = self.bottleneck(torch.cat([context, high_feats], 1)) return output class APNB(nn.Module): """Asymmetric Pyramid Non-local Block (APNB) Args: in_channels (int): Input channels of key/query feature, which is the key feature for self-attention. channels (int): Output channels of key/query transform. out_channels (int): Output channels. query_scales (tuple[int]): The scales of query feature map. Default: (1,) key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module of key feature. conv_cfg (dict|None): Config of conv layers. norm_cfg (dict|None): Config of norm layers. act_cfg (dict|None): Config of activation layers. """ def __init__(self, in_channels, channels, out_channels, query_scales, key_pool_scales, conv_cfg, norm_cfg, act_cfg): super(APNB, self).__init__() self.stages = nn.ModuleList() for query_scale in query_scales: self.stages.append( SelfAttentionBlock( low_in_channels=in_channels, high_in_channels=in_channels, channels=channels, out_channels=out_channels, share_key_query=True, query_scale=query_scale, key_pool_scales=key_pool_scales, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) self.bottleneck = ConvModule( 2 * in_channels, out_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) def forward(self, feats): """Forward function.""" priors = [stage(feats, feats) for stage in self.stages] context = torch.stack(priors, dim=0).sum(dim=0) output = self.bottleneck(torch.cat([context, feats], 1)) return output @HEADS.register_module() class ANNHead(BaseDecodeHead): """Asymmetric Non-local Neural Networks for Semantic Segmentation. This head is the implementation of `ANNNet `_. Args: project_channels (int): Projection channels for Nonlocal. query_scales (tuple[int]): The scales of query feature map. Default: (1,) key_pool_scales (tuple[int]): The pooling scales of key feature map. Default: (1, 3, 6, 8). """ def __init__(self, project_channels, query_scales=(1, ), key_pool_scales=(1, 3, 6, 8), **kwargs): super(ANNHead, self).__init__( input_transform='multiple_select', **kwargs) assert len(self.in_channels) == 2 low_in_channels, high_in_channels = self.in_channels self.project_channels = project_channels self.fusion = AFNB( low_in_channels=low_in_channels, high_in_channels=high_in_channels, out_channels=high_in_channels, channels=project_channels, query_scales=query_scales, key_pool_scales=key_pool_scales, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.bottleneck = ConvModule( high_in_channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.context = APNB( in_channels=self.channels, out_channels=self.channels, channels=project_channels, query_scales=query_scales, key_pool_scales=key_pool_scales, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs): """Forward function.""" low_feats, high_feats = self._transform_inputs(inputs) output = self.fusion(low_feats, high_feats) output = self.dropout(output) output = self.bottleneck(output) output = self.context(output) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/apc_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from .decode_head import BaseDecodeHead class ACM(nn.Module): """Adaptive Context Module used in APCNet. Args: pool_scale (int): Pooling scale used in Adaptive Context Module to extract region features. fusion (bool): Add one conv to fuse residual feature. in_channels (int): Input channels. channels (int): Channels after modules, before conv_seg. conv_cfg (dict | None): Config of conv layers. norm_cfg (dict | None): Config of norm layers. act_cfg (dict): Config of activation layers. """ def __init__(self, pool_scale, fusion, in_channels, channels, conv_cfg, norm_cfg, act_cfg): super(ACM, self).__init__() self.pool_scale = pool_scale self.fusion = fusion self.in_channels = in_channels self.channels = channels self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.pooled_redu_conv = ConvModule( self.in_channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.input_redu_conv = ConvModule( self.in_channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.global_info = ConvModule( self.channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.gla = nn.Conv2d(self.channels, self.pool_scale**2, 1, 1, 0) self.residual_conv = ConvModule( self.channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) if self.fusion: self.fusion_conv = ConvModule( self.channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, x): """Forward function.""" pooled_x = F.adaptive_avg_pool2d(x, self.pool_scale) # [batch_size, channels, h, w] x = self.input_redu_conv(x) # [batch_size, channels, pool_scale, pool_scale] pooled_x = self.pooled_redu_conv(pooled_x) batch_size = x.size(0) # [batch_size, pool_scale * pool_scale, channels] pooled_x = pooled_x.view(batch_size, self.channels, -1).permute(0, 2, 1).contiguous() # [batch_size, h * w, pool_scale * pool_scale] affinity_matrix = self.gla(x + resize( self.global_info(F.adaptive_avg_pool2d(x, 1)), size=x.shape[2:]) ).permute(0, 2, 3, 1).reshape( batch_size, -1, self.pool_scale**2) affinity_matrix = F.sigmoid(affinity_matrix) # [batch_size, h * w, channels] z_out = torch.matmul(affinity_matrix, pooled_x) # [batch_size, channels, h * w] z_out = z_out.permute(0, 2, 1).contiguous() # [batch_size, channels, h, w] z_out = z_out.view(batch_size, self.channels, x.size(2), x.size(3)) z_out = self.residual_conv(z_out) z_out = F.relu(z_out + x) if self.fusion: z_out = self.fusion_conv(z_out) return z_out @HEADS.register_module() class APCHead(BaseDecodeHead): """Adaptive Pyramid Context Network for Semantic Segmentation. This head is the implementation of `APCNet `_. Args: pool_scales (tuple[int]): Pooling scales used in Adaptive Context Module. Default: (1, 2, 3, 6). fusion (bool): Add one conv to fuse residual feature. """ def __init__(self, pool_scales=(1, 2, 3, 6), fusion=True, **kwargs): super(APCHead, self).__init__(**kwargs) assert isinstance(pool_scales, (list, tuple)) self.pool_scales = pool_scales self.fusion = fusion acm_modules = [] for pool_scale in self.pool_scales: acm_modules.append( ACM(pool_scale, self.fusion, self.in_channels, self.channels, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) self.acm_modules = nn.ModuleList(acm_modules) self.bottleneck = ConvModule( self.in_channels + len(pool_scales) * self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) acm_outs = [x] for acm_module in self.acm_modules: acm_outs.append(acm_module(x)) acm_outs = torch.cat(acm_outs, dim=1) output = self.bottleneck(acm_outs) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/aspp_head.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from .decode_head import BaseDecodeHead class ASPPModule(nn.ModuleList): """Atrous Spatial Pyramid Pooling (ASPP) Module. Args: dilations (tuple[int]): Dilation rate of each layer. in_channels (int): Input channels. channels (int): Channels after modules, before conv_seg. conv_cfg (dict|None): Config of conv layers. norm_cfg (dict|None): Config of norm layers. act_cfg (dict): Config of activation layers. """ def __init__(self, dilations, in_channels, channels, conv_cfg, norm_cfg, act_cfg): super(ASPPModule, self).__init__() self.dilations = dilations self.in_channels = in_channels self.channels = channels self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg for dilation in dilations: self.append( ConvModule( self.in_channels, self.channels, 1 if dilation == 1 else 3, dilation=dilation, padding=0 if dilation == 1 else dilation, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) def forward(self, x): """Forward function.""" aspp_outs = [] for aspp_module in self: aspp_outs.append(aspp_module(x)) return aspp_outs @HEADS.register_module() class ASPPHead(BaseDecodeHead): """Rethinking Atrous Convolution for Semantic Image Segmentation. This head is the implementation of `DeepLabV3 `_. Args: dilations (tuple[int]): Dilation rates for ASPP module. Default: (1, 6, 12, 18). """ def __init__(self, dilations=(1, 6, 12, 18), **kwargs): super(ASPPHead, self).__init__(**kwargs) assert isinstance(dilations, (list, tuple)) self.dilations = dilations self.image_pool = nn.Sequential( nn.AdaptiveAvgPool2d(1), ConvModule( self.in_channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) self.aspp_modules = ASPPModule( dilations, self.in_channels, self.channels, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.bottleneck = ConvModule( (len(dilations) + 1) * self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) aspp_outs = [ resize( self.image_pool(x), size=x.size()[2:], mode='bilinear', align_corners=self.align_corners) ] aspp_outs.extend(self.aspp_modules(x)) aspp_outs = torch.cat(aspp_outs, dim=1) output = self.bottleneck(aspp_outs) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/cascade_decode_head.py ================================================ from abc import ABCMeta, abstractmethod from .decode_head import BaseDecodeHead class BaseCascadeDecodeHead(BaseDecodeHead, metaclass=ABCMeta): """Base class for cascade decode head used in :class:`CascadeEncoderDecoder.""" def __init__(self, *args, **kwargs): super(BaseCascadeDecodeHead, self).__init__(*args, **kwargs) @abstractmethod def forward(self, inputs, prev_output): """Placeholder of forward function.""" pass def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg, train_cfg): """Forward function for training. Args: inputs (list[Tensor]): List of multi-level img features. prev_output (Tensor): The output of previous decode head. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. gt_semantic_seg (Tensor): Semantic segmentation masks used if the architecture supports semantic segmentation task. train_cfg (dict): The training config. Returns: dict[str, Tensor]: a dictionary of loss components """ seg_logits = self.forward(inputs, prev_output) losses = self.losses(seg_logits, gt_semantic_seg) return losses def forward_test(self, inputs, prev_output, img_metas, test_cfg): """Forward function for testing. Args: inputs (list[Tensor]): List of multi-level img features. prev_output (Tensor): The output of previous decode head. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. test_cfg (dict): The testing config. Returns: Tensor: Output segmentation map. """ return self.forward(inputs, prev_output) ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/cc_head.py ================================================ import torch from ..builder import HEADS from .fcn_head import FCNHead try: try: from mmcv.ops import CrissCrossAttention except ImportError: from annotator.mmpkg.mmcv.ops import CrissCrossAttention except ModuleNotFoundError: CrissCrossAttention = None @HEADS.register_module() class CCHead(FCNHead): """CCNet: Criss-Cross Attention for Semantic Segmentation. This head is the implementation of `CCNet `_. Args: recurrence (int): Number of recurrence of Criss Cross Attention module. Default: 2. """ def __init__(self, recurrence=2, **kwargs): if CrissCrossAttention is None: raise RuntimeError('Please install mmcv-full for ' 'CrissCrossAttention ops') super(CCHead, self).__init__(num_convs=2, **kwargs) self.recurrence = recurrence self.cca = CrissCrossAttention(self.channels) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) output = self.convs[0](x) for _ in range(self.recurrence): output = self.cca(output) output = self.convs[1](output) if self.concat_input: output = self.conv_cat(torch.cat([x, output], dim=1)) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/da_head.py ================================================ import torch import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule, Scale from torch import nn from annotator.mmpkg.mmseg.core import add_prefix from ..builder import HEADS from ..utils import SelfAttentionBlock as _SelfAttentionBlock from .decode_head import BaseDecodeHead class PAM(_SelfAttentionBlock): """Position Attention Module (PAM) Args: in_channels (int): Input channels of key/query feature. channels (int): Output channels of key/query transform. """ def __init__(self, in_channels, channels): super(PAM, self).__init__( key_in_channels=in_channels, query_in_channels=in_channels, channels=channels, out_channels=in_channels, share_key_query=False, query_downsample=None, key_downsample=None, key_query_num_convs=1, key_query_norm=False, value_out_num_convs=1, value_out_norm=False, matmul_norm=False, with_out=False, conv_cfg=None, norm_cfg=None, act_cfg=None) self.gamma = Scale(0) def forward(self, x): """Forward function.""" out = super(PAM, self).forward(x, x) out = self.gamma(out) + x return out class CAM(nn.Module): """Channel Attention Module (CAM)""" def __init__(self): super(CAM, self).__init__() self.gamma = Scale(0) def forward(self, x): """Forward function.""" batch_size, channels, height, width = x.size() proj_query = x.view(batch_size, channels, -1) proj_key = x.view(batch_size, channels, -1).permute(0, 2, 1) energy = torch.bmm(proj_query, proj_key) energy_new = torch.max( energy, -1, keepdim=True)[0].expand_as(energy) - energy attention = F.softmax(energy_new, dim=-1) proj_value = x.view(batch_size, channels, -1) out = torch.bmm(attention, proj_value) out = out.view(batch_size, channels, height, width) out = self.gamma(out) + x return out @HEADS.register_module() class DAHead(BaseDecodeHead): """Dual Attention Network for Scene Segmentation. This head is the implementation of `DANet `_. Args: pam_channels (int): The channels of Position Attention Module(PAM). """ def __init__(self, pam_channels, **kwargs): super(DAHead, self).__init__(**kwargs) self.pam_channels = pam_channels self.pam_in_conv = ConvModule( self.in_channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.pam = PAM(self.channels, pam_channels) self.pam_out_conv = ConvModule( self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.pam_conv_seg = nn.Conv2d( self.channels, self.num_classes, kernel_size=1) self.cam_in_conv = ConvModule( self.in_channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.cam = CAM() self.cam_out_conv = ConvModule( self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.cam_conv_seg = nn.Conv2d( self.channels, self.num_classes, kernel_size=1) def pam_cls_seg(self, feat): """PAM feature classification.""" if self.dropout is not None: feat = self.dropout(feat) output = self.pam_conv_seg(feat) return output def cam_cls_seg(self, feat): """CAM feature classification.""" if self.dropout is not None: feat = self.dropout(feat) output = self.cam_conv_seg(feat) return output def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) pam_feat = self.pam_in_conv(x) pam_feat = self.pam(pam_feat) pam_feat = self.pam_out_conv(pam_feat) pam_out = self.pam_cls_seg(pam_feat) cam_feat = self.cam_in_conv(x) cam_feat = self.cam(cam_feat) cam_feat = self.cam_out_conv(cam_feat) cam_out = self.cam_cls_seg(cam_feat) feat_sum = pam_feat + cam_feat pam_cam_out = self.cls_seg(feat_sum) return pam_cam_out, pam_out, cam_out def forward_test(self, inputs, img_metas, test_cfg): """Forward function for testing, only ``pam_cam`` is used.""" return self.forward(inputs)[0] def losses(self, seg_logit, seg_label): """Compute ``pam_cam``, ``pam``, ``cam`` loss.""" pam_cam_seg_logit, pam_seg_logit, cam_seg_logit = seg_logit loss = dict() loss.update( add_prefix( super(DAHead, self).losses(pam_cam_seg_logit, seg_label), 'pam_cam')) loss.update( add_prefix( super(DAHead, self).losses(pam_seg_logit, seg_label), 'pam')) loss.update( add_prefix( super(DAHead, self).losses(cam_seg_logit, seg_label), 'cam')) return loss ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/decode_head.py ================================================ from abc import ABCMeta, abstractmethod import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import normal_init from annotator.mmpkg.mmcv.runner import auto_fp16, force_fp32 from annotator.mmpkg.mmseg.core import build_pixel_sampler from annotator.mmpkg.mmseg.ops import resize from ..builder import build_loss from ..losses import accuracy class BaseDecodeHead(nn.Module, metaclass=ABCMeta): """Base class for BaseDecodeHead. Args: in_channels (int|Sequence[int]): Input channels. channels (int): Channels after modules, before conv_seg. num_classes (int): Number of classes. dropout_ratio (float): Ratio of dropout layer. Default: 0.1. conv_cfg (dict|None): Config of conv layers. Default: None. norm_cfg (dict|None): Config of norm layers. Default: None. act_cfg (dict): Config of activation layers. Default: dict(type='ReLU') in_index (int|Sequence[int]): Input feature index. Default: -1 input_transform (str|None): Transformation type of input features. Options: 'resize_concat', 'multiple_select', None. 'resize_concat': Multiple feature maps will be resize to the same size as first one and than concat together. Usually used in FCN head of HRNet. 'multiple_select': Multiple feature maps will be bundle into a list and passed into decode head. None: Only one select feature map is allowed. Default: None. loss_decode (dict): Config of decode loss. Default: dict(type='CrossEntropyLoss'). ignore_index (int | None): The label index to be ignored. When using masked BCE loss, ignore_index should be set to None. Default: 255 sampler (dict|None): The config of segmentation map sampler. Default: None. align_corners (bool): align_corners argument of F.interpolate. Default: False. """ def __init__(self, in_channels, channels, *, num_classes, dropout_ratio=0.1, conv_cfg=None, norm_cfg=None, act_cfg=dict(type='ReLU'), in_index=-1, input_transform=None, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), ignore_index=255, sampler=None, align_corners=False): super(BaseDecodeHead, self).__init__() self._init_inputs(in_channels, in_index, input_transform) self.channels = channels self.num_classes = num_classes self.dropout_ratio = dropout_ratio self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.in_index = in_index self.loss_decode = build_loss(loss_decode) self.ignore_index = ignore_index self.align_corners = align_corners if sampler is not None: self.sampler = build_pixel_sampler(sampler, context=self) else: self.sampler = None self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) if dropout_ratio > 0: self.dropout = nn.Dropout2d(dropout_ratio) else: self.dropout = None self.fp16_enabled = False def extra_repr(self): """Extra repr.""" s = f'input_transform={self.input_transform}, ' \ f'ignore_index={self.ignore_index}, ' \ f'align_corners={self.align_corners}' return s def _init_inputs(self, in_channels, in_index, input_transform): """Check and initialize input transforms. The in_channels, in_index and input_transform must match. Specifically, when input_transform is None, only single feature map will be selected. So in_channels and in_index must be of type int. When input_transform Args: in_channels (int|Sequence[int]): Input channels. in_index (int|Sequence[int]): Input feature index. input_transform (str|None): Transformation type of input features. Options: 'resize_concat', 'multiple_select', None. 'resize_concat': Multiple feature maps will be resize to the same size as first one and than concat together. Usually used in FCN head of HRNet. 'multiple_select': Multiple feature maps will be bundle into a list and passed into decode head. None: Only one select feature map is allowed. """ if input_transform is not None: assert input_transform in ['resize_concat', 'multiple_select'] self.input_transform = input_transform self.in_index = in_index if input_transform is not None: assert isinstance(in_channels, (list, tuple)) assert isinstance(in_index, (list, tuple)) assert len(in_channels) == len(in_index) if input_transform == 'resize_concat': self.in_channels = sum(in_channels) else: self.in_channels = in_channels else: assert isinstance(in_channels, int) assert isinstance(in_index, int) self.in_channels = in_channels def init_weights(self): """Initialize weights of classification layer.""" normal_init(self.conv_seg, mean=0, std=0.01) def _transform_inputs(self, inputs): """Transform inputs for decoder. Args: inputs (list[Tensor]): List of multi-level img features. Returns: Tensor: The transformed inputs """ if self.input_transform == 'resize_concat': inputs = [inputs[i] for i in self.in_index] upsampled_inputs = [ resize( input=x, size=inputs[0].shape[2:], mode='bilinear', align_corners=self.align_corners) for x in inputs ] inputs = torch.cat(upsampled_inputs, dim=1) elif self.input_transform == 'multiple_select': inputs = [inputs[i] for i in self.in_index] else: inputs = inputs[self.in_index] return inputs @auto_fp16() @abstractmethod def forward(self, inputs): """Placeholder of forward function.""" pass def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): """Forward function for training. Args: inputs (list[Tensor]): List of multi-level img features. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. gt_semantic_seg (Tensor): Semantic segmentation masks used if the architecture supports semantic segmentation task. train_cfg (dict): The training config. Returns: dict[str, Tensor]: a dictionary of loss components """ seg_logits = self.forward(inputs) losses = self.losses(seg_logits, gt_semantic_seg) return losses def forward_test(self, inputs, img_metas, test_cfg): """Forward function for testing. Args: inputs (list[Tensor]): List of multi-level img features. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. test_cfg (dict): The testing config. Returns: Tensor: Output segmentation map. """ return self.forward(inputs) def cls_seg(self, feat): """Classify each pixel.""" if self.dropout is not None: feat = self.dropout(feat) output = self.conv_seg(feat) return output @force_fp32(apply_to=('seg_logit', )) def losses(self, seg_logit, seg_label): """Compute segmentation loss.""" loss = dict() seg_logit = resize( input=seg_logit, size=seg_label.shape[2:], mode='bilinear', align_corners=self.align_corners) if self.sampler is not None: seg_weight = self.sampler.sample(seg_logit, seg_label) else: seg_weight = None seg_label = seg_label.squeeze(1) loss['loss_seg'] = self.loss_decode( seg_logit, seg_label, weight=seg_weight, ignore_index=self.ignore_index) loss['acc_seg'] = accuracy(seg_logit, seg_label) return loss ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/dm_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer from ..builder import HEADS from .decode_head import BaseDecodeHead class DCM(nn.Module): """Dynamic Convolutional Module used in DMNet. Args: filter_size (int): The filter size of generated convolution kernel used in Dynamic Convolutional Module. fusion (bool): Add one conv to fuse DCM output feature. in_channels (int): Input channels. channels (int): Channels after modules, before conv_seg. conv_cfg (dict | None): Config of conv layers. norm_cfg (dict | None): Config of norm layers. act_cfg (dict): Config of activation layers. """ def __init__(self, filter_size, fusion, in_channels, channels, conv_cfg, norm_cfg, act_cfg): super(DCM, self).__init__() self.filter_size = filter_size self.fusion = fusion self.in_channels = in_channels self.channels = channels self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.filter_gen_conv = nn.Conv2d(self.in_channels, self.channels, 1, 1, 0) self.input_redu_conv = ConvModule( self.in_channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) if self.norm_cfg is not None: self.norm = build_norm_layer(self.norm_cfg, self.channels)[1] else: self.norm = None self.activate = build_activation_layer(self.act_cfg) if self.fusion: self.fusion_conv = ConvModule( self.channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, x): """Forward function.""" generated_filter = self.filter_gen_conv( F.adaptive_avg_pool2d(x, self.filter_size)) x = self.input_redu_conv(x) b, c, h, w = x.shape # [1, b * c, h, w], c = self.channels x = x.view(1, b * c, h, w) # [b * c, 1, filter_size, filter_size] generated_filter = generated_filter.view(b * c, 1, self.filter_size, self.filter_size) pad = (self.filter_size - 1) // 2 if (self.filter_size - 1) % 2 == 0: p2d = (pad, pad, pad, pad) else: p2d = (pad + 1, pad, pad + 1, pad) x = F.pad(input=x, pad=p2d, mode='constant', value=0) # [1, b * c, h, w] output = F.conv2d(input=x, weight=generated_filter, groups=b * c) # [b, c, h, w] output = output.view(b, c, h, w) if self.norm is not None: output = self.norm(output) output = self.activate(output) if self.fusion: output = self.fusion_conv(output) return output @HEADS.register_module() class DMHead(BaseDecodeHead): """Dynamic Multi-scale Filters for Semantic Segmentation. This head is the implementation of `DMNet `_. Args: filter_sizes (tuple[int]): The size of generated convolutional filters used in Dynamic Convolutional Module. Default: (1, 3, 5, 7). fusion (bool): Add one conv to fuse DCM output feature. """ def __init__(self, filter_sizes=(1, 3, 5, 7), fusion=False, **kwargs): super(DMHead, self).__init__(**kwargs) assert isinstance(filter_sizes, (list, tuple)) self.filter_sizes = filter_sizes self.fusion = fusion dcm_modules = [] for filter_size in self.filter_sizes: dcm_modules.append( DCM(filter_size, self.fusion, self.in_channels, self.channels, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) self.dcm_modules = nn.ModuleList(dcm_modules) self.bottleneck = ConvModule( self.in_channels + len(filter_sizes) * self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) dcm_outs = [x] for dcm_module in self.dcm_modules: dcm_outs.append(dcm_module(x)) dcm_outs = torch.cat(dcm_outs, dim=1) output = self.bottleneck(dcm_outs) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/dnl_head.py ================================================ import torch from annotator.mmpkg.mmcv.cnn import NonLocal2d from torch import nn from ..builder import HEADS from .fcn_head import FCNHead class DisentangledNonLocal2d(NonLocal2d): """Disentangled Non-Local Blocks. Args: temperature (float): Temperature to adjust attention. Default: 0.05 """ def __init__(self, *arg, temperature, **kwargs): super().__init__(*arg, **kwargs) self.temperature = temperature self.conv_mask = nn.Conv2d(self.in_channels, 1, kernel_size=1) def embedded_gaussian(self, theta_x, phi_x): """Embedded gaussian with temperature.""" # NonLocal2d pairwise_weight: [N, HxW, HxW] pairwise_weight = torch.matmul(theta_x, phi_x) if self.use_scale: # theta_x.shape[-1] is `self.inter_channels` pairwise_weight /= theta_x.shape[-1]**0.5 pairwise_weight /= self.temperature pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight def forward(self, x): # x: [N, C, H, W] n = x.size(0) # g_x: [N, HxW, C] g_x = self.g(x).view(n, self.inter_channels, -1) g_x = g_x.permute(0, 2, 1) # theta_x: [N, HxW, C], phi_x: [N, C, HxW] if self.mode == 'gaussian': theta_x = x.view(n, self.in_channels, -1) theta_x = theta_x.permute(0, 2, 1) if self.sub_sample: phi_x = self.phi(x).view(n, self.in_channels, -1) else: phi_x = x.view(n, self.in_channels, -1) elif self.mode == 'concatenation': theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) else: theta_x = self.theta(x).view(n, self.inter_channels, -1) theta_x = theta_x.permute(0, 2, 1) phi_x = self.phi(x).view(n, self.inter_channels, -1) # subtract mean theta_x -= theta_x.mean(dim=-2, keepdim=True) phi_x -= phi_x.mean(dim=-1, keepdim=True) pairwise_func = getattr(self, self.mode) # pairwise_weight: [N, HxW, HxW] pairwise_weight = pairwise_func(theta_x, phi_x) # y: [N, HxW, C] y = torch.matmul(pairwise_weight, g_x) # y: [N, C, H, W] y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels, *x.size()[2:]) # unary_mask: [N, 1, HxW] unary_mask = self.conv_mask(x) unary_mask = unary_mask.view(n, 1, -1) unary_mask = unary_mask.softmax(dim=-1) # unary_x: [N, 1, C] unary_x = torch.matmul(unary_mask, g_x) # unary_x: [N, C, 1, 1] unary_x = unary_x.permute(0, 2, 1).contiguous().reshape( n, self.inter_channels, 1, 1) output = x + self.conv_out(y + unary_x) return output @HEADS.register_module() class DNLHead(FCNHead): """Disentangled Non-Local Neural Networks. This head is the implementation of `DNLNet `_. Args: reduction (int): Reduction factor of projection transform. Default: 2. use_scale (bool): Whether to scale pairwise_weight by sqrt(1/inter_channels). Default: False. mode (str): The nonlocal mode. Options are 'embedded_gaussian', 'dot_product'. Default: 'embedded_gaussian.'. temperature (float): Temperature to adjust attention. Default: 0.05 """ def __init__(self, reduction=2, use_scale=True, mode='embedded_gaussian', temperature=0.05, **kwargs): super(DNLHead, self).__init__(num_convs=2, **kwargs) self.reduction = reduction self.use_scale = use_scale self.mode = mode self.temperature = temperature self.dnl_block = DisentangledNonLocal2d( in_channels=self.channels, reduction=self.reduction, use_scale=self.use_scale, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, mode=self.mode, temperature=self.temperature) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) output = self.convs[0](x) output = self.dnl_block(output) output = self.convs[1](output) if self.concat_input: output = self.conv_cat(torch.cat([x, output], dim=1)) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/ema_head.py ================================================ import math import torch import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule from ..builder import HEADS from .decode_head import BaseDecodeHead def reduce_mean(tensor): """Reduce mean when distributed training.""" if not (dist.is_available() and dist.is_initialized()): return tensor tensor = tensor.clone() dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) return tensor class EMAModule(nn.Module): """Expectation Maximization Attention Module used in EMANet. Args: channels (int): Channels of the whole module. num_bases (int): Number of bases. num_stages (int): Number of the EM iterations. """ def __init__(self, channels, num_bases, num_stages, momentum): super(EMAModule, self).__init__() assert num_stages >= 1, 'num_stages must be at least 1!' self.num_bases = num_bases self.num_stages = num_stages self.momentum = momentum bases = torch.zeros(1, channels, self.num_bases) bases.normal_(0, math.sqrt(2. / self.num_bases)) # [1, channels, num_bases] bases = F.normalize(bases, dim=1, p=2) self.register_buffer('bases', bases) def forward(self, feats): """Forward function.""" batch_size, channels, height, width = feats.size() # [batch_size, channels, height*width] feats = feats.view(batch_size, channels, height * width) # [batch_size, channels, num_bases] bases = self.bases.repeat(batch_size, 1, 1) with torch.no_grad(): for i in range(self.num_stages): # [batch_size, height*width, num_bases] attention = torch.einsum('bcn,bck->bnk', feats, bases) attention = F.softmax(attention, dim=2) # l1 norm attention_normed = F.normalize(attention, dim=1, p=1) # [batch_size, channels, num_bases] bases = torch.einsum('bcn,bnk->bck', feats, attention_normed) # l2 norm bases = F.normalize(bases, dim=1, p=2) feats_recon = torch.einsum('bck,bnk->bcn', bases, attention) feats_recon = feats_recon.view(batch_size, channels, height, width) if self.training: bases = bases.mean(dim=0, keepdim=True) bases = reduce_mean(bases) # l2 norm bases = F.normalize(bases, dim=1, p=2) self.bases = (1 - self.momentum) * self.bases + self.momentum * bases return feats_recon @HEADS.register_module() class EMAHead(BaseDecodeHead): """Expectation Maximization Attention Networks for Semantic Segmentation. This head is the implementation of `EMANet `_. Args: ema_channels (int): EMA module channels num_bases (int): Number of bases. num_stages (int): Number of the EM iterations. concat_input (bool): Whether concat the input and output of convs before classification layer. Default: True momentum (float): Momentum to update the base. Default: 0.1. """ def __init__(self, ema_channels, num_bases, num_stages, concat_input=True, momentum=0.1, **kwargs): super(EMAHead, self).__init__(**kwargs) self.ema_channels = ema_channels self.num_bases = num_bases self.num_stages = num_stages self.concat_input = concat_input self.momentum = momentum self.ema_module = EMAModule(self.ema_channels, self.num_bases, self.num_stages, self.momentum) self.ema_in_conv = ConvModule( self.in_channels, self.ema_channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) # project (0, inf) -> (-inf, inf) self.ema_mid_conv = ConvModule( self.ema_channels, self.ema_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=None, act_cfg=None) for param in self.ema_mid_conv.parameters(): param.requires_grad = False self.ema_out_conv = ConvModule( self.ema_channels, self.ema_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=None) self.bottleneck = ConvModule( self.ema_channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) if self.concat_input: self.conv_cat = ConvModule( self.in_channels + self.channels, self.channels, kernel_size=3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) feats = self.ema_in_conv(x) identity = feats feats = self.ema_mid_conv(feats) recon = self.ema_module(feats) recon = F.relu(recon, inplace=True) recon = self.ema_out_conv(recon) output = F.relu(identity + recon, inplace=True) output = self.bottleneck(output) if self.concat_input: output = self.conv_cat(torch.cat([x, output], dim=1)) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/enc_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule, build_norm_layer from annotator.mmpkg.mmseg.ops import Encoding, resize from ..builder import HEADS, build_loss from .decode_head import BaseDecodeHead class EncModule(nn.Module): """Encoding Module used in EncNet. Args: in_channels (int): Input channels. num_codes (int): Number of code words. conv_cfg (dict|None): Config of conv layers. norm_cfg (dict|None): Config of norm layers. act_cfg (dict): Config of activation layers. """ def __init__(self, in_channels, num_codes, conv_cfg, norm_cfg, act_cfg): super(EncModule, self).__init__() self.encoding_project = ConvModule( in_channels, in_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) # TODO: resolve this hack # change to 1d if norm_cfg is not None: encoding_norm_cfg = norm_cfg.copy() if encoding_norm_cfg['type'] in ['BN', 'IN']: encoding_norm_cfg['type'] += '1d' else: encoding_norm_cfg['type'] = encoding_norm_cfg['type'].replace( '2d', '1d') else: # fallback to BN1d encoding_norm_cfg = dict(type='BN1d') self.encoding = nn.Sequential( Encoding(channels=in_channels, num_codes=num_codes), build_norm_layer(encoding_norm_cfg, num_codes)[1], nn.ReLU(inplace=True)) self.fc = nn.Sequential( nn.Linear(in_channels, in_channels), nn.Sigmoid()) def forward(self, x): """Forward function.""" encoding_projection = self.encoding_project(x) encoding_feat = self.encoding(encoding_projection).mean(dim=1) batch_size, channels, _, _ = x.size() gamma = self.fc(encoding_feat) y = gamma.view(batch_size, channels, 1, 1) output = F.relu_(x + x * y) return encoding_feat, output @HEADS.register_module() class EncHead(BaseDecodeHead): """Context Encoding for Semantic Segmentation. This head is the implementation of `EncNet `_. Args: num_codes (int): Number of code words. Default: 32. use_se_loss (bool): Whether use Semantic Encoding Loss (SE-loss) to regularize the training. Default: True. add_lateral (bool): Whether use lateral connection to fuse features. Default: False. loss_se_decode (dict): Config of decode loss. Default: dict(type='CrossEntropyLoss', use_sigmoid=True). """ def __init__(self, num_codes=32, use_se_loss=True, add_lateral=False, loss_se_decode=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2), **kwargs): super(EncHead, self).__init__( input_transform='multiple_select', **kwargs) self.use_se_loss = use_se_loss self.add_lateral = add_lateral self.num_codes = num_codes self.bottleneck = ConvModule( self.in_channels[-1], self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) if add_lateral: self.lateral_convs = nn.ModuleList() for in_channels in self.in_channels[:-1]: # skip the last one self.lateral_convs.append( ConvModule( in_channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) self.fusion = ConvModule( len(self.in_channels) * self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.enc_module = EncModule( self.channels, num_codes=num_codes, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) if self.use_se_loss: self.loss_se_decode = build_loss(loss_se_decode) self.se_layer = nn.Linear(self.channels, self.num_classes) def forward(self, inputs): """Forward function.""" inputs = self._transform_inputs(inputs) feat = self.bottleneck(inputs[-1]) if self.add_lateral: laterals = [ resize( lateral_conv(inputs[i]), size=feat.shape[2:], mode='bilinear', align_corners=self.align_corners) for i, lateral_conv in enumerate(self.lateral_convs) ] feat = self.fusion(torch.cat([feat, *laterals], 1)) encode_feat, output = self.enc_module(feat) output = self.cls_seg(output) if self.use_se_loss: se_output = self.se_layer(encode_feat) return output, se_output else: return output def forward_test(self, inputs, img_metas, test_cfg): """Forward function for testing, ignore se_loss.""" if self.use_se_loss: return self.forward(inputs)[0] else: return self.forward(inputs) @staticmethod def _convert_to_onehot_labels(seg_label, num_classes): """Convert segmentation label to onehot. Args: seg_label (Tensor): Segmentation label of shape (N, H, W). num_classes (int): Number of classes. Returns: Tensor: Onehot labels of shape (N, num_classes). """ batch_size = seg_label.size(0) onehot_labels = seg_label.new_zeros((batch_size, num_classes)) for i in range(batch_size): hist = seg_label[i].float().histc( bins=num_classes, min=0, max=num_classes - 1) onehot_labels[i] = hist > 0 return onehot_labels def losses(self, seg_logit, seg_label): """Compute segmentation and semantic encoding loss.""" seg_logit, se_seg_logit = seg_logit loss = dict() loss.update(super(EncHead, self).losses(seg_logit, seg_label)) se_loss = self.loss_se_decode( se_seg_logit, self._convert_to_onehot_labels(seg_label, self.num_classes)) loss['loss_se'] = se_loss return loss ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/fcn_head.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule from ..builder import HEADS from .decode_head import BaseDecodeHead @HEADS.register_module() class FCNHead(BaseDecodeHead): """Fully Convolution Networks for Semantic Segmentation. This head is implemented of `FCNNet `_. Args: num_convs (int): Number of convs in the head. Default: 2. kernel_size (int): The kernel size for convs in the head. Default: 3. concat_input (bool): Whether concat the input and output of convs before classification layer. dilation (int): The dilation rate for convs in the head. Default: 1. """ def __init__(self, num_convs=2, kernel_size=3, concat_input=True, dilation=1, **kwargs): assert num_convs >= 0 and dilation > 0 and isinstance(dilation, int) self.num_convs = num_convs self.concat_input = concat_input self.kernel_size = kernel_size super(FCNHead, self).__init__(**kwargs) if num_convs == 0: assert self.in_channels == self.channels conv_padding = (kernel_size // 2) * dilation convs = [] convs.append( ConvModule( self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) for i in range(num_convs - 1): convs.append( ConvModule( self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) if num_convs == 0: self.convs = nn.Identity() else: self.convs = nn.Sequential(*convs) if self.concat_input: self.conv_cat = ConvModule( self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) output = self.convs(x) if self.concat_input: output = self.conv_cat(torch.cat([x, output], dim=1)) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/fpn_head.py ================================================ import numpy as np import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from .decode_head import BaseDecodeHead @HEADS.register_module() class FPNHead(BaseDecodeHead): """Panoptic Feature Pyramid Networks. This head is the implementation of `Semantic FPN `_. Args: feature_strides (tuple[int]): The strides for input feature maps. stack_lateral. All strides suppose to be power of 2. The first one is of largest resolution. """ def __init__(self, feature_strides, **kwargs): super(FPNHead, self).__init__( input_transform='multiple_select', **kwargs) assert len(feature_strides) == len(self.in_channels) assert min(feature_strides) == feature_strides[0] self.feature_strides = feature_strides self.scale_heads = nn.ModuleList() for i in range(len(feature_strides)): head_length = max( 1, int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) scale_head = [] for k in range(head_length): scale_head.append( ConvModule( self.in_channels[i] if k == 0 else self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) if feature_strides[i] != feature_strides[0]: scale_head.append( nn.Upsample( scale_factor=2, mode='bilinear', align_corners=self.align_corners)) self.scale_heads.append(nn.Sequential(*scale_head)) def forward(self, inputs): x = self._transform_inputs(inputs) output = self.scale_heads[0](x[0]) for i in range(1, len(self.feature_strides)): # non inplace output = output + resize( self.scale_heads[i](x[i]), size=output.shape[2:], mode='bilinear', align_corners=self.align_corners) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/gc_head.py ================================================ import torch from annotator.mmpkg.mmcv.cnn import ContextBlock from ..builder import HEADS from .fcn_head import FCNHead @HEADS.register_module() class GCHead(FCNHead): """GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond. This head is the implementation of `GCNet `_. Args: ratio (float): Multiplier of channels ratio. Default: 1/4. pooling_type (str): The pooling type of context aggregation. Options are 'att', 'avg'. Default: 'avg'. fusion_types (tuple[str]): The fusion type for feature fusion. Options are 'channel_add', 'channel_mul'. Default: ('channel_add',) """ def __init__(self, ratio=1 / 4., pooling_type='att', fusion_types=('channel_add', ), **kwargs): super(GCHead, self).__init__(num_convs=2, **kwargs) self.ratio = ratio self.pooling_type = pooling_type self.fusion_types = fusion_types self.gc_block = ContextBlock( in_channels=self.channels, ratio=self.ratio, pooling_type=self.pooling_type, fusion_types=self.fusion_types) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) output = self.convs[0](x) output = self.gc_block(output) output = self.convs[1](output) if self.concat_input: output = self.conv_cat(torch.cat([x, output], dim=1)) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/lraspp_head.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv import is_tuple_of from annotator.mmpkg.mmcv.cnn import ConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from .decode_head import BaseDecodeHead @HEADS.register_module() class LRASPPHead(BaseDecodeHead): """Lite R-ASPP (LRASPP) head is proposed in Searching for MobileNetV3. This head is the improved implementation of `Searching for MobileNetV3 `_. Args: branch_channels (tuple[int]): The number of output channels in every each branch. Default: (32, 64). """ def __init__(self, branch_channels=(32, 64), **kwargs): super(LRASPPHead, self).__init__(**kwargs) if self.input_transform != 'multiple_select': raise ValueError('in Lite R-ASPP (LRASPP) head, input_transform ' f'must be \'multiple_select\'. But received ' f'\'{self.input_transform}\'') assert is_tuple_of(branch_channels, int) assert len(branch_channels) == len(self.in_channels) - 1 self.branch_channels = branch_channels self.convs = nn.Sequential() self.conv_ups = nn.Sequential() for i in range(len(branch_channels)): self.convs.add_module( f'conv{i}', nn.Conv2d( self.in_channels[i], branch_channels[i], 1, bias=False)) self.conv_ups.add_module( f'conv_up{i}', ConvModule( self.channels + branch_channels[i], self.channels, 1, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, bias=False)) self.conv_up_input = nn.Conv2d(self.channels, self.channels, 1) self.aspp_conv = ConvModule( self.in_channels[-1], self.channels, 1, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, bias=False) self.image_pool = nn.Sequential( nn.AvgPool2d(kernel_size=49, stride=(16, 20)), ConvModule( self.in_channels[2], self.channels, 1, act_cfg=dict(type='Sigmoid'), bias=False)) def forward(self, inputs): """Forward function.""" inputs = self._transform_inputs(inputs) x = inputs[-1] x = self.aspp_conv(x) * resize( self.image_pool(x), size=x.size()[2:], mode='bilinear', align_corners=self.align_corners) x = self.conv_up_input(x) for i in range(len(self.branch_channels) - 1, -1, -1): x = resize( x, size=inputs[i].size()[2:], mode='bilinear', align_corners=self.align_corners) x = torch.cat([x, self.convs[i](inputs[i])], 1) x = self.conv_ups[i](x) return self.cls_seg(x) ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/nl_head.py ================================================ import torch from annotator.mmpkg.mmcv.cnn import NonLocal2d from ..builder import HEADS from .fcn_head import FCNHead @HEADS.register_module() class NLHead(FCNHead): """Non-local Neural Networks. This head is the implementation of `NLNet `_. Args: reduction (int): Reduction factor of projection transform. Default: 2. use_scale (bool): Whether to scale pairwise_weight by sqrt(1/inter_channels). Default: True. mode (str): The nonlocal mode. Options are 'embedded_gaussian', 'dot_product'. Default: 'embedded_gaussian.'. """ def __init__(self, reduction=2, use_scale=True, mode='embedded_gaussian', **kwargs): super(NLHead, self).__init__(num_convs=2, **kwargs) self.reduction = reduction self.use_scale = use_scale self.mode = mode self.nl_block = NonLocal2d( in_channels=self.channels, reduction=self.reduction, use_scale=self.use_scale, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, mode=self.mode) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) output = self.convs[0](x) output = self.nl_block(output) output = self.convs[1](output) if self.concat_input: output = self.conv_cat(torch.cat([x, output], dim=1)) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/ocr_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from ..utils import SelfAttentionBlock as _SelfAttentionBlock from .cascade_decode_head import BaseCascadeDecodeHead class SpatialGatherModule(nn.Module): """Aggregate the context features according to the initial predicted probability distribution. Employ the soft-weighted method to aggregate the context. """ def __init__(self, scale): super(SpatialGatherModule, self).__init__() self.scale = scale def forward(self, feats, probs): """Forward function.""" batch_size, num_classes, height, width = probs.size() channels = feats.size(1) probs = probs.view(batch_size, num_classes, -1) feats = feats.view(batch_size, channels, -1) # [batch_size, height*width, num_classes] feats = feats.permute(0, 2, 1) # [batch_size, channels, height*width] probs = F.softmax(self.scale * probs, dim=2) # [batch_size, channels, num_classes] ocr_context = torch.matmul(probs, feats) ocr_context = ocr_context.permute(0, 2, 1).contiguous().unsqueeze(3) return ocr_context class ObjectAttentionBlock(_SelfAttentionBlock): """Make a OCR used SelfAttentionBlock.""" def __init__(self, in_channels, channels, scale, conv_cfg, norm_cfg, act_cfg): if scale > 1: query_downsample = nn.MaxPool2d(kernel_size=scale) else: query_downsample = None super(ObjectAttentionBlock, self).__init__( key_in_channels=in_channels, query_in_channels=in_channels, channels=channels, out_channels=in_channels, share_key_query=False, query_downsample=query_downsample, key_downsample=None, key_query_num_convs=2, key_query_norm=True, value_out_num_convs=1, value_out_norm=True, matmul_norm=True, with_out=True, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) self.bottleneck = ConvModule( in_channels * 2, in_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, query_feats, key_feats): """Forward function.""" context = super(ObjectAttentionBlock, self).forward(query_feats, key_feats) output = self.bottleneck(torch.cat([context, query_feats], dim=1)) if self.query_downsample is not None: output = resize(query_feats) return output @HEADS.register_module() class OCRHead(BaseCascadeDecodeHead): """Object-Contextual Representations for Semantic Segmentation. This head is the implementation of `OCRNet `_. Args: ocr_channels (int): The intermediate channels of OCR block. scale (int): The scale of probability map in SpatialGatherModule in Default: 1. """ def __init__(self, ocr_channels, scale=1, **kwargs): super(OCRHead, self).__init__(**kwargs) self.ocr_channels = ocr_channels self.scale = scale self.object_context_block = ObjectAttentionBlock( self.channels, self.ocr_channels, self.scale, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.spatial_gather_module = SpatialGatherModule(self.scale) self.bottleneck = ConvModule( self.in_channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs, prev_output): """Forward function.""" x = self._transform_inputs(inputs) feats = self.bottleneck(x) context = self.spatial_gather_module(feats, prev_output) object_context = self.object_context_block(feats, context) output = self.cls_seg(object_context) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/point_head.py ================================================ # Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py # noqa import torch import torch.nn as nn try: from mmcv.cnn import ConvModule, normal_init from mmcv.ops import point_sample except ImportError: from annotator.mmpkg.mmcv.cnn import ConvModule, normal_init from annotator.mmpkg.mmcv.ops import point_sample from annotator.mmpkg.mmseg.models.builder import HEADS from annotator.mmpkg.mmseg.ops import resize from ..losses import accuracy from .cascade_decode_head import BaseCascadeDecodeHead def calculate_uncertainty(seg_logits): """Estimate uncertainty based on seg logits. For each location of the prediction ``seg_logits`` we estimate uncertainty as the difference between top first and top second predicted logits. Args: seg_logits (Tensor): Semantic segmentation logits, shape (batch_size, num_classes, height, width). Returns: scores (Tensor): T uncertainty scores with the most uncertain locations having the highest uncertainty score, shape ( batch_size, 1, height, width) """ top2_scores = torch.topk(seg_logits, k=2, dim=1)[0] return (top2_scores[:, 1] - top2_scores[:, 0]).unsqueeze(1) @HEADS.register_module() class PointHead(BaseCascadeDecodeHead): """A mask point head use in PointRend. ``PointHead`` use shared multi-layer perceptron (equivalent to nn.Conv1d) to predict the logit of input points. The fine-grained feature and coarse feature will be concatenate together for predication. Args: num_fcs (int): Number of fc layers in the head. Default: 3. in_channels (int): Number of input channels. Default: 256. fc_channels (int): Number of fc channels. Default: 256. num_classes (int): Number of classes for logits. Default: 80. class_agnostic (bool): Whether use class agnostic classification. If so, the output channels of logits will be 1. Default: False. coarse_pred_each_layer (bool): Whether concatenate coarse feature with the output of each fc layer. Default: True. conv_cfg (dict|None): Dictionary to construct and config conv layer. Default: dict(type='Conv1d')) norm_cfg (dict|None): Dictionary to construct and config norm layer. Default: None. loss_point (dict): Dictionary to construct and config loss layer of point head. Default: dict(type='CrossEntropyLoss', use_mask=True, loss_weight=1.0). """ def __init__(self, num_fcs=3, coarse_pred_each_layer=True, conv_cfg=dict(type='Conv1d'), norm_cfg=None, act_cfg=dict(type='ReLU', inplace=False), **kwargs): super(PointHead, self).__init__( input_transform='multiple_select', conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, **kwargs) self.num_fcs = num_fcs self.coarse_pred_each_layer = coarse_pred_each_layer fc_in_channels = sum(self.in_channels) + self.num_classes fc_channels = self.channels self.fcs = nn.ModuleList() for k in range(num_fcs): fc = ConvModule( fc_in_channels, fc_channels, kernel_size=1, stride=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) self.fcs.append(fc) fc_in_channels = fc_channels fc_in_channels += self.num_classes if self.coarse_pred_each_layer \ else 0 self.fc_seg = nn.Conv1d( fc_in_channels, self.num_classes, kernel_size=1, stride=1, padding=0) if self.dropout_ratio > 0: self.dropout = nn.Dropout(self.dropout_ratio) delattr(self, 'conv_seg') def init_weights(self): """Initialize weights of classification layer.""" normal_init(self.fc_seg, std=0.001) def cls_seg(self, feat): """Classify each pixel with fc.""" if self.dropout is not None: feat = self.dropout(feat) output = self.fc_seg(feat) return output def forward(self, fine_grained_point_feats, coarse_point_feats): x = torch.cat([fine_grained_point_feats, coarse_point_feats], dim=1) for fc in self.fcs: x = fc(x) if self.coarse_pred_each_layer: x = torch.cat((x, coarse_point_feats), dim=1) return self.cls_seg(x) def _get_fine_grained_point_feats(self, x, points): """Sample from fine grained features. Args: x (list[Tensor]): Feature pyramid from by neck or backbone. points (Tensor): Point coordinates, shape (batch_size, num_points, 2). Returns: fine_grained_feats (Tensor): Sampled fine grained feature, shape (batch_size, sum(channels of x), num_points). """ fine_grained_feats_list = [ point_sample(_, points, align_corners=self.align_corners) for _ in x ] if len(fine_grained_feats_list) > 1: fine_grained_feats = torch.cat(fine_grained_feats_list, dim=1) else: fine_grained_feats = fine_grained_feats_list[0] return fine_grained_feats def _get_coarse_point_feats(self, prev_output, points): """Sample from fine grained features. Args: prev_output (list[Tensor]): Prediction of previous decode head. points (Tensor): Point coordinates, shape (batch_size, num_points, 2). Returns: coarse_feats (Tensor): Sampled coarse feature, shape (batch_size, num_classes, num_points). """ coarse_feats = point_sample( prev_output, points, align_corners=self.align_corners) return coarse_feats def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg, train_cfg): """Forward function for training. Args: inputs (list[Tensor]): List of multi-level img features. prev_output (Tensor): The output of previous decode head. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. gt_semantic_seg (Tensor): Semantic segmentation masks used if the architecture supports semantic segmentation task. train_cfg (dict): The training config. Returns: dict[str, Tensor]: a dictionary of loss components """ x = self._transform_inputs(inputs) with torch.no_grad(): points = self.get_points_train( prev_output, calculate_uncertainty, cfg=train_cfg) fine_grained_point_feats = self._get_fine_grained_point_feats( x, points) coarse_point_feats = self._get_coarse_point_feats(prev_output, points) point_logits = self.forward(fine_grained_point_feats, coarse_point_feats) point_label = point_sample( gt_semantic_seg.float(), points, mode='nearest', align_corners=self.align_corners) point_label = point_label.squeeze(1).long() losses = self.losses(point_logits, point_label) return losses def forward_test(self, inputs, prev_output, img_metas, test_cfg): """Forward function for testing. Args: inputs (list[Tensor]): List of multi-level img features. prev_output (Tensor): The output of previous decode head. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. test_cfg (dict): The testing config. Returns: Tensor: Output segmentation map. """ x = self._transform_inputs(inputs) refined_seg_logits = prev_output.clone() for _ in range(test_cfg.subdivision_steps): refined_seg_logits = resize( refined_seg_logits, scale_factor=test_cfg.scale_factor, mode='bilinear', align_corners=self.align_corners) batch_size, channels, height, width = refined_seg_logits.shape point_indices, points = self.get_points_test( refined_seg_logits, calculate_uncertainty, cfg=test_cfg) fine_grained_point_feats = self._get_fine_grained_point_feats( x, points) coarse_point_feats = self._get_coarse_point_feats( prev_output, points) point_logits = self.forward(fine_grained_point_feats, coarse_point_feats) point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1) refined_seg_logits = refined_seg_logits.reshape( batch_size, channels, height * width) refined_seg_logits = refined_seg_logits.scatter_( 2, point_indices, point_logits) refined_seg_logits = refined_seg_logits.view( batch_size, channels, height, width) return refined_seg_logits def losses(self, point_logits, point_label): """Compute segmentation loss.""" loss = dict() loss['loss_point'] = self.loss_decode( point_logits, point_label, ignore_index=self.ignore_index) loss['acc_point'] = accuracy(point_logits, point_label) return loss def get_points_train(self, seg_logits, uncertainty_func, cfg): """Sample points for training. Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The uncertainties are calculated for each point using 'uncertainty_func' function that takes point's logit prediction as input. Args: seg_logits (Tensor): Semantic segmentation logits, shape ( batch_size, num_classes, height, width). uncertainty_func (func): uncertainty calculation function. cfg (dict): Training config of point head. Returns: point_coords (Tensor): A tensor of shape (batch_size, num_points, 2) that contains the coordinates of ``num_points`` sampled points. """ num_points = cfg.num_points oversample_ratio = cfg.oversample_ratio importance_sample_ratio = cfg.importance_sample_ratio assert oversample_ratio >= 1 assert 0 <= importance_sample_ratio <= 1 batch_size = seg_logits.shape[0] num_sampled = int(num_points * oversample_ratio) point_coords = torch.rand( batch_size, num_sampled, 2, device=seg_logits.device) point_logits = point_sample(seg_logits, point_coords) # It is crucial to calculate uncertainty based on the sampled # prediction value for the points. Calculating uncertainties of the # coarse predictions first and sampling them for points leads to # incorrect results. To illustrate this: assume uncertainty func( # logits)=-abs(logits), a sampled point between two coarse # predictions with -1 and 1 logits has 0 logits, and therefore 0 # uncertainty value. However, if we calculate uncertainties for the # coarse predictions first, both will have -1 uncertainty, # and sampled point will get -1 uncertainty. point_uncertainties = uncertainty_func(point_logits) num_uncertain_points = int(importance_sample_ratio * num_points) num_random_points = num_points - num_uncertain_points idx = torch.topk( point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] shift = num_sampled * torch.arange( batch_size, dtype=torch.long, device=seg_logits.device) idx += shift[:, None] point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( batch_size, num_uncertain_points, 2) if num_random_points > 0: rand_point_coords = torch.rand( batch_size, num_random_points, 2, device=seg_logits.device) point_coords = torch.cat((point_coords, rand_point_coords), dim=1) return point_coords def get_points_test(self, seg_logits, uncertainty_func, cfg): """Sample points for testing. Find ``num_points`` most uncertain points from ``uncertainty_map``. Args: seg_logits (Tensor): A tensor of shape (batch_size, num_classes, height, width) for class-specific or class-agnostic prediction. uncertainty_func (func): uncertainty calculation function. cfg (dict): Testing config of point head. Returns: point_indices (Tensor): A tensor of shape (batch_size, num_points) that contains indices from [0, height x width) of the most uncertain points. point_coords (Tensor): A tensor of shape (batch_size, num_points, 2) that contains [0, 1] x [0, 1] normalized coordinates of the most uncertain points from the ``height x width`` grid . """ num_points = cfg.subdivision_num_points uncertainty_map = uncertainty_func(seg_logits) batch_size, _, height, width = uncertainty_map.shape h_step = 1.0 / height w_step = 1.0 / width uncertainty_map = uncertainty_map.view(batch_size, height * width) num_points = min(height * width, num_points) point_indices = uncertainty_map.topk(num_points, dim=1)[1] point_coords = torch.zeros( batch_size, num_points, 2, dtype=torch.float, device=seg_logits.device) point_coords[:, :, 0] = w_step / 2.0 + (point_indices % width).float() * w_step point_coords[:, :, 1] = h_step / 2.0 + (point_indices // width).float() * h_step return point_indices, point_coords ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/psa_head.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from .decode_head import BaseDecodeHead try: try: from mmcv.ops import PSAMask except ImportError: from annotator.mmpkg.mmcv.ops import PSAMask except ModuleNotFoundError: PSAMask = None @HEADS.register_module() class PSAHead(BaseDecodeHead): """Point-wise Spatial Attention Network for Scene Parsing. This head is the implementation of `PSANet `_. Args: mask_size (tuple[int]): The PSA mask size. It usually equals input size. psa_type (str): The type of psa module. Options are 'collect', 'distribute', 'bi-direction'. Default: 'bi-direction' compact (bool): Whether use compact map for 'collect' mode. Default: True. shrink_factor (int): The downsample factors of psa mask. Default: 2. normalization_factor (float): The normalize factor of attention. psa_softmax (bool): Whether use softmax for attention. """ def __init__(self, mask_size, psa_type='bi-direction', compact=False, shrink_factor=2, normalization_factor=1.0, psa_softmax=True, **kwargs): if PSAMask is None: raise RuntimeError('Please install mmcv-full for PSAMask ops') super(PSAHead, self).__init__(**kwargs) assert psa_type in ['collect', 'distribute', 'bi-direction'] self.psa_type = psa_type self.compact = compact self.shrink_factor = shrink_factor self.mask_size = mask_size mask_h, mask_w = mask_size self.psa_softmax = psa_softmax if normalization_factor is None: normalization_factor = mask_h * mask_w self.normalization_factor = normalization_factor self.reduce = ConvModule( self.in_channels, self.channels, kernel_size=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.attention = nn.Sequential( ConvModule( self.channels, self.channels, kernel_size=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg), nn.Conv2d( self.channels, mask_h * mask_w, kernel_size=1, bias=False)) if psa_type == 'bi-direction': self.reduce_p = ConvModule( self.in_channels, self.channels, kernel_size=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.attention_p = nn.Sequential( ConvModule( self.channels, self.channels, kernel_size=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg), nn.Conv2d( self.channels, mask_h * mask_w, kernel_size=1, bias=False)) self.psamask_collect = PSAMask('collect', mask_size) self.psamask_distribute = PSAMask('distribute', mask_size) else: self.psamask = PSAMask(psa_type, mask_size) self.proj = ConvModule( self.channels * (2 if psa_type == 'bi-direction' else 1), self.in_channels, kernel_size=1, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) self.bottleneck = ConvModule( self.in_channels * 2, self.channels, kernel_size=3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) identity = x align_corners = self.align_corners if self.psa_type in ['collect', 'distribute']: out = self.reduce(x) n, c, h, w = out.size() if self.shrink_factor != 1: if h % self.shrink_factor and w % self.shrink_factor: h = (h - 1) // self.shrink_factor + 1 w = (w - 1) // self.shrink_factor + 1 align_corners = True else: h = h // self.shrink_factor w = w // self.shrink_factor align_corners = False out = resize( out, size=(h, w), mode='bilinear', align_corners=align_corners) y = self.attention(out) if self.compact: if self.psa_type == 'collect': y = y.view(n, h * w, h * w).transpose(1, 2).view(n, h * w, h, w) else: y = self.psamask(y) if self.psa_softmax: y = F.softmax(y, dim=1) out = torch.bmm( out.view(n, c, h * w), y.view(n, h * w, h * w)).view( n, c, h, w) * (1.0 / self.normalization_factor) else: x_col = self.reduce(x) x_dis = self.reduce_p(x) n, c, h, w = x_col.size() if self.shrink_factor != 1: if h % self.shrink_factor and w % self.shrink_factor: h = (h - 1) // self.shrink_factor + 1 w = (w - 1) // self.shrink_factor + 1 align_corners = True else: h = h // self.shrink_factor w = w // self.shrink_factor align_corners = False x_col = resize( x_col, size=(h, w), mode='bilinear', align_corners=align_corners) x_dis = resize( x_dis, size=(h, w), mode='bilinear', align_corners=align_corners) y_col = self.attention(x_col) y_dis = self.attention_p(x_dis) if self.compact: y_dis = y_dis.view(n, h * w, h * w).transpose(1, 2).view(n, h * w, h, w) else: y_col = self.psamask_collect(y_col) y_dis = self.psamask_distribute(y_dis) if self.psa_softmax: y_col = F.softmax(y_col, dim=1) y_dis = F.softmax(y_dis, dim=1) x_col = torch.bmm( x_col.view(n, c, h * w), y_col.view(n, h * w, h * w)).view( n, c, h, w) * (1.0 / self.normalization_factor) x_dis = torch.bmm( x_dis.view(n, c, h * w), y_dis.view(n, h * w, h * w)).view( n, c, h, w) * (1.0 / self.normalization_factor) out = torch.cat([x_col, x_dis], 1) out = self.proj(out) out = resize( out, size=identity.shape[2:], mode='bilinear', align_corners=align_corners) out = self.bottleneck(torch.cat((identity, out), dim=1)) out = self.cls_seg(out) return out ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/psp_head.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from .decode_head import BaseDecodeHead class PPM(nn.ModuleList): """Pooling Pyramid Module used in PSPNet. Args: pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module. in_channels (int): Input channels. channels (int): Channels after modules, before conv_seg. conv_cfg (dict|None): Config of conv layers. norm_cfg (dict|None): Config of norm layers. act_cfg (dict): Config of activation layers. align_corners (bool): align_corners argument of F.interpolate. """ def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, act_cfg, align_corners): super(PPM, self).__init__() self.pool_scales = pool_scales self.align_corners = align_corners self.in_channels = in_channels self.channels = channels self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg for pool_scale in pool_scales: self.append( nn.Sequential( nn.AdaptiveAvgPool2d(pool_scale), ConvModule( self.in_channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg))) def forward(self, x): """Forward function.""" ppm_outs = [] for ppm in self: ppm_out = ppm(x) upsampled_ppm_out = resize( ppm_out, size=x.size()[2:], mode='bilinear', align_corners=self.align_corners) ppm_outs.append(upsampled_ppm_out) return ppm_outs @HEADS.register_module() class PSPHead(BaseDecodeHead): """Pyramid Scene Parsing Network. This head is the implementation of `PSPNet `_. Args: pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module. Default: (1, 2, 3, 6). """ def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): super(PSPHead, self).__init__(**kwargs) assert isinstance(pool_scales, (list, tuple)) self.pool_scales = pool_scales self.psp_modules = PPM( self.pool_scales, self.in_channels, self.channels, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, align_corners=self.align_corners) self.bottleneck = ConvModule( self.in_channels + len(pool_scales) * self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) psp_outs = [x] psp_outs.extend(self.psp_modules(x)) psp_outs = torch.cat(psp_outs, dim=1) output = self.bottleneck(psp_outs) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/sep_aspp_head.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule, DepthwiseSeparableConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from .aspp_head import ASPPHead, ASPPModule class DepthwiseSeparableASPPModule(ASPPModule): """Atrous Spatial Pyramid Pooling (ASPP) Module with depthwise separable conv.""" def __init__(self, **kwargs): super(DepthwiseSeparableASPPModule, self).__init__(**kwargs) for i, dilation in enumerate(self.dilations): if dilation > 1: self[i] = DepthwiseSeparableConvModule( self.in_channels, self.channels, 3, dilation=dilation, padding=dilation, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) @HEADS.register_module() class DepthwiseSeparableASPPHead(ASPPHead): """Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation. This head is the implementation of `DeepLabV3+ `_. Args: c1_in_channels (int): The input channels of c1 decoder. If is 0, the no decoder will be used. c1_channels (int): The intermediate channels of c1 decoder. """ def __init__(self, c1_in_channels, c1_channels, **kwargs): super(DepthwiseSeparableASPPHead, self).__init__(**kwargs) assert c1_in_channels >= 0 self.aspp_modules = DepthwiseSeparableASPPModule( dilations=self.dilations, in_channels=self.in_channels, channels=self.channels, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) if c1_in_channels > 0: self.c1_bottleneck = ConvModule( c1_in_channels, c1_channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) else: self.c1_bottleneck = None self.sep_bottleneck = nn.Sequential( DepthwiseSeparableConvModule( self.channels + c1_channels, self.channels, 3, padding=1, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg), DepthwiseSeparableConvModule( self.channels, self.channels, 3, padding=1, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) def forward(self, inputs): """Forward function.""" x = self._transform_inputs(inputs) aspp_outs = [ resize( self.image_pool(x), size=x.size()[2:], mode='bilinear', align_corners=self.align_corners) ] aspp_outs.extend(self.aspp_modules(x)) aspp_outs = torch.cat(aspp_outs, dim=1) output = self.bottleneck(aspp_outs) if self.c1_bottleneck is not None: c1_output = self.c1_bottleneck(inputs[0]) output = resize( input=output, size=c1_output.shape[2:], mode='bilinear', align_corners=self.align_corners) output = torch.cat([output, c1_output], dim=1) output = self.sep_bottleneck(output) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/sep_fcn_head.py ================================================ from annotator.mmpkg.mmcv.cnn import DepthwiseSeparableConvModule from ..builder import HEADS from .fcn_head import FCNHead @HEADS.register_module() class DepthwiseSeparableFCNHead(FCNHead): """Depthwise-Separable Fully Convolutional Network for Semantic Segmentation. This head is implemented according to Fast-SCNN paper. Args: in_channels(int): Number of output channels of FFM. channels(int): Number of middle-stage channels in the decode head. concat_input(bool): Whether to concatenate original decode input into the result of several consecutive convolution layers. Default: True. num_classes(int): Used to determine the dimension of final prediction tensor. in_index(int): Correspond with 'out_indices' in FastSCNN backbone. norm_cfg (dict | None): Config of norm layers. align_corners (bool): align_corners argument of F.interpolate. Default: False. loss_decode(dict): Config of loss type and some relevant additional options. """ def __init__(self, **kwargs): super(DepthwiseSeparableFCNHead, self).__init__(**kwargs) self.convs[0] = DepthwiseSeparableConvModule( self.in_channels, self.channels, kernel_size=self.kernel_size, padding=self.kernel_size // 2, norm_cfg=self.norm_cfg) for i in range(1, self.num_convs): self.convs[i] = DepthwiseSeparableConvModule( self.channels, self.channels, kernel_size=self.kernel_size, padding=self.kernel_size // 2, norm_cfg=self.norm_cfg) if self.concat_input: self.conv_cat = DepthwiseSeparableConvModule( self.in_channels + self.channels, self.channels, kernel_size=self.kernel_size, padding=self.kernel_size // 2, norm_cfg=self.norm_cfg) ================================================ FILE: annotator/mmpkg/mmseg/models/decode_heads/uper_head.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule from annotator.mmpkg.mmseg.ops import resize from ..builder import HEADS from .decode_head import BaseDecodeHead from .psp_head import PPM @HEADS.register_module() class UPerHead(BaseDecodeHead): """Unified Perceptual Parsing for Scene Understanding. This head is the implementation of `UPerNet `_. Args: pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid Module applied on the last feature. Default: (1, 2, 3, 6). """ def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): super(UPerHead, self).__init__( input_transform='multiple_select', **kwargs) # PSP Module self.psp_modules = PPM( pool_scales, self.in_channels[-1], self.channels, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, align_corners=self.align_corners) self.bottleneck = ConvModule( self.in_channels[-1] + len(pool_scales) * self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) # FPN Module self.lateral_convs = nn.ModuleList() self.fpn_convs = nn.ModuleList() for in_channels in self.in_channels[:-1]: # skip the top layer l_conv = ConvModule( in_channels, self.channels, 1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, inplace=False) fpn_conv = ConvModule( self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, inplace=False) self.lateral_convs.append(l_conv) self.fpn_convs.append(fpn_conv) self.fpn_bottleneck = ConvModule( len(self.in_channels) * self.channels, self.channels, 3, padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def psp_forward(self, inputs): """Forward function of PSP module.""" x = inputs[-1] psp_outs = [x] psp_outs.extend(self.psp_modules(x)) psp_outs = torch.cat(psp_outs, dim=1) output = self.bottleneck(psp_outs) return output def forward(self, inputs): """Forward function.""" inputs = self._transform_inputs(inputs) # build laterals laterals = [ lateral_conv(inputs[i]) for i, lateral_conv in enumerate(self.lateral_convs) ] laterals.append(self.psp_forward(inputs)) # build top-down path used_backbone_levels = len(laterals) for i in range(used_backbone_levels - 1, 0, -1): prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] += resize( laterals[i], size=prev_shape, mode='bilinear', align_corners=self.align_corners) # build outputs fpn_outs = [ self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1) ] # append psp feature fpn_outs.append(laterals[-1]) for i in range(used_backbone_levels - 1, 0, -1): fpn_outs[i] = resize( fpn_outs[i], size=fpn_outs[0].shape[2:], mode='bilinear', align_corners=self.align_corners) fpn_outs = torch.cat(fpn_outs, dim=1) output = self.fpn_bottleneck(fpn_outs) output = self.cls_seg(output) return output ================================================ FILE: annotator/mmpkg/mmseg/models/losses/__init__.py ================================================ from .accuracy import Accuracy, accuracy from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, cross_entropy, mask_cross_entropy) from .dice_loss import DiceLoss from .lovasz_loss import LovaszLoss from .utils import reduce_loss, weight_reduce_loss, weighted_loss __all__ = [ 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss' ] ================================================ FILE: annotator/mmpkg/mmseg/models/losses/accuracy.py ================================================ import torch.nn as nn def accuracy(pred, target, topk=1, thresh=None): """Calculate accuracy according to the prediction and target. Args: pred (torch.Tensor): The model prediction, shape (N, num_class, ...) target (torch.Tensor): The target of each prediction, shape (N, , ...) topk (int | tuple[int], optional): If the predictions in ``topk`` matches the target, the predictions will be regarded as correct ones. Defaults to 1. thresh (float, optional): If not None, predictions with scores under this threshold are considered incorrect. Default to None. Returns: float | tuple[float]: If the input ``topk`` is a single integer, the function will return a single float as accuracy. If ``topk`` is a tuple containing multiple integers, the function will return a tuple containing accuracies of each ``topk`` number. """ assert isinstance(topk, (int, tuple)) if isinstance(topk, int): topk = (topk, ) return_single = True else: return_single = False maxk = max(topk) if pred.size(0) == 0: accu = [pred.new_tensor(0.) for i in range(len(topk))] return accu[0] if return_single else accu assert pred.ndim == target.ndim + 1 assert pred.size(0) == target.size(0) assert maxk <= pred.size(1), \ f'maxk {maxk} exceeds pred dimension {pred.size(1)}' pred_value, pred_label = pred.topk(maxk, dim=1) # transpose to shape (maxk, N, ...) pred_label = pred_label.transpose(0, 1) correct = pred_label.eq(target.unsqueeze(0).expand_as(pred_label)) if thresh is not None: # Only prediction values larger than thresh are counted as correct correct = correct & (pred_value > thresh).t() res = [] for k in topk: correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) res.append(correct_k.mul_(100.0 / target.numel())) return res[0] if return_single else res class Accuracy(nn.Module): """Accuracy calculation module.""" def __init__(self, topk=(1, ), thresh=None): """Module to calculate the accuracy. Args: topk (tuple, optional): The criterion used to calculate the accuracy. Defaults to (1,). thresh (float, optional): If not None, predictions with scores under this threshold are considered incorrect. Default to None. """ super().__init__() self.topk = topk self.thresh = thresh def forward(self, pred, target): """Forward function to calculate accuracy. Args: pred (torch.Tensor): Prediction of models. target (torch.Tensor): Target for each prediction. Returns: tuple[float]: The accuracies under different topk criterions. """ return accuracy(pred, target, self.topk, self.thresh) ================================================ FILE: annotator/mmpkg/mmseg/models/losses/cross_entropy_loss.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from ..builder import LOSSES from .utils import get_class_weight, weight_reduce_loss def cross_entropy(pred, label, weight=None, class_weight=None, reduction='mean', avg_factor=None, ignore_index=-100): """The wrapper function for :func:`F.cross_entropy`""" # class_weight is a manual rescaling weight given to each class. # If given, has to be a Tensor of size C element-wise losses loss = F.cross_entropy( pred, label, weight=class_weight, reduction='none', ignore_index=ignore_index) # apply weights and do the reduction if weight is not None: weight = weight.float() loss = weight_reduce_loss( loss, weight=weight, reduction=reduction, avg_factor=avg_factor) return loss def _expand_onehot_labels(labels, label_weights, target_shape, ignore_index): """Expand onehot labels to match the size of prediction.""" bin_labels = labels.new_zeros(target_shape) valid_mask = (labels >= 0) & (labels != ignore_index) inds = torch.nonzero(valid_mask, as_tuple=True) if inds[0].numel() > 0: if labels.dim() == 3: bin_labels[inds[0], labels[valid_mask], inds[1], inds[2]] = 1 else: bin_labels[inds[0], labels[valid_mask]] = 1 valid_mask = valid_mask.unsqueeze(1).expand(target_shape).float() if label_weights is None: bin_label_weights = valid_mask else: bin_label_weights = label_weights.unsqueeze(1).expand(target_shape) bin_label_weights *= valid_mask return bin_labels, bin_label_weights def binary_cross_entropy(pred, label, weight=None, reduction='mean', avg_factor=None, class_weight=None, ignore_index=255): """Calculate the binary CrossEntropy loss. Args: pred (torch.Tensor): The prediction with shape (N, 1). label (torch.Tensor): The learning label of the prediction. weight (torch.Tensor, optional): Sample-wise loss weight. reduction (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. class_weight (list[float], optional): The weight for each class. ignore_index (int | None): The label index to be ignored. Default: 255 Returns: torch.Tensor: The calculated loss """ if pred.dim() != label.dim(): assert (pred.dim() == 2 and label.dim() == 1) or ( pred.dim() == 4 and label.dim() == 3), \ 'Only pred shape [N, C], label shape [N] or pred shape [N, C, ' \ 'H, W], label shape [N, H, W] are supported' label, weight = _expand_onehot_labels(label, weight, pred.shape, ignore_index) # weighted element-wise losses if weight is not None: weight = weight.float() loss = F.binary_cross_entropy_with_logits( pred, label.float(), pos_weight=class_weight, reduction='none') # do the reduction for the weighted loss loss = weight_reduce_loss( loss, weight, reduction=reduction, avg_factor=avg_factor) return loss def mask_cross_entropy(pred, target, label, reduction='mean', avg_factor=None, class_weight=None, ignore_index=None): """Calculate the CrossEntropy loss for masks. Args: pred (torch.Tensor): The prediction with shape (N, C), C is the number of classes. target (torch.Tensor): The learning label of the prediction. label (torch.Tensor): ``label`` indicates the class label of the mask' corresponding object. This will be used to select the mask in the of the class which the object belongs to when the mask prediction if not class-agnostic. reduction (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. class_weight (list[float], optional): The weight for each class. ignore_index (None): Placeholder, to be consistent with other loss. Default: None. Returns: torch.Tensor: The calculated loss """ assert ignore_index is None, 'BCE loss does not support ignore_index' # TODO: handle these two reserved arguments assert reduction == 'mean' and avg_factor is None num_rois = pred.size()[0] inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) pred_slice = pred[inds, label].squeeze(1) return F.binary_cross_entropy_with_logits( pred_slice, target, weight=class_weight, reduction='mean')[None] @LOSSES.register_module() class CrossEntropyLoss(nn.Module): """CrossEntropyLoss. Args: use_sigmoid (bool, optional): Whether the prediction uses sigmoid of softmax. Defaults to False. use_mask (bool, optional): Whether to use mask cross entropy loss. Defaults to False. reduction (str, optional): . Defaults to 'mean'. Options are "none", "mean" and "sum". class_weight (list[float] | str, optional): Weight of each class. If in str format, read them from a file. Defaults to None. loss_weight (float, optional): Weight of the loss. Defaults to 1.0. """ def __init__(self, use_sigmoid=False, use_mask=False, reduction='mean', class_weight=None, loss_weight=1.0): super(CrossEntropyLoss, self).__init__() assert (use_sigmoid is False) or (use_mask is False) self.use_sigmoid = use_sigmoid self.use_mask = use_mask self.reduction = reduction self.loss_weight = loss_weight self.class_weight = get_class_weight(class_weight) if self.use_sigmoid: self.cls_criterion = binary_cross_entropy elif self.use_mask: self.cls_criterion = mask_cross_entropy else: self.cls_criterion = cross_entropy def forward(self, cls_score, label, weight=None, avg_factor=None, reduction_override=None, **kwargs): """Forward function.""" assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if self.class_weight is not None: class_weight = cls_score.new_tensor(self.class_weight) else: class_weight = None loss_cls = self.loss_weight * self.cls_criterion( cls_score, label, weight, class_weight=class_weight, reduction=reduction, avg_factor=avg_factor, **kwargs) return loss_cls ================================================ FILE: annotator/mmpkg/mmseg/models/losses/dice_loss.py ================================================ """Modified from https://github.com/LikeLy-Journey/SegmenTron/blob/master/ segmentron/solver/loss.py (Apache-2.0 License)""" import torch import torch.nn as nn import torch.nn.functional as F from ..builder import LOSSES from .utils import get_class_weight, weighted_loss @weighted_loss def dice_loss(pred, target, valid_mask, smooth=1, exponent=2, class_weight=None, ignore_index=255): assert pred.shape[0] == target.shape[0] total_loss = 0 num_classes = pred.shape[1] for i in range(num_classes): if i != ignore_index: dice_loss = binary_dice_loss( pred[:, i], target[..., i], valid_mask=valid_mask, smooth=smooth, exponent=exponent) if class_weight is not None: dice_loss *= class_weight[i] total_loss += dice_loss return total_loss / num_classes @weighted_loss def binary_dice_loss(pred, target, valid_mask, smooth=1, exponent=2, **kwards): assert pred.shape[0] == target.shape[0] pred = pred.reshape(pred.shape[0], -1) target = target.reshape(target.shape[0], -1) valid_mask = valid_mask.reshape(valid_mask.shape[0], -1) num = torch.sum(torch.mul(pred, target) * valid_mask, dim=1) * 2 + smooth den = torch.sum(pred.pow(exponent) + target.pow(exponent), dim=1) + smooth return 1 - num / den @LOSSES.register_module() class DiceLoss(nn.Module): """DiceLoss. This loss is proposed in `V-Net: Fully Convolutional Neural Networks for Volumetric Medical Image Segmentation `_. Args: loss_type (str, optional): Binary or multi-class loss. Default: 'multi_class'. Options are "binary" and "multi_class". smooth (float): A float number to smooth loss, and avoid NaN error. Default: 1 exponent (float): An float number to calculate denominator value: \\sum{x^exponent} + \\sum{y^exponent}. Default: 2. reduction (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". This parameter only works when per_image is True. Default: 'mean'. class_weight (list[float] | str, optional): Weight of each class. If in str format, read them from a file. Defaults to None. loss_weight (float, optional): Weight of the loss. Default to 1.0. ignore_index (int | None): The label index to be ignored. Default: 255. """ def __init__(self, smooth=1, exponent=2, reduction='mean', class_weight=None, loss_weight=1.0, ignore_index=255, **kwards): super(DiceLoss, self).__init__() self.smooth = smooth self.exponent = exponent self.reduction = reduction self.class_weight = get_class_weight(class_weight) self.loss_weight = loss_weight self.ignore_index = ignore_index def forward(self, pred, target, avg_factor=None, reduction_override=None, **kwards): assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if self.class_weight is not None: class_weight = pred.new_tensor(self.class_weight) else: class_weight = None pred = F.softmax(pred, dim=1) num_classes = pred.shape[1] one_hot_target = F.one_hot( torch.clamp(target.long(), 0, num_classes - 1), num_classes=num_classes) valid_mask = (target != self.ignore_index).long() loss = self.loss_weight * dice_loss( pred, one_hot_target, valid_mask=valid_mask, reduction=reduction, avg_factor=avg_factor, smooth=self.smooth, exponent=self.exponent, class_weight=class_weight, ignore_index=self.ignore_index) return loss ================================================ FILE: annotator/mmpkg/mmseg/models/losses/lovasz_loss.py ================================================ """Modified from https://github.com/bermanmaxim/LovaszSoftmax/blob/master/pytor ch/lovasz_losses.py Lovasz-Softmax and Jaccard hinge loss in PyTorch Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License)""" import annotator.mmpkg.mmcv as mmcv import torch import torch.nn as nn import torch.nn.functional as F from ..builder import LOSSES from .utils import get_class_weight, weight_reduce_loss def lovasz_grad(gt_sorted): """Computes gradient of the Lovasz extension w.r.t sorted errors. See Alg. 1 in paper. """ p = len(gt_sorted) gts = gt_sorted.sum() intersection = gts - gt_sorted.float().cumsum(0) union = gts + (1 - gt_sorted).float().cumsum(0) jaccard = 1. - intersection / union if p > 1: # cover 1-pixel case jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] return jaccard def flatten_binary_logits(logits, labels, ignore_index=None): """Flattens predictions in the batch (binary case) Remove labels equal to 'ignore_index'.""" logits = logits.view(-1) labels = labels.view(-1) if ignore_index is None: return logits, labels valid = (labels != ignore_index) vlogits = logits[valid] vlabels = labels[valid] return vlogits, vlabels def flatten_probs(probs, labels, ignore_index=None): """Flattens predictions in the batch.""" if probs.dim() == 3: # assumes output of a sigmoid layer B, H, W = probs.size() probs = probs.view(B, 1, H, W) B, C, H, W = probs.size() probs = probs.permute(0, 2, 3, 1).contiguous().view(-1, C) # B*H*W, C=P,C labels = labels.view(-1) if ignore_index is None: return probs, labels valid = (labels != ignore_index) vprobs = probs[valid.nonzero().squeeze()] vlabels = labels[valid] return vprobs, vlabels def lovasz_hinge_flat(logits, labels): """Binary Lovasz hinge loss. Args: logits (torch.Tensor): [P], logits at each prediction (between -infty and +infty). labels (torch.Tensor): [P], binary ground truth labels (0 or 1). Returns: torch.Tensor: The calculated loss. """ if len(labels) == 0: # only void pixels, the gradients should be 0 return logits.sum() * 0. signs = 2. * labels.float() - 1. errors = (1. - logits * signs) errors_sorted, perm = torch.sort(errors, dim=0, descending=True) perm = perm.data gt_sorted = labels[perm] grad = lovasz_grad(gt_sorted) loss = torch.dot(F.relu(errors_sorted), grad) return loss def lovasz_hinge(logits, labels, classes='present', per_image=False, class_weight=None, reduction='mean', avg_factor=None, ignore_index=255): """Binary Lovasz hinge loss. Args: logits (torch.Tensor): [B, H, W], logits at each pixel (between -infty and +infty). labels (torch.Tensor): [B, H, W], binary ground truth masks (0 or 1). classes (str | list[int], optional): Placeholder, to be consistent with other loss. Default: None. per_image (bool, optional): If per_image is True, compute the loss per image instead of per batch. Default: False. class_weight (list[float], optional): Placeholder, to be consistent with other loss. Default: None. reduction (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". This parameter only works when per_image is True. Default: 'mean'. avg_factor (int, optional): Average factor that is used to average the loss. This parameter only works when per_image is True. Default: None. ignore_index (int | None): The label index to be ignored. Default: 255. Returns: torch.Tensor: The calculated loss. """ if per_image: loss = [ lovasz_hinge_flat(*flatten_binary_logits( logit.unsqueeze(0), label.unsqueeze(0), ignore_index)) for logit, label in zip(logits, labels) ] loss = weight_reduce_loss( torch.stack(loss), None, reduction, avg_factor) else: loss = lovasz_hinge_flat( *flatten_binary_logits(logits, labels, ignore_index)) return loss def lovasz_softmax_flat(probs, labels, classes='present', class_weight=None): """Multi-class Lovasz-Softmax loss. Args: probs (torch.Tensor): [P, C], class probabilities at each prediction (between 0 and 1). labels (torch.Tensor): [P], ground truth labels (between 0 and C - 1). classes (str | list[int], optional): Classes chosen to calculate loss. 'all' for all classes, 'present' for classes present in labels, or a list of classes to average. Default: 'present'. class_weight (list[float], optional): The weight for each class. Default: None. Returns: torch.Tensor: The calculated loss. """ if probs.numel() == 0: # only void pixels, the gradients should be 0 return probs * 0. C = probs.size(1) losses = [] class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes for c in class_to_sum: fg = (labels == c).float() # foreground for class c if (classes == 'present' and fg.sum() == 0): continue if C == 1: if len(classes) > 1: raise ValueError('Sigmoid output possible only with 1 class') class_pred = probs[:, 0] else: class_pred = probs[:, c] errors = (fg - class_pred).abs() errors_sorted, perm = torch.sort(errors, 0, descending=True) perm = perm.data fg_sorted = fg[perm] loss = torch.dot(errors_sorted, lovasz_grad(fg_sorted)) if class_weight is not None: loss *= class_weight[c] losses.append(loss) return torch.stack(losses).mean() def lovasz_softmax(probs, labels, classes='present', per_image=False, class_weight=None, reduction='mean', avg_factor=None, ignore_index=255): """Multi-class Lovasz-Softmax loss. Args: probs (torch.Tensor): [B, C, H, W], class probabilities at each prediction (between 0 and 1). labels (torch.Tensor): [B, H, W], ground truth labels (between 0 and C - 1). classes (str | list[int], optional): Classes chosen to calculate loss. 'all' for all classes, 'present' for classes present in labels, or a list of classes to average. Default: 'present'. per_image (bool, optional): If per_image is True, compute the loss per image instead of per batch. Default: False. class_weight (list[float], optional): The weight for each class. Default: None. reduction (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". This parameter only works when per_image is True. Default: 'mean'. avg_factor (int, optional): Average factor that is used to average the loss. This parameter only works when per_image is True. Default: None. ignore_index (int | None): The label index to be ignored. Default: 255. Returns: torch.Tensor: The calculated loss. """ if per_image: loss = [ lovasz_softmax_flat( *flatten_probs( prob.unsqueeze(0), label.unsqueeze(0), ignore_index), classes=classes, class_weight=class_weight) for prob, label in zip(probs, labels) ] loss = weight_reduce_loss( torch.stack(loss), None, reduction, avg_factor) else: loss = lovasz_softmax_flat( *flatten_probs(probs, labels, ignore_index), classes=classes, class_weight=class_weight) return loss @LOSSES.register_module() class LovaszLoss(nn.Module): """LovaszLoss. This loss is proposed in `The Lovasz-Softmax loss: A tractable surrogate for the optimization of the intersection-over-union measure in neural networks `_. Args: loss_type (str, optional): Binary or multi-class loss. Default: 'multi_class'. Options are "binary" and "multi_class". classes (str | list[int], optional): Classes chosen to calculate loss. 'all' for all classes, 'present' for classes present in labels, or a list of classes to average. Default: 'present'. per_image (bool, optional): If per_image is True, compute the loss per image instead of per batch. Default: False. reduction (str, optional): The method used to reduce the loss. Options are "none", "mean" and "sum". This parameter only works when per_image is True. Default: 'mean'. class_weight (list[float] | str, optional): Weight of each class. If in str format, read them from a file. Defaults to None. loss_weight (float, optional): Weight of the loss. Defaults to 1.0. """ def __init__(self, loss_type='multi_class', classes='present', per_image=False, reduction='mean', class_weight=None, loss_weight=1.0): super(LovaszLoss, self).__init__() assert loss_type in ('binary', 'multi_class'), "loss_type should be \ 'binary' or 'multi_class'." if loss_type == 'binary': self.cls_criterion = lovasz_hinge else: self.cls_criterion = lovasz_softmax assert classes in ('all', 'present') or mmcv.is_list_of(classes, int) if not per_image: assert reduction == 'none', "reduction should be 'none' when \ per_image is False." self.classes = classes self.per_image = per_image self.reduction = reduction self.loss_weight = loss_weight self.class_weight = get_class_weight(class_weight) def forward(self, cls_score, label, weight=None, avg_factor=None, reduction_override=None, **kwargs): """Forward function.""" assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if self.class_weight is not None: class_weight = cls_score.new_tensor(self.class_weight) else: class_weight = None # if multi-class loss, transform logits to probs if self.cls_criterion == lovasz_softmax: cls_score = F.softmax(cls_score, dim=1) loss_cls = self.loss_weight * self.cls_criterion( cls_score, label, self.classes, self.per_image, class_weight=class_weight, reduction=reduction, avg_factor=avg_factor, **kwargs) return loss_cls ================================================ FILE: annotator/mmpkg/mmseg/models/losses/utils.py ================================================ import functools import annotator.mmpkg.mmcv as mmcv import numpy as np import torch.nn.functional as F def get_class_weight(class_weight): """Get class weight for loss function. Args: class_weight (list[float] | str | None): If class_weight is a str, take it as a file name and read from it. """ if isinstance(class_weight, str): # take it as a file path if class_weight.endswith('.npy'): class_weight = np.load(class_weight) else: # pkl, json or yaml class_weight = mmcv.load(class_weight) return class_weight def reduce_loss(loss, reduction): """Reduce loss as specified. Args: loss (Tensor): Elementwise loss tensor. reduction (str): Options are "none", "mean" and "sum". Return: Tensor: Reduced loss tensor. """ reduction_enum = F._Reduction.get_enum(reduction) # none: 0, elementwise_mean:1, sum: 2 if reduction_enum == 0: return loss elif reduction_enum == 1: return loss.mean() elif reduction_enum == 2: return loss.sum() def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): """Apply element-wise weight and reduce loss. Args: loss (Tensor): Element-wise loss. weight (Tensor): Element-wise weights. reduction (str): Same as built-in losses of PyTorch. avg_factor (float): Avarage factor when computing the mean of losses. Returns: Tensor: Processed loss values. """ # if weight is specified, apply element-wise weight if weight is not None: assert weight.dim() == loss.dim() if weight.dim() > 1: assert weight.size(1) == 1 or weight.size(1) == loss.size(1) loss = loss * weight # if avg_factor is not specified, just reduce the loss if avg_factor is None: loss = reduce_loss(loss, reduction) else: # if reduction is mean, then average the loss by avg_factor if reduction == 'mean': loss = loss.sum() / avg_factor # if reduction is 'none', then do nothing, otherwise raise an error elif reduction != 'none': raise ValueError('avg_factor can not be used with reduction="sum"') return loss def weighted_loss(loss_func): """Create a weighted version of a given loss function. To use this decorator, the loss function must have the signature like `loss_func(pred, target, **kwargs)`. The function only needs to compute element-wise loss without any reduction. This decorator will add weight and reduction arguments to the function. The decorated function will have the signature like `loss_func(pred, target, weight=None, reduction='mean', avg_factor=None, **kwargs)`. :Example: >>> import torch >>> @weighted_loss >>> def l1_loss(pred, target): >>> return (pred - target).abs() >>> pred = torch.Tensor([0, 2, 3]) >>> target = torch.Tensor([1, 1, 1]) >>> weight = torch.Tensor([1, 0, 1]) >>> l1_loss(pred, target) tensor(1.3333) >>> l1_loss(pred, target, weight) tensor(1.) >>> l1_loss(pred, target, reduction='none') tensor([1., 1., 2.]) >>> l1_loss(pred, target, weight, avg_factor=2) tensor(1.5000) """ @functools.wraps(loss_func) def wrapper(pred, target, weight=None, reduction='mean', avg_factor=None, **kwargs): # get element-wise loss loss = loss_func(pred, target, **kwargs) loss = weight_reduce_loss(loss, weight, reduction, avg_factor) return loss return wrapper ================================================ FILE: annotator/mmpkg/mmseg/models/necks/__init__.py ================================================ from .fpn import FPN from .multilevel_neck import MultiLevelNeck __all__ = ['FPN', 'MultiLevelNeck'] ================================================ FILE: annotator/mmpkg/mmseg/models/necks/fpn.py ================================================ import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule, xavier_init from ..builder import NECKS @NECKS.register_module() class FPN(nn.Module): """Feature Pyramid Network. This is an implementation of - Feature Pyramid Networks for Object Detection (https://arxiv.org/abs/1612.03144) Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale) num_outs (int): Number of output scales. start_level (int): Index of the start input backbone level used to build the feature pyramid. Default: 0. end_level (int): Index of the end input backbone level (exclusive) to build the feature pyramid. Default: -1, which means the last level. add_extra_convs (bool | str): If bool, it decides whether to add conv layers on top of the original feature maps. Default to False. If True, its actual mode is specified by `extra_convs_on_inputs`. If str, it specifies the source feature map of the extra convs. Only the following options are allowed - 'on_input': Last feat map of neck inputs (i.e. backbone feature). - 'on_lateral': Last feature map after lateral convs. - 'on_output': The last output feature map after fpn convs. extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs on the original feature from the backbone. If True, it is equivalent to `add_extra_convs='on_input'`. If False, it is equivalent to set `add_extra_convs='on_output'`. Default to True. relu_before_extra_convs (bool): Whether to apply relu before the extra conv. Default: False. no_norm_on_lateral (bool): Whether to apply norm on lateral. Default: False. conv_cfg (dict): Config dict for convolution layer. Default: None. norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (str): Config dict for activation layer in ConvModule. Default: None. upsample_cfg (dict): Config dict for interpolate layer. Default: `dict(mode='nearest')` Example: >>> import torch >>> in_channels = [2, 3, 5, 7] >>> scales = [340, 170, 84, 43] >>> inputs = [torch.rand(1, c, s, s) ... for c, s in zip(in_channels, scales)] >>> self = FPN(in_channels, 11, len(in_channels)).eval() >>> outputs = self.forward(inputs) >>> for i in range(len(outputs)): ... print(f'outputs[{i}].shape = {outputs[i].shape}') outputs[0].shape = torch.Size([1, 11, 340, 340]) outputs[1].shape = torch.Size([1, 11, 170, 170]) outputs[2].shape = torch.Size([1, 11, 84, 84]) outputs[3].shape = torch.Size([1, 11, 43, 43]) """ def __init__(self, in_channels, out_channels, num_outs, start_level=0, end_level=-1, add_extra_convs=False, extra_convs_on_inputs=False, relu_before_extra_convs=False, no_norm_on_lateral=False, conv_cfg=None, norm_cfg=None, act_cfg=None, upsample_cfg=dict(mode='nearest')): super(FPN, self).__init__() assert isinstance(in_channels, list) self.in_channels = in_channels self.out_channels = out_channels self.num_ins = len(in_channels) self.num_outs = num_outs self.relu_before_extra_convs = relu_before_extra_convs self.no_norm_on_lateral = no_norm_on_lateral self.fp16_enabled = False self.upsample_cfg = upsample_cfg.copy() if end_level == -1: self.backbone_end_level = self.num_ins assert num_outs >= self.num_ins - start_level else: # if end_level < inputs, no extra level is allowed self.backbone_end_level = end_level assert end_level <= len(in_channels) assert num_outs == end_level - start_level self.start_level = start_level self.end_level = end_level self.add_extra_convs = add_extra_convs assert isinstance(add_extra_convs, (str, bool)) if isinstance(add_extra_convs, str): # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') elif add_extra_convs: # True if extra_convs_on_inputs: # For compatibility with previous release # TODO: deprecate `extra_convs_on_inputs` self.add_extra_convs = 'on_input' else: self.add_extra_convs = 'on_output' self.lateral_convs = nn.ModuleList() self.fpn_convs = nn.ModuleList() for i in range(self.start_level, self.backbone_end_level): l_conv = ConvModule( in_channels[i], out_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, act_cfg=act_cfg, inplace=False) fpn_conv = ConvModule( out_channels, out_channels, 3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.lateral_convs.append(l_conv) self.fpn_convs.append(fpn_conv) # add extra conv layers (e.g., RetinaNet) extra_levels = num_outs - self.backbone_end_level + self.start_level if self.add_extra_convs and extra_levels >= 1: for i in range(extra_levels): if i == 0 and self.add_extra_convs == 'on_input': in_channels = self.in_channels[self.backbone_end_level - 1] else: in_channels = out_channels extra_fpn_conv = ConvModule( in_channels, out_channels, 3, stride=2, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.fpn_convs.append(extra_fpn_conv) # default init_weights for conv(msra) and norm in ConvModule def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): xavier_init(m, distribution='uniform') def forward(self, inputs): assert len(inputs) == len(self.in_channels) # build laterals laterals = [ lateral_conv(inputs[i + self.start_level]) for i, lateral_conv in enumerate(self.lateral_convs) ] # build top-down path used_backbone_levels = len(laterals) for i in range(used_backbone_levels - 1, 0, -1): # In some cases, fixing `scale factor` (e.g. 2) is preferred, but # it cannot co-exist with `size` in `F.interpolate`. if 'scale_factor' in self.upsample_cfg: laterals[i - 1] += F.interpolate(laterals[i], **self.upsample_cfg) else: prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] += F.interpolate( laterals[i], size=prev_shape, **self.upsample_cfg) # build outputs # part 1: from original levels outs = [ self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) ] # part 2: add extra levels if self.num_outs > len(outs): # use max pool to get more levels on top of outputs # (e.g., Faster R-CNN, Mask R-CNN) if not self.add_extra_convs: for i in range(self.num_outs - used_backbone_levels): outs.append(F.max_pool2d(outs[-1], 1, stride=2)) # add conv layers on top of original feature maps (RetinaNet) else: if self.add_extra_convs == 'on_input': extra_source = inputs[self.backbone_end_level - 1] elif self.add_extra_convs == 'on_lateral': extra_source = laterals[-1] elif self.add_extra_convs == 'on_output': extra_source = outs[-1] else: raise NotImplementedError outs.append(self.fpn_convs[used_backbone_levels](extra_source)) for i in range(used_backbone_levels + 1, self.num_outs): if self.relu_before_extra_convs: outs.append(self.fpn_convs[i](F.relu(outs[-1]))) else: outs.append(self.fpn_convs[i](outs[-1])) return tuple(outs) ================================================ FILE: annotator/mmpkg/mmseg/models/necks/multilevel_neck.py ================================================ import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmcv.cnn import ConvModule from ..builder import NECKS @NECKS.register_module() class MultiLevelNeck(nn.Module): """MultiLevelNeck. A neck structure connect vit backbone and decoder_heads. Args: in_channels (List[int]): Number of input channels per scale. out_channels (int): Number of output channels (used at each scale). scales (List[int]): Scale factors for each input feature map. norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (dict): Config dict for activation layer in ConvModule. Default: None. """ def __init__(self, in_channels, out_channels, scales=[0.5, 1, 2, 4], norm_cfg=None, act_cfg=None): super(MultiLevelNeck, self).__init__() assert isinstance(in_channels, list) self.in_channels = in_channels self.out_channels = out_channels self.scales = scales self.num_outs = len(scales) self.lateral_convs = nn.ModuleList() self.convs = nn.ModuleList() for in_channel in in_channels: self.lateral_convs.append( ConvModule( in_channel, out_channels, kernel_size=1, norm_cfg=norm_cfg, act_cfg=act_cfg)) for _ in range(self.num_outs): self.convs.append( ConvModule( out_channels, out_channels, kernel_size=3, padding=1, stride=1, norm_cfg=norm_cfg, act_cfg=act_cfg)) def forward(self, inputs): assert len(inputs) == len(self.in_channels) print(inputs[0].shape) inputs = [ lateral_conv(inputs[i]) for i, lateral_conv in enumerate(self.lateral_convs) ] # for len(inputs) not equal to self.num_outs if len(inputs) == 1: inputs = [inputs[0] for _ in range(self.num_outs)] outs = [] for i in range(self.num_outs): x_resize = F.interpolate( inputs[i], scale_factor=self.scales[i], mode='bilinear') outs.append(self.convs[i](x_resize)) return tuple(outs) ================================================ FILE: annotator/mmpkg/mmseg/models/segmentors/__init__.py ================================================ from .base import BaseSegmentor from .cascade_encoder_decoder import CascadeEncoderDecoder from .encoder_decoder import EncoderDecoder __all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder'] ================================================ FILE: annotator/mmpkg/mmseg/models/segmentors/base.py ================================================ import logging import warnings from abc import ABCMeta, abstractmethod from collections import OrderedDict import annotator.mmpkg.mmcv as mmcv import numpy as np import torch import torch.distributed as dist import torch.nn as nn from annotator.mmpkg.mmcv.runner import auto_fp16 class BaseSegmentor(nn.Module): """Base class for segmentors.""" __metaclass__ = ABCMeta def __init__(self): super(BaseSegmentor, self).__init__() self.fp16_enabled = False @property def with_neck(self): """bool: whether the segmentor has neck""" return hasattr(self, 'neck') and self.neck is not None @property def with_auxiliary_head(self): """bool: whether the segmentor has auxiliary head""" return hasattr(self, 'auxiliary_head') and self.auxiliary_head is not None @property def with_decode_head(self): """bool: whether the segmentor has decode head""" return hasattr(self, 'decode_head') and self.decode_head is not None @abstractmethod def extract_feat(self, imgs): """Placeholder for extract features from images.""" pass @abstractmethod def encode_decode(self, img, img_metas): """Placeholder for encode images with backbone and decode into a semantic segmentation map of the same size as input.""" pass @abstractmethod def forward_train(self, imgs, img_metas, **kwargs): """Placeholder for Forward function for training.""" pass @abstractmethod def simple_test(self, img, img_meta, **kwargs): """Placeholder for single image test.""" pass @abstractmethod def aug_test(self, imgs, img_metas, **kwargs): """Placeholder for augmentation test.""" pass def init_weights(self, pretrained=None): """Initialize the weights in segmentor. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ if pretrained is not None: logger = logging.getLogger() logger.info(f'load model from: {pretrained}') def forward_test(self, imgs, img_metas, **kwargs): """ Args: imgs (List[Tensor]): the outer list indicates test-time augmentations and inner Tensor should have a shape NxCxHxW, which contains all images in the batch. img_metas (List[List[dict]]): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch. """ for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError(f'{name} must be a list, but got ' f'{type(var)}') num_augs = len(imgs) if num_augs != len(img_metas): raise ValueError(f'num of augmentations ({len(imgs)}) != ' f'num of image meta ({len(img_metas)})') # all images in the same aug batch all of the same ori_shape and pad # shape for img_meta in img_metas: ori_shapes = [_['ori_shape'] for _ in img_meta] assert all(shape == ori_shapes[0] for shape in ori_shapes) img_shapes = [_['img_shape'] for _ in img_meta] assert all(shape == img_shapes[0] for shape in img_shapes) pad_shapes = [_['pad_shape'] for _ in img_meta] assert all(shape == pad_shapes[0] for shape in pad_shapes) if num_augs == 1: return self.simple_test(imgs[0], img_metas[0], **kwargs) else: return self.aug_test(imgs, img_metas, **kwargs) @auto_fp16(apply_to=('img', )) def forward(self, img, img_metas, return_loss=True, **kwargs): """Calls either :func:`forward_train` or :func:`forward_test` depending on whether ``return_loss`` is ``True``. Note this setting will change the expected inputs. When ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor and List[dict]), and when ``resturn_loss=False``, img and img_meta should be double nested (i.e. List[Tensor], List[List[dict]]), with the outer list indicating test time augmentations. """ if return_loss: return self.forward_train(img, img_metas, **kwargs) else: return self.forward_test(img, img_metas, **kwargs) def train_step(self, data_batch, optimizer, **kwargs): """The iteration step during training. This method defines an iteration step during training, except for the back propagation and optimizer updating, which are done in an optimizer hook. Note that in some complicated cases or models, the whole process including back propagation and optimizer updating is also defined in this method, such as GAN. Args: data (dict): The output of dataloader. optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of runner is passed to ``train_step()``. This argument is unused and reserved. Returns: dict: It should contain at least 3 keys: ``loss``, ``log_vars``, ``num_samples``. ``loss`` is a tensor for back propagation, which can be a weighted sum of multiple losses. ``log_vars`` contains all the variables to be sent to the logger. ``num_samples`` indicates the batch size (when the model is DDP, it means the batch size on each GPU), which is used for averaging the logs. """ losses = self(**data_batch) loss, log_vars = self._parse_losses(losses) outputs = dict( loss=loss, log_vars=log_vars, num_samples=len(data_batch['img_metas'])) return outputs def val_step(self, data_batch, **kwargs): """The iteration step during validation. This method shares the same signature as :func:`train_step`, but used during val epochs. Note that the evaluation after training epochs is not implemented with this method, but an evaluation hook. """ output = self(**data_batch, **kwargs) return output @staticmethod def _parse_losses(losses): """Parse the raw outputs (losses) of the network. Args: losses (dict): Raw output of the network, which usually contain losses and other necessary information. Returns: tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor which may be a weighted sum of all losses, log_vars contains all the variables to be sent to the logger. """ log_vars = OrderedDict() for loss_name, loss_value in losses.items(): if isinstance(loss_value, torch.Tensor): log_vars[loss_name] = loss_value.mean() elif isinstance(loss_value, list): log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) else: raise TypeError( f'{loss_name} is not a tensor or list of tensors') loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key) log_vars['loss'] = loss for loss_name, loss_value in log_vars.items(): # reduce loss when distributed training if dist.is_available() and dist.is_initialized(): loss_value = loss_value.data.clone() dist.all_reduce(loss_value.div_(dist.get_world_size())) log_vars[loss_name] = loss_value.item() return loss, log_vars def show_result(self, img, result, palette=None, win_name='', show=False, wait_time=0, out_file=None, opacity=0.5): """Draw `result` over `img`. Args: img (str or Tensor): The image to be displayed. result (Tensor): The semantic segmentation results to draw over `img`. palette (list[list[int]]] | np.ndarray | None): The palette of segmentation map. If None is given, random palette will be generated. Default: None win_name (str): The window name. wait_time (int): Value of waitKey param. Default: 0. show (bool): Whether to show the image. Default: False. out_file (str or None): The filename to write the image. Default: None. opacity(float): Opacity of painted segmentation map. Default 0.5. Must be in (0, 1] range. Returns: img (Tensor): Only if not `show` or `out_file` """ img = mmcv.imread(img) img = img.copy() seg = result[0] if palette is None: if self.PALETTE is None: palette = np.random.randint( 0, 255, size=(len(self.CLASSES), 3)) else: palette = self.PALETTE palette = np.array(palette) assert palette.shape[0] == len(self.CLASSES) assert palette.shape[1] == 3 assert len(palette.shape) == 2 assert 0 < opacity <= 1.0 color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) for label, color in enumerate(palette): color_seg[seg == label, :] = color # convert to BGR color_seg = color_seg[..., ::-1] img = img * (1 - opacity) + color_seg * opacity img = img.astype(np.uint8) # if out_file specified, do not show image in window if out_file is not None: show = False if show: mmcv.imshow(img, win_name, wait_time) if out_file is not None: mmcv.imwrite(img, out_file) if not (show or out_file): warnings.warn('show==False and out_file is not specified, only ' 'result image will be returned') return img ================================================ FILE: annotator/mmpkg/mmseg/models/segmentors/cascade_encoder_decoder.py ================================================ from torch import nn from annotator.mmpkg.mmseg.core import add_prefix from annotator.mmpkg.mmseg.ops import resize from .. import builder from ..builder import SEGMENTORS from .encoder_decoder import EncoderDecoder @SEGMENTORS.register_module() class CascadeEncoderDecoder(EncoderDecoder): """Cascade Encoder Decoder segmentors. CascadeEncoderDecoder almost the same as EncoderDecoder, while decoders of CascadeEncoderDecoder are cascaded. The output of previous decoder_head will be the input of next decoder_head. """ def __init__(self, num_stages, backbone, decode_head, neck=None, auxiliary_head=None, train_cfg=None, test_cfg=None, pretrained=None): self.num_stages = num_stages super(CascadeEncoderDecoder, self).__init__( backbone=backbone, decode_head=decode_head, neck=neck, auxiliary_head=auxiliary_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained) def _init_decode_head(self, decode_head): """Initialize ``decode_head``""" assert isinstance(decode_head, list) assert len(decode_head) == self.num_stages self.decode_head = nn.ModuleList() for i in range(self.num_stages): self.decode_head.append(builder.build_head(decode_head[i])) self.align_corners = self.decode_head[-1].align_corners self.num_classes = self.decode_head[-1].num_classes def init_weights(self, pretrained=None): """Initialize the weights in backbone and heads. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ self.backbone.init_weights(pretrained=pretrained) for i in range(self.num_stages): self.decode_head[i].init_weights() if self.with_auxiliary_head: if isinstance(self.auxiliary_head, nn.ModuleList): for aux_head in self.auxiliary_head: aux_head.init_weights() else: self.auxiliary_head.init_weights() def encode_decode(self, img, img_metas): """Encode images with backbone and decode into a semantic segmentation map of the same size as input.""" x = self.extract_feat(img) out = self.decode_head[0].forward_test(x, img_metas, self.test_cfg) for i in range(1, self.num_stages): out = self.decode_head[i].forward_test(x, out, img_metas, self.test_cfg) out = resize( input=out, size=img.shape[2:], mode='bilinear', align_corners=self.align_corners) return out def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg): """Run forward function and calculate loss for decode head in training.""" losses = dict() loss_decode = self.decode_head[0].forward_train( x, img_metas, gt_semantic_seg, self.train_cfg) losses.update(add_prefix(loss_decode, 'decode_0')) for i in range(1, self.num_stages): # forward test again, maybe unnecessary for most methods. prev_outputs = self.decode_head[i - 1].forward_test( x, img_metas, self.test_cfg) loss_decode = self.decode_head[i].forward_train( x, prev_outputs, img_metas, gt_semantic_seg, self.train_cfg) losses.update(add_prefix(loss_decode, f'decode_{i}')) return losses ================================================ FILE: annotator/mmpkg/mmseg/models/segmentors/encoder_decoder.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from annotator.mmpkg.mmseg.core import add_prefix from annotator.mmpkg.mmseg.ops import resize from .. import builder from ..builder import SEGMENTORS from .base import BaseSegmentor @SEGMENTORS.register_module() class EncoderDecoder(BaseSegmentor): """Encoder Decoder segmentors. EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. Note that auxiliary_head is only used for deep supervision during training, which could be dumped during inference. """ def __init__(self, backbone, decode_head, neck=None, auxiliary_head=None, train_cfg=None, test_cfg=None, pretrained=None): super(EncoderDecoder, self).__init__() self.backbone = builder.build_backbone(backbone) if neck is not None: self.neck = builder.build_neck(neck) self._init_decode_head(decode_head) self._init_auxiliary_head(auxiliary_head) self.train_cfg = train_cfg self.test_cfg = test_cfg self.init_weights(pretrained=pretrained) assert self.with_decode_head def _init_decode_head(self, decode_head): """Initialize ``decode_head``""" self.decode_head = builder.build_head(decode_head) self.align_corners = self.decode_head.align_corners self.num_classes = self.decode_head.num_classes def _init_auxiliary_head(self, auxiliary_head): """Initialize ``auxiliary_head``""" if auxiliary_head is not None: if isinstance(auxiliary_head, list): self.auxiliary_head = nn.ModuleList() for head_cfg in auxiliary_head: self.auxiliary_head.append(builder.build_head(head_cfg)) else: self.auxiliary_head = builder.build_head(auxiliary_head) def init_weights(self, pretrained=None): """Initialize the weights in backbone and heads. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ super(EncoderDecoder, self).init_weights(pretrained) self.backbone.init_weights(pretrained=pretrained) self.decode_head.init_weights() if self.with_auxiliary_head: if isinstance(self.auxiliary_head, nn.ModuleList): for aux_head in self.auxiliary_head: aux_head.init_weights() else: self.auxiliary_head.init_weights() def extract_feat(self, img): """Extract features from images.""" x = self.backbone(img) if self.with_neck: x = self.neck(x) return x def encode_decode(self, img, img_metas): """Encode images with backbone and decode into a semantic segmentation map of the same size as input.""" x = self.extract_feat(img) out = self._decode_head_forward_test(x, img_metas) out = resize( input=out, size=img.shape[2:], mode='bilinear', align_corners=self.align_corners) return out def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg): """Run forward function and calculate loss for decode head in training.""" losses = dict() loss_decode = self.decode_head.forward_train(x, img_metas, gt_semantic_seg, self.train_cfg) losses.update(add_prefix(loss_decode, 'decode')) return losses def _decode_head_forward_test(self, x, img_metas): """Run forward function and calculate loss for decode head in inference.""" seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) return seg_logits def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg): """Run forward function and calculate loss for auxiliary head in training.""" losses = dict() if isinstance(self.auxiliary_head, nn.ModuleList): for idx, aux_head in enumerate(self.auxiliary_head): loss_aux = aux_head.forward_train(x, img_metas, gt_semantic_seg, self.train_cfg) losses.update(add_prefix(loss_aux, f'aux_{idx}')) else: loss_aux = self.auxiliary_head.forward_train( x, img_metas, gt_semantic_seg, self.train_cfg) losses.update(add_prefix(loss_aux, 'aux')) return losses def forward_dummy(self, img): """Dummy forward function.""" seg_logit = self.encode_decode(img, None) return seg_logit def forward_train(self, img, img_metas, gt_semantic_seg): """Forward function for training. Args: img (Tensor): Input images. img_metas (list[dict]): List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. gt_semantic_seg (Tensor): Semantic segmentation masks used if the architecture supports semantic segmentation task. Returns: dict[str, Tensor]: a dictionary of loss components """ x = self.extract_feat(img) losses = dict() loss_decode = self._decode_head_forward_train(x, img_metas, gt_semantic_seg) losses.update(loss_decode) if self.with_auxiliary_head: loss_aux = self._auxiliary_head_forward_train( x, img_metas, gt_semantic_seg) losses.update(loss_aux) return losses # TODO refactor def slide_inference(self, img, img_meta, rescale): """Inference by sliding-window with overlap. If h_crop > h_img or w_crop > w_img, the small patch will be used to decode without padding. """ h_stride, w_stride = self.test_cfg.stride h_crop, w_crop = self.test_cfg.crop_size batch_size, _, h_img, w_img = img.size() num_classes = self.num_classes h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 preds = img.new_zeros((batch_size, num_classes, h_img, w_img)) count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) for h_idx in range(h_grids): for w_idx in range(w_grids): y1 = h_idx * h_stride x1 = w_idx * w_stride y2 = min(y1 + h_crop, h_img) x2 = min(x1 + w_crop, w_img) y1 = max(y2 - h_crop, 0) x1 = max(x2 - w_crop, 0) crop_img = img[:, :, y1:y2, x1:x2] crop_seg_logit = self.encode_decode(crop_img, img_meta) preds += F.pad(crop_seg_logit, (int(x1), int(preds.shape[3] - x2), int(y1), int(preds.shape[2] - y2))) count_mat[:, :, y1:y2, x1:x2] += 1 assert (count_mat == 0).sum() == 0 if torch.onnx.is_in_onnx_export(): # cast count_mat to constant while exporting to ONNX count_mat = torch.from_numpy( count_mat.cpu().detach().numpy()).to(device=img.device) preds = preds / count_mat if rescale: preds = resize( preds, size=img_meta[0]['ori_shape'][:2], mode='bilinear', align_corners=self.align_corners, warning=False) return preds def whole_inference(self, img, img_meta, rescale): """Inference with full image.""" seg_logit = self.encode_decode(img, img_meta) if rescale: # support dynamic shape for onnx if torch.onnx.is_in_onnx_export(): size = img.shape[2:] else: size = img_meta[0]['ori_shape'][:2] seg_logit = resize( seg_logit, size=size, mode='bilinear', align_corners=self.align_corners, warning=False) return seg_logit def inference(self, img, img_meta, rescale): """Inference with slide/whole style. Args: img (Tensor): The input image of shape (N, 3, H, W). img_meta (dict): Image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see `mmseg/datasets/pipelines/formatting.py:Collect`. rescale (bool): Whether rescale back to original shape. Returns: Tensor: The output segmentation map. """ assert self.test_cfg.mode in ['slide', 'whole'] ori_shape = img_meta[0]['ori_shape'] assert all(_['ori_shape'] == ori_shape for _ in img_meta) if self.test_cfg.mode == 'slide': seg_logit = self.slide_inference(img, img_meta, rescale) else: seg_logit = self.whole_inference(img, img_meta, rescale) output = F.softmax(seg_logit, dim=1) flip = img_meta[0]['flip'] if flip: flip_direction = img_meta[0]['flip_direction'] assert flip_direction in ['horizontal', 'vertical'] if flip_direction == 'horizontal': output = output.flip(dims=(3, )) elif flip_direction == 'vertical': output = output.flip(dims=(2, )) return output def simple_test(self, img, img_meta, rescale=True): """Simple test with single image.""" seg_logit = self.inference(img, img_meta, rescale) seg_pred = seg_logit.argmax(dim=1) if torch.onnx.is_in_onnx_export(): # our inference backend only support 4D output seg_pred = seg_pred.unsqueeze(0) return seg_pred seg_pred = seg_pred.cpu().numpy() # unravel batch dim seg_pred = list(seg_pred) return seg_pred def aug_test(self, imgs, img_metas, rescale=True): """Test with augmentations. Only rescale=True is supported. """ # aug_test rescale all imgs back to ori_shape for now assert rescale # to save memory, we get augmented seg logit inplace seg_logit = self.inference(imgs[0], img_metas[0], rescale) for i in range(1, len(imgs)): cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale) seg_logit += cur_seg_logit seg_logit /= len(imgs) seg_pred = seg_logit.argmax(dim=1) seg_pred = seg_pred.cpu().numpy() # unravel batch dim seg_pred = list(seg_pred) return seg_pred ================================================ FILE: annotator/mmpkg/mmseg/models/utils/__init__.py ================================================ from .drop import DropPath from .inverted_residual import InvertedResidual, InvertedResidualV3 from .make_divisible import make_divisible from .res_layer import ResLayer from .se_layer import SELayer from .self_attention_block import SelfAttentionBlock from .up_conv_block import UpConvBlock from .weight_init import trunc_normal_ __all__ = [ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' ] ================================================ FILE: annotator/mmpkg/mmseg/models/utils/drop.py ================================================ """Modified from https://github.com/rwightman/pytorch-image- models/blob/master/timm/models/layers/drop.py.""" import torch from torch import nn class DropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). Args: drop_prob (float): Drop rate for paths of model. Dropout rate has to be between 0 and 1. Default: 0. """ def __init__(self, drop_prob=0.): super(DropPath, self).__init__() self.drop_prob = drop_prob self.keep_prob = 1 - drop_prob def forward(self, x): if self.drop_prob == 0. or not self.training: return x shape = (x.shape[0], ) + (1, ) * ( x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets random_tensor = self.keep_prob + torch.rand( shape, dtype=x.dtype, device=x.device) random_tensor.floor_() # binarize output = x.div(self.keep_prob) * random_tensor return output ================================================ FILE: annotator/mmpkg/mmseg/models/utils/inverted_residual.py ================================================ from annotator.mmpkg.mmcv.cnn import ConvModule from torch import nn from torch.utils import checkpoint as cp from .se_layer import SELayer class InvertedResidual(nn.Module): """InvertedResidual block for MobileNetV2. Args: in_channels (int): The input channels of the InvertedResidual block. out_channels (int): The output channels of the InvertedResidual block. stride (int): Stride of the middle (first) 3x3 convolution. expand_ratio (int): Adjusts number of channels of the hidden layer in InvertedResidual by this amount. dilation (int): Dilation rate of depthwise conv. Default: 1 conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU6'). with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. Returns: Tensor: The output tensor. """ def __init__(self, in_channels, out_channels, stride, expand_ratio, dilation=1, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU6'), with_cp=False): super(InvertedResidual, self).__init__() self.stride = stride assert stride in [1, 2], f'stride must in [1, 2]. ' \ f'But received {stride}.' self.with_cp = with_cp self.use_res_connect = self.stride == 1 and in_channels == out_channels hidden_dim = int(round(in_channels * expand_ratio)) layers = [] if expand_ratio != 1: layers.append( ConvModule( in_channels=in_channels, out_channels=hidden_dim, kernel_size=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) layers.extend([ ConvModule( in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, groups=hidden_dim, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg), ConvModule( in_channels=hidden_dim, out_channels=out_channels, kernel_size=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None) ]) self.conv = nn.Sequential(*layers) def forward(self, x): def _inner_forward(x): if self.use_res_connect: return x + self.conv(x) else: return self.conv(x) if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) return out class InvertedResidualV3(nn.Module): """Inverted Residual Block for MobileNetV3. Args: in_channels (int): The input channels of this Module. out_channels (int): The output channels of this Module. mid_channels (int): The input channels of the depthwise convolution. kernel_size (int): The kernel size of the depthwise convolution. Default: 3. stride (int): The stride of the depthwise convolution. Default: 1. se_cfg (dict): Config dict for se layer. Default: None, which means no se layer. with_expand_conv (bool): Use expand conv or not. If set False, mid_channels must be the same with in_channels. Default: True. conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU'). with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. Returns: Tensor: The output tensor. """ def __init__(self, in_channels, out_channels, mid_channels, kernel_size=3, stride=1, se_cfg=None, with_expand_conv=True, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), with_cp=False): super(InvertedResidualV3, self).__init__() self.with_res_shortcut = (stride == 1 and in_channels == out_channels) assert stride in [1, 2] self.with_cp = with_cp self.with_se = se_cfg is not None self.with_expand_conv = with_expand_conv if self.with_se: assert isinstance(se_cfg, dict) if not self.with_expand_conv: assert mid_channels == in_channels if self.with_expand_conv: self.expand_conv = ConvModule( in_channels=in_channels, out_channels=mid_channels, kernel_size=1, stride=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) self.depthwise_conv = ConvModule( in_channels=mid_channels, out_channels=mid_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=mid_channels, conv_cfg=dict( type='Conv2dAdaptivePadding') if stride == 2 else conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) if self.with_se: self.se = SELayer(**se_cfg) self.linear_conv = ConvModule( in_channels=mid_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None) def forward(self, x): def _inner_forward(x): out = x if self.with_expand_conv: out = self.expand_conv(out) out = self.depthwise_conv(out) if self.with_se: out = self.se(out) out = self.linear_conv(out) if self.with_res_shortcut: return x + out else: return out if self.with_cp and x.requires_grad: out = cp.checkpoint(_inner_forward, x) else: out = _inner_forward(x) return out ================================================ FILE: annotator/mmpkg/mmseg/models/utils/make_divisible.py ================================================ def make_divisible(value, divisor, min_value=None, min_ratio=0.9): """Make divisible function. This function rounds the channel number to the nearest value that can be divisible by the divisor. It is taken from the original tf repo. It ensures that all layers have a channel number that is divisible by divisor. It can be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa Args: value (int): The original channel number. divisor (int): The divisor to fully divide the channel number. min_value (int): The minimum value of the output channel. Default: None, means that the minimum value equal to the divisor. min_ratio (float): The minimum ratio of the rounded channel number to the original channel number. Default: 0.9. Returns: int: The modified output channel number. """ if min_value is None: min_value = divisor new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than (1-min_ratio). if new_value < min_ratio * value: new_value += divisor return new_value ================================================ FILE: annotator/mmpkg/mmseg/models/utils/res_layer.py ================================================ from annotator.mmpkg.mmcv.cnn import build_conv_layer, build_norm_layer from torch import nn as nn class ResLayer(nn.Sequential): """ResLayer to build ResNet style backbone. Args: block (nn.Module): block used to build ResLayer. inplanes (int): inplanes of block. planes (int): planes of block. num_blocks (int): number of blocks. stride (int): stride of the first block. Default: 1 avg_down (bool): Use AvgPool instead of stride conv when downsampling in the bottleneck. Default: False conv_cfg (dict): dictionary to construct and config conv layer. Default: None norm_cfg (dict): dictionary to construct and config norm layer. Default: dict(type='BN') multi_grid (int | None): Multi grid dilation rates of last stage. Default: None contract_dilation (bool): Whether contract first dilation of each layer Default: False """ def __init__(self, block, inplanes, planes, num_blocks, stride=1, dilation=1, avg_down=False, conv_cfg=None, norm_cfg=dict(type='BN'), multi_grid=None, contract_dilation=False, **kwargs): self.block = block downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = [] conv_stride = stride if avg_down: conv_stride = 1 downsample.append( nn.AvgPool2d( kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False)) downsample.extend([ build_conv_layer( conv_cfg, inplanes, planes * block.expansion, kernel_size=1, stride=conv_stride, bias=False), build_norm_layer(norm_cfg, planes * block.expansion)[1] ]) downsample = nn.Sequential(*downsample) layers = [] if multi_grid is None: if dilation > 1 and contract_dilation: first_dilation = dilation // 2 else: first_dilation = dilation else: first_dilation = multi_grid[0] layers.append( block( inplanes=inplanes, planes=planes, stride=stride, dilation=first_dilation, downsample=downsample, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs)) inplanes = planes * block.expansion for i in range(1, num_blocks): layers.append( block( inplanes=inplanes, planes=planes, stride=1, dilation=dilation if multi_grid is None else multi_grid[i], conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs)) super(ResLayer, self).__init__(*layers) ================================================ FILE: annotator/mmpkg/mmseg/models/utils/se_layer.py ================================================ import annotator.mmpkg.mmcv as mmcv import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule from .make_divisible import make_divisible class SELayer(nn.Module): """Squeeze-and-Excitation Module. Args: channels (int): The input (and output) channels of the SE layer. ratio (int): Squeeze ratio in SELayer, the intermediate channel will be ``int(channels/ratio)``. Default: 16. conv_cfg (None or dict): Config dict for convolution layer. Default: None, which means using conv2d. act_cfg (dict or Sequence[dict]): Config dict for activation layer. If act_cfg is a dict, two activation layers will be configured by this dict. If act_cfg is a sequence of dicts, the first activation layer will be configured by the first dict and the second activation layer will be configured by the second dict. Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, divisor=6.0)). """ def __init__(self, channels, ratio=16, conv_cfg=None, act_cfg=(dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, divisor=6.0))): super(SELayer, self).__init__() if isinstance(act_cfg, dict): act_cfg = (act_cfg, act_cfg) assert len(act_cfg) == 2 assert mmcv.is_tuple_of(act_cfg, dict) self.global_avgpool = nn.AdaptiveAvgPool2d(1) self.conv1 = ConvModule( in_channels=channels, out_channels=make_divisible(channels // ratio, 8), kernel_size=1, stride=1, conv_cfg=conv_cfg, act_cfg=act_cfg[0]) self.conv2 = ConvModule( in_channels=make_divisible(channels // ratio, 8), out_channels=channels, kernel_size=1, stride=1, conv_cfg=conv_cfg, act_cfg=act_cfg[1]) def forward(self, x): out = self.global_avgpool(x) out = self.conv1(out) out = self.conv2(out) return x * out ================================================ FILE: annotator/mmpkg/mmseg/models/utils/self_attention_block.py ================================================ import torch from annotator.mmpkg.mmcv.cnn import ConvModule, constant_init from torch import nn as nn from torch.nn import functional as F class SelfAttentionBlock(nn.Module): """General self-attention block/non-local block. Please refer to https://arxiv.org/abs/1706.03762 for details about key, query and value. Args: key_in_channels (int): Input channels of key feature. query_in_channels (int): Input channels of query feature. channels (int): Output channels of key/query transform. out_channels (int): Output channels. share_key_query (bool): Whether share projection weight between key and query projection. query_downsample (nn.Module): Query downsample module. key_downsample (nn.Module): Key downsample module. key_query_num_convs (int): Number of convs for key/query projection. value_num_convs (int): Number of convs for value projection. matmul_norm (bool): Whether normalize attention map with sqrt of channels with_out (bool): Whether use out projection. conv_cfg (dict|None): Config of conv layers. norm_cfg (dict|None): Config of norm layers. act_cfg (dict|None): Config of activation layers. """ def __init__(self, key_in_channels, query_in_channels, channels, out_channels, share_key_query, query_downsample, key_downsample, key_query_num_convs, value_out_num_convs, key_query_norm, value_out_norm, matmul_norm, with_out, conv_cfg, norm_cfg, act_cfg): super(SelfAttentionBlock, self).__init__() if share_key_query: assert key_in_channels == query_in_channels self.key_in_channels = key_in_channels self.query_in_channels = query_in_channels self.out_channels = out_channels self.channels = channels self.share_key_query = share_key_query self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.key_project = self.build_project( key_in_channels, channels, num_convs=key_query_num_convs, use_conv_module=key_query_norm, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) if share_key_query: self.query_project = self.key_project else: self.query_project = self.build_project( query_in_channels, channels, num_convs=key_query_num_convs, use_conv_module=key_query_norm, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) self.value_project = self.build_project( key_in_channels, channels if with_out else out_channels, num_convs=value_out_num_convs, use_conv_module=value_out_norm, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) if with_out: self.out_project = self.build_project( channels, out_channels, num_convs=value_out_num_convs, use_conv_module=value_out_norm, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) else: self.out_project = None self.query_downsample = query_downsample self.key_downsample = key_downsample self.matmul_norm = matmul_norm self.init_weights() def init_weights(self): """Initialize weight of later layer.""" if self.out_project is not None: if not isinstance(self.out_project, ConvModule): constant_init(self.out_project, 0) def build_project(self, in_channels, channels, num_convs, use_conv_module, conv_cfg, norm_cfg, act_cfg): """Build projection layer for key/query/value/out.""" if use_conv_module: convs = [ ConvModule( in_channels, channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) ] for _ in range(num_convs - 1): convs.append( ConvModule( channels, channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) else: convs = [nn.Conv2d(in_channels, channels, 1)] for _ in range(num_convs - 1): convs.append(nn.Conv2d(channels, channels, 1)) if len(convs) > 1: convs = nn.Sequential(*convs) else: convs = convs[0] return convs def forward(self, query_feats, key_feats): """Forward function.""" batch_size = query_feats.size(0) query = self.query_project(query_feats) if self.query_downsample is not None: query = self.query_downsample(query) query = query.reshape(*query.shape[:2], -1) query = query.permute(0, 2, 1).contiguous() key = self.key_project(key_feats) value = self.value_project(key_feats) if self.key_downsample is not None: key = self.key_downsample(key) value = self.key_downsample(value) key = key.reshape(*key.shape[:2], -1) value = value.reshape(*value.shape[:2], -1) value = value.permute(0, 2, 1).contiguous() sim_map = torch.matmul(query, key) if self.matmul_norm: sim_map = (self.channels**-.5) * sim_map sim_map = F.softmax(sim_map, dim=-1) context = torch.matmul(sim_map, value) context = context.permute(0, 2, 1).contiguous() context = context.reshape(batch_size, -1, *query_feats.shape[2:]) if self.out_project is not None: context = self.out_project(context) return context ================================================ FILE: annotator/mmpkg/mmseg/models/utils/up_conv_block.py ================================================ import torch import torch.nn as nn from annotator.mmpkg.mmcv.cnn import ConvModule, build_upsample_layer class UpConvBlock(nn.Module): """Upsample convolution block in decoder for UNet. This upsample convolution block consists of one upsample module followed by one convolution block. The upsample module expands the high-level low-resolution feature map and the convolution block fuses the upsampled high-level low-resolution feature map and the low-level high-resolution feature map from encoder. Args: conv_block (nn.Sequential): Sequential of convolutional layers. in_channels (int): Number of input channels of the high-level skip_channels (int): Number of input channels of the low-level high-resolution feature map from encoder. out_channels (int): Number of output channels. num_convs (int): Number of convolutional layers in the conv_block. Default: 2. stride (int): Stride of convolutional layer in conv_block. Default: 1. dilation (int): Dilation rate of convolutional layer in conv_block. Default: 1. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. conv_cfg (dict | None): Config dict for convolution layer. Default: None. norm_cfg (dict | None): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict | None): Config dict for activation layer in ConvModule. Default: dict(type='ReLU'). upsample_cfg (dict): The upsample config of the upsample module in decoder. Default: dict(type='InterpConv'). If the size of high-level feature map is the same as that of skip feature map (low-level feature map from encoder), it does not need upsample the high-level feature map and the upsample_cfg is None. dcn (bool): Use deformable convolution in convolutional layer or not. Default: None. plugins (dict): plugins for convolutional layers. Default: None. """ def __init__(self, conv_block, in_channels, skip_channels, out_channels, num_convs=2, stride=1, dilation=1, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), upsample_cfg=dict(type='InterpConv'), dcn=None, plugins=None): super(UpConvBlock, self).__init__() assert dcn is None, 'Not implemented yet.' assert plugins is None, 'Not implemented yet.' self.conv_block = conv_block( in_channels=2 * skip_channels, out_channels=out_channels, num_convs=num_convs, stride=stride, dilation=dilation, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, dcn=None, plugins=None) if upsample_cfg is not None: self.upsample = build_upsample_layer( cfg=upsample_cfg, in_channels=in_channels, out_channels=skip_channels, with_cp=with_cp, norm_cfg=norm_cfg, act_cfg=act_cfg) else: self.upsample = ConvModule( in_channels, skip_channels, kernel_size=1, stride=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) def forward(self, skip, x): """Forward function.""" x = self.upsample(x) out = torch.cat([skip, x], dim=1) out = self.conv_block(out) return out ================================================ FILE: annotator/mmpkg/mmseg/models/utils/weight_init.py ================================================ """Modified from https://github.com/rwightman/pytorch-image- models/blob/master/timm/models/layers/drop.py.""" import math import warnings import torch def _no_grad_trunc_normal_(tensor, mean, std, a, b): """Reference: https://people.sc.fsu.edu/~jburkardt/presentations /truncated_normal.pdf""" def norm_cdf(x): # Computes standard normal cumulative distribution function return (1. + math.erf(x / math.sqrt(2.))) / 2. if (mean < a - 2 * std) or (mean > b + 2 * std): warnings.warn( 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' 'The distribution of values may be incorrect.', stacklevel=2) with torch.no_grad(): # Values are generated by using a truncated uniform distribution and # then using the inverse CDF for the normal distribution. # Get upper and lower cdf values lower_bound = norm_cdf((a - mean) / std) upper_bound = norm_cdf((b - mean) / std) # Uniformly fill tensor with values from [l, u], then translate to # [2l-1, 2u-1]. tensor.uniform_(2 * lower_bound - 1, 2 * upper_bound - 1) # Use inverse cdf transform for normal distribution to get truncated # standard normal tensor.erfinv_() # Transform to proper mean, std tensor.mul_(std * math.sqrt(2.)) tensor.add_(mean) # Clamp to ensure it's in the proper range tensor.clamp_(min=a, max=b) return tensor def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): r"""Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for generating the random values works best when :math:`a \leq \text{mean} \leq b`. Args: tensor (``torch.Tensor``): an n-dimensional `torch.Tensor` mean (float): the mean of the normal distribution std (float): the standard deviation of the normal distribution a (float): the minimum cutoff value b (float): the maximum cutoff value """ return _no_grad_trunc_normal_(tensor, mean, std, a, b) ================================================ FILE: annotator/mmpkg/mmseg/ops/__init__.py ================================================ from .encoding import Encoding from .wrappers import Upsample, resize __all__ = ['Upsample', 'resize', 'Encoding'] ================================================ FILE: annotator/mmpkg/mmseg/ops/encoding.py ================================================ import torch from torch import nn from torch.nn import functional as F class Encoding(nn.Module): """Encoding Layer: a learnable residual encoder. Input is of shape (batch_size, channels, height, width). Output is of shape (batch_size, num_codes, channels). Args: channels: dimension of the features or feature channels num_codes: number of code words """ def __init__(self, channels, num_codes): super(Encoding, self).__init__() # init codewords and smoothing factor self.channels, self.num_codes = channels, num_codes std = 1. / ((num_codes * channels)**0.5) # [num_codes, channels] self.codewords = nn.Parameter( torch.empty(num_codes, channels, dtype=torch.float).uniform_(-std, std), requires_grad=True) # [num_codes] self.scale = nn.Parameter( torch.empty(num_codes, dtype=torch.float).uniform_(-1, 0), requires_grad=True) @staticmethod def scaled_l2(x, codewords, scale): num_codes, channels = codewords.size() batch_size = x.size(0) reshaped_scale = scale.view((1, 1, num_codes)) expanded_x = x.unsqueeze(2).expand( (batch_size, x.size(1), num_codes, channels)) reshaped_codewords = codewords.view((1, 1, num_codes, channels)) scaled_l2_norm = reshaped_scale * ( expanded_x - reshaped_codewords).pow(2).sum(dim=3) return scaled_l2_norm @staticmethod def aggregate(assignment_weights, x, codewords): num_codes, channels = codewords.size() reshaped_codewords = codewords.view((1, 1, num_codes, channels)) batch_size = x.size(0) expanded_x = x.unsqueeze(2).expand( (batch_size, x.size(1), num_codes, channels)) encoded_feat = (assignment_weights.unsqueeze(3) * (expanded_x - reshaped_codewords)).sum(dim=1) return encoded_feat def forward(self, x): assert x.dim() == 4 and x.size(1) == self.channels # [batch_size, channels, height, width] batch_size = x.size(0) # [batch_size, height x width, channels] x = x.view(batch_size, self.channels, -1).transpose(1, 2).contiguous() # assignment_weights: [batch_size, channels, num_codes] assignment_weights = F.softmax( self.scaled_l2(x, self.codewords, self.scale), dim=2) # aggregate encoded_feat = self.aggregate(assignment_weights, x, self.codewords) return encoded_feat def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(Nx{self.channels}xHxW =>Nx{self.num_codes}' \ f'x{self.channels})' return repr_str ================================================ FILE: annotator/mmpkg/mmseg/ops/wrappers.py ================================================ import warnings import torch.nn as nn import torch.nn.functional as F def resize(input, size=None, scale_factor=None, mode='nearest', align_corners=None, warning=True): if warning: if size is not None and align_corners: input_h, input_w = tuple(int(x) for x in input.shape[2:]) output_h, output_w = tuple(int(x) for x in size) if output_h > input_h or output_w > output_h: if ((output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1) and (output_h - 1) % (input_h - 1) and (output_w - 1) % (input_w - 1)): warnings.warn( f'When align_corners={align_corners}, ' 'the output would more aligned if ' f'input size {(input_h, input_w)} is `x+1` and ' f'out size {(output_h, output_w)} is `nx+1`') return F.interpolate(input, size, scale_factor, mode, align_corners) class Upsample(nn.Module): def __init__(self, size=None, scale_factor=None, mode='nearest', align_corners=None): super(Upsample, self).__init__() self.size = size if isinstance(scale_factor, tuple): self.scale_factor = tuple(float(factor) for factor in scale_factor) else: self.scale_factor = float(scale_factor) if scale_factor else None self.mode = mode self.align_corners = align_corners def forward(self, x): if not self.size: size = [int(t * self.scale_factor) for t in x.shape[-2:]] else: size = self.size return resize(x, size, None, self.mode, self.align_corners) ================================================ FILE: annotator/mmpkg/mmseg/utils/__init__.py ================================================ from .collect_env import collect_env from .logger import get_root_logger __all__ = ['get_root_logger', 'collect_env'] ================================================ FILE: annotator/mmpkg/mmseg/utils/collect_env.py ================================================ from annotator.mmpkg.mmcv.utils import collect_env as collect_base_env from annotator.mmpkg.mmcv.utils import get_git_hash import annotator.mmpkg.mmseg as mmseg def collect_env(): """Collect the information of the running environments.""" env_info = collect_base_env() env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}' return env_info if __name__ == '__main__': for name, val in collect_env().items(): print('{}: {}'.format(name, val)) ================================================ FILE: annotator/mmpkg/mmseg/utils/logger.py ================================================ import logging from annotator.mmpkg.mmcv.utils import get_logger def get_root_logger(log_file=None, log_level=logging.INFO): """Get the root logger. The logger will be initialized if it has not been initialized. By default a StreamHandler will be added. If `log_file` is specified, a FileHandler will also be added. The name of the root logger is the top-level package name, e.g., "mmseg". Args: log_file (str | None): The log filename. If specified, a FileHandler will be added to the root logger. log_level (int): The root logger level. Note that only the process of rank 0 is affected, while other processes will set the level to "Error" and be silent most of the time. Returns: logging.Logger: The root logger. """ logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level) return logger ================================================ FILE: annotator/mobile_sam/__init__.py ================================================ from __future__ import print_function import os import numpy as np from PIL import Image from typing import Union from modules import devices from annotator.util import load_model from annotator.annotator_path import models_path from controlnet_aux import SamDetector from controlnet_aux.segment_anything import sam_model_registry, SamAutomaticMaskGenerator class SamDetector_Aux(SamDetector): model_dir = os.path.join(models_path, "mobile_sam") def __init__(self, mask_generator: SamAutomaticMaskGenerator, sam): super().__init__(mask_generator) self.device = devices.device self.model = sam.to(self.device).eval() @classmethod def from_pretrained(cls): """ Possible model_type : vit_h, vit_l, vit_b, vit_t download weights from https://huggingface.co/dhkim2810/MobileSAM """ remote_url = os.environ.get( "CONTROLNET_MOBILE_SAM_MODEL_URL", "https://huggingface.co/dhkim2810/MobileSAM/resolve/main/mobile_sam.pt", ) model_path = load_model( "mobile_sam.pt", remote_url=remote_url, model_dir=cls.model_dir ) sam = sam_model_registry["vit_t"](checkpoint=model_path) cls.model = sam.to(devices.device).eval() mask_generator = SamAutomaticMaskGenerator(cls.model) return cls(mask_generator, sam) def __call__(self, input_image: Union[np.ndarray, Image.Image]=None, detect_resolution=512, image_resolution=512, output_type="cv2", **kwargs) -> np.ndarray: self.model.to(self.device) image = super().__call__(input_image=input_image, detect_resolution=detect_resolution, image_resolution=image_resolution, output_type=output_type, **kwargs) return np.array(image).astype(np.uint8) ================================================ FILE: annotator/normalbae/LICENSE ================================================ MIT License Copyright (c) 2022 Caroline Chan Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/normalbae/__init__.py ================================================ import os import types import torch import numpy as np from einops import rearrange from .models.NNET import NNET from modules import devices from annotator.annotator_path import models_path import torchvision.transforms as transforms # load model def load_checkpoint(fpath, model): ckpt = torch.load(fpath, map_location='cpu')['model'] load_dict = {} for k, v in ckpt.items(): if k.startswith('module.'): k_ = k.replace('module.', '') load_dict[k_] = v else: load_dict[k] = v model.load_state_dict(load_dict) return model class NormalBaeDetector: model_dir = os.path.join(models_path, "normal_bae") def __init__(self): self.model = None self.device = devices.get_device_for("controlnet") def load_model(self): remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/scannet.pt" modelpath = os.path.join(self.model_dir, "scannet.pt") if not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) args = types.SimpleNamespace() args.mode = 'client' args.architecture = 'BN' args.pretrained = 'scannet' args.sampling_ratio = 0.4 args.importance_ratio = 0.7 model = NNET(args) model = load_checkpoint(modelpath, model) model.eval() self.model = model.to(self.device) self.norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, input_image): if self.model is None: self.load_model() self.model.to(self.device) assert input_image.ndim == 3 image_normal = input_image with torch.no_grad(): image_normal = torch.from_numpy(image_normal).float().to(self.device) image_normal = image_normal / 255.0 image_normal = rearrange(image_normal, 'h w c -> 1 c h w') image_normal = self.norm(image_normal) normal = self.model(image_normal) normal = normal[0][-1][:, :3] # d = torch.sum(normal ** 2.0, dim=1, keepdim=True) ** 0.5 # d = torch.maximum(d, torch.ones_like(d) * 1e-5) # normal /= d normal = ((normal + 1) * 0.5).clip(0, 1) normal = rearrange(normal[0], 'c h w -> h w c').cpu().numpy() normal_image = (normal * 255.0).clip(0, 255).astype(np.uint8) return normal_image ================================================ FILE: annotator/normalbae/models/NNET.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from .submodules.encoder import Encoder from .submodules.decoder import Decoder class NNET(nn.Module): def __init__(self, args): super(NNET, self).__init__() self.encoder = Encoder() self.decoder = Decoder(args) def get_1x_lr_params(self): # lr/10 learning rate return self.encoder.parameters() def get_10x_lr_params(self): # lr learning rate return self.decoder.parameters() def forward(self, img, **kwargs): return self.decoder(self.encoder(img), **kwargs) ================================================ FILE: annotator/normalbae/models/baseline.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from .submodules.submodules import UpSampleBN, norm_normalize # This is the baseline encoder-decoder we used in the ablation study class NNET(nn.Module): def __init__(self, args=None): super(NNET, self).__init__() self.encoder = Encoder() self.decoder = Decoder(num_classes=4) def forward(self, x, **kwargs): out = self.decoder(self.encoder(x), **kwargs) # Bilinearly upsample the output to match the input resolution up_out = F.interpolate(out, size=[x.size(2), x.size(3)], mode='bilinear', align_corners=False) # L2-normalize the first three channels / ensure positive value for concentration parameters (kappa) up_out = norm_normalize(up_out) return up_out def get_1x_lr_params(self): # lr/10 learning rate return self.encoder.parameters() def get_10x_lr_params(self): # lr learning rate modules = [self.decoder] for m in modules: yield from m.parameters() # Encoder class Encoder(nn.Module): def __init__(self): super(Encoder, self).__init__() basemodel_name = 'tf_efficientnet_b5_ap' basemodel = torch.hub.load('rwightman/gen-efficientnet-pytorch', basemodel_name, pretrained=True) # Remove last layer basemodel.global_pool = nn.Identity() basemodel.classifier = nn.Identity() self.original_model = basemodel def forward(self, x): features = [x] for k, v in self.original_model._modules.items(): if (k == 'blocks'): for ki, vi in v._modules.items(): features.append(vi(features[-1])) else: features.append(v(features[-1])) return features # Decoder (no pixel-wise MLP, no uncertainty-guided sampling) class Decoder(nn.Module): def __init__(self, num_classes=4): super(Decoder, self).__init__() self.conv2 = nn.Conv2d(2048, 2048, kernel_size=1, stride=1, padding=0) self.up1 = UpSampleBN(skip_input=2048 + 176, output_features=1024) self.up2 = UpSampleBN(skip_input=1024 + 64, output_features=512) self.up3 = UpSampleBN(skip_input=512 + 40, output_features=256) self.up4 = UpSampleBN(skip_input=256 + 24, output_features=128) self.conv3 = nn.Conv2d(128, num_classes, kernel_size=3, stride=1, padding=1) def forward(self, features): x_block0, x_block1, x_block2, x_block3, x_block4 = features[4], features[5], features[6], features[8], features[11] x_d0 = self.conv2(x_block4) x_d1 = self.up1(x_d0, x_block3) x_d2 = self.up2(x_d1, x_block2) x_d3 = self.up3(x_d2, x_block1) x_d4 = self.up4(x_d3, x_block0) out = self.conv3(x_d4) return out if __name__ == '__main__': model = Baseline() x = torch.rand(2, 3, 480, 640) out = model(x) print(out.shape) ================================================ FILE: annotator/normalbae/models/submodules/decoder.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F from .submodules import UpSampleBN, UpSampleGN, norm_normalize, sample_points class Decoder(nn.Module): def __init__(self, args): super(Decoder, self).__init__() # hyper-parameter for sampling self.sampling_ratio = args.sampling_ratio self.importance_ratio = args.importance_ratio # feature-map self.conv2 = nn.Conv2d(2048, 2048, kernel_size=1, stride=1, padding=0) if args.architecture == 'BN': self.up1 = UpSampleBN(skip_input=2048 + 176, output_features=1024) self.up2 = UpSampleBN(skip_input=1024 + 64, output_features=512) self.up3 = UpSampleBN(skip_input=512 + 40, output_features=256) self.up4 = UpSampleBN(skip_input=256 + 24, output_features=128) elif args.architecture == 'GN': self.up1 = UpSampleGN(skip_input=2048 + 176, output_features=1024) self.up2 = UpSampleGN(skip_input=1024 + 64, output_features=512) self.up3 = UpSampleGN(skip_input=512 + 40, output_features=256) self.up4 = UpSampleGN(skip_input=256 + 24, output_features=128) else: raise Exception('invalid architecture') # produces 1/8 res output self.out_conv_res8 = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1) # produces 1/4 res output self.out_conv_res4 = nn.Sequential( nn.Conv1d(512 + 4, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 4, kernel_size=1), ) # produces 1/2 res output self.out_conv_res2 = nn.Sequential( nn.Conv1d(256 + 4, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 4, kernel_size=1), ) # produces 1/1 res output self.out_conv_res1 = nn.Sequential( nn.Conv1d(128 + 4, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 128, kernel_size=1), nn.ReLU(), nn.Conv1d(128, 4, kernel_size=1), ) def forward(self, features, gt_norm_mask=None, mode='test'): x_block0, x_block1, x_block2, x_block3, x_block4 = features[4], features[5], features[6], features[8], features[11] # generate feature-map x_d0 = self.conv2(x_block4) # x_d0 : [2, 2048, 15, 20] 1/32 res x_d1 = self.up1(x_d0, x_block3) # x_d1 : [2, 1024, 30, 40] 1/16 res x_d2 = self.up2(x_d1, x_block2) # x_d2 : [2, 512, 60, 80] 1/8 res x_d3 = self.up3(x_d2, x_block1) # x_d3: [2, 256, 120, 160] 1/4 res x_d4 = self.up4(x_d3, x_block0) # x_d4: [2, 128, 240, 320] 1/2 res # 1/8 res output out_res8 = self.out_conv_res8(x_d2) # out_res8: [2, 4, 60, 80] 1/8 res output out_res8 = norm_normalize(out_res8) # out_res8: [2, 4, 60, 80] 1/8 res output ################################################################################################################ # out_res4 ################################################################################################################ if mode == 'train': # upsampling ... out_res8: [2, 4, 60, 80] -> out_res8_res4: [2, 4, 120, 160] out_res8_res4 = F.interpolate(out_res8, scale_factor=2, mode='bilinear', align_corners=True) B, _, H, W = out_res8_res4.shape # samples: [B, 1, N, 2] point_coords_res4, rows_int, cols_int = sample_points(out_res8_res4.detach(), gt_norm_mask, sampling_ratio=self.sampling_ratio, beta=self.importance_ratio) # output (needed for evaluation / visualization) out_res4 = out_res8_res4 # grid_sample feature-map feat_res4 = F.grid_sample(x_d2, point_coords_res4, mode='bilinear', align_corners=True) # (B, 512, 1, N) init_pred = F.grid_sample(out_res8, point_coords_res4, mode='bilinear', align_corners=True) # (B, 4, 1, N) feat_res4 = torch.cat([feat_res4, init_pred], dim=1) # (B, 512+4, 1, N) # prediction (needed to compute loss) samples_pred_res4 = self.out_conv_res4(feat_res4[:, :, 0, :]) # (B, 4, N) samples_pred_res4 = norm_normalize(samples_pred_res4) # (B, 4, N) - normalized for i in range(B): out_res4[i, :, rows_int[i, :], cols_int[i, :]] = samples_pred_res4[i, :, :] else: # grid_sample feature-map feat_map = F.interpolate(x_d2, scale_factor=2, mode='bilinear', align_corners=True) init_pred = F.interpolate(out_res8, scale_factor=2, mode='bilinear', align_corners=True) feat_map = torch.cat([feat_map, init_pred], dim=1) # (B, 512+4, H, W) B, _, H, W = feat_map.shape # try all pixels out_res4 = self.out_conv_res4(feat_map.view(B, 512 + 4, -1)) # (B, 4, N) out_res4 = norm_normalize(out_res4) # (B, 4, N) - normalized out_res4 = out_res4.view(B, 4, H, W) samples_pred_res4 = point_coords_res4 = None ################################################################################################################ # out_res2 ################################################################################################################ if mode == 'train': # upsampling ... out_res4: [2, 4, 120, 160] -> out_res4_res2: [2, 4, 240, 320] out_res4_res2 = F.interpolate(out_res4, scale_factor=2, mode='bilinear', align_corners=True) B, _, H, W = out_res4_res2.shape # samples: [B, 1, N, 2] point_coords_res2, rows_int, cols_int = sample_points(out_res4_res2.detach(), gt_norm_mask, sampling_ratio=self.sampling_ratio, beta=self.importance_ratio) # output (needed for evaluation / visualization) out_res2 = out_res4_res2 # grid_sample feature-map feat_res2 = F.grid_sample(x_d3, point_coords_res2, mode='bilinear', align_corners=True) # (B, 256, 1, N) init_pred = F.grid_sample(out_res4, point_coords_res2, mode='bilinear', align_corners=True) # (B, 4, 1, N) feat_res2 = torch.cat([feat_res2, init_pred], dim=1) # (B, 256+4, 1, N) # prediction (needed to compute loss) samples_pred_res2 = self.out_conv_res2(feat_res2[:, :, 0, :]) # (B, 4, N) samples_pred_res2 = norm_normalize(samples_pred_res2) # (B, 4, N) - normalized for i in range(B): out_res2[i, :, rows_int[i, :], cols_int[i, :]] = samples_pred_res2[i, :, :] else: # grid_sample feature-map feat_map = F.interpolate(x_d3, scale_factor=2, mode='bilinear', align_corners=True) init_pred = F.interpolate(out_res4, scale_factor=2, mode='bilinear', align_corners=True) feat_map = torch.cat([feat_map, init_pred], dim=1) # (B, 512+4, H, W) B, _, H, W = feat_map.shape out_res2 = self.out_conv_res2(feat_map.view(B, 256 + 4, -1)) # (B, 4, N) out_res2 = norm_normalize(out_res2) # (B, 4, N) - normalized out_res2 = out_res2.view(B, 4, H, W) samples_pred_res2 = point_coords_res2 = None ################################################################################################################ # out_res1 ################################################################################################################ if mode == 'train': # upsampling ... out_res4: [2, 4, 120, 160] -> out_res4_res2: [2, 4, 240, 320] out_res2_res1 = F.interpolate(out_res2, scale_factor=2, mode='bilinear', align_corners=True) B, _, H, W = out_res2_res1.shape # samples: [B, 1, N, 2] point_coords_res1, rows_int, cols_int = sample_points(out_res2_res1.detach(), gt_norm_mask, sampling_ratio=self.sampling_ratio, beta=self.importance_ratio) # output (needed for evaluation / visualization) out_res1 = out_res2_res1 # grid_sample feature-map feat_res1 = F.grid_sample(x_d4, point_coords_res1, mode='bilinear', align_corners=True) # (B, 128, 1, N) init_pred = F.grid_sample(out_res2, point_coords_res1, mode='bilinear', align_corners=True) # (B, 4, 1, N) feat_res1 = torch.cat([feat_res1, init_pred], dim=1) # (B, 128+4, 1, N) # prediction (needed to compute loss) samples_pred_res1 = self.out_conv_res1(feat_res1[:, :, 0, :]) # (B, 4, N) samples_pred_res1 = norm_normalize(samples_pred_res1) # (B, 4, N) - normalized for i in range(B): out_res1[i, :, rows_int[i, :], cols_int[i, :]] = samples_pred_res1[i, :, :] else: # grid_sample feature-map feat_map = F.interpolate(x_d4, scale_factor=2, mode='bilinear', align_corners=True) init_pred = F.interpolate(out_res2, scale_factor=2, mode='bilinear', align_corners=True) feat_map = torch.cat([feat_map, init_pred], dim=1) # (B, 512+4, H, W) B, _, H, W = feat_map.shape out_res1 = self.out_conv_res1(feat_map.view(B, 128 + 4, -1)) # (B, 4, N) out_res1 = norm_normalize(out_res1) # (B, 4, N) - normalized out_res1 = out_res1.view(B, 4, H, W) samples_pred_res1 = point_coords_res1 = None return [out_res8, out_res4, out_res2, out_res1], \ [out_res8, samples_pred_res4, samples_pred_res2, samples_pred_res1], \ [None, point_coords_res4, point_coords_res2, point_coords_res1] ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # pytorch stuff *.pth *.onnx *.pb trained_models/ .fuse_hidden* ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/BENCHMARK.md ================================================ # Model Performance Benchmarks All benchmarks run as per: ``` python onnx_export.py --model mobilenetv3_100 ./mobilenetv3_100.onnx python onnx_optimize.py ./mobilenetv3_100.onnx --output mobilenetv3_100-opt.onnx python onnx_to_caffe.py ./mobilenetv3_100.onnx --c2-prefix mobilenetv3 python onnx_to_caffe.py ./mobilenetv3_100-opt.onnx --c2-prefix mobilenetv3-opt python caffe2_benchmark.py --c2-init ./mobilenetv3.init.pb --c2-predict ./mobilenetv3.predict.pb python caffe2_benchmark.py --c2-init ./mobilenetv3-opt.init.pb --c2-predict ./mobilenetv3-opt.predict.pb ``` ## EfficientNet-B0 ### Unoptimized ``` Main run finished. Milliseconds per iter: 49.2862. Iters per second: 20.2897 Time per operator type: 29.7378 ms. 60.5145%. Conv 12.1785 ms. 24.7824%. Sigmoid 3.62811 ms. 7.38297%. SpatialBN 2.98444 ms. 6.07314%. Mul 0.326902 ms. 0.665225%. AveragePool 0.197317 ms. 0.401528%. FC 0.0852877 ms. 0.173555%. Add 0.0032607 ms. 0.00663532%. Squeeze 49.1416 ms in Total FLOP per operator type: 0.76907 GFLOP. 95.2696%. Conv 0.0269508 GFLOP. 3.33857%. SpatialBN 0.00846444 GFLOP. 1.04855%. Mul 0.002561 GFLOP. 0.317248%. FC 0.000210112 GFLOP. 0.0260279%. Add 0.807256 GFLOP in Total Feature Memory Read per operator type: 58.5253 MB. 43.0891%. Mul 43.2015 MB. 31.807%. Conv 27.2869 MB. 20.0899%. SpatialBN 5.12912 MB. 3.77631%. FC 1.6809 MB. 1.23756%. Add 135.824 MB in Total Feature Memory Written per operator type: 33.8578 MB. 38.1965%. Mul 26.9881 MB. 30.4465%. Conv 26.9508 MB. 30.4044%. SpatialBN 0.840448 MB. 0.948147%. Add 0.004 MB. 0.00451258%. FC 88.6412 MB in Total Parameter Memory per operator type: 15.8248 MB. 74.9391%. Conv 5.124 MB. 24.265%. FC 0.168064 MB. 0.795877%. SpatialBN 0 MB. 0%. Add 0 MB. 0%. Mul 21.1168 MB in Total ``` ### Optimized ``` Main run finished. Milliseconds per iter: 46.0838. Iters per second: 21.6996 Time per operator type: 29.776 ms. 65.002%. Conv 12.2803 ms. 26.8084%. Sigmoid 3.15073 ms. 6.87815%. Mul 0.328651 ms. 0.717456%. AveragePool 0.186237 ms. 0.406563%. FC 0.0832429 ms. 0.181722%. Add 0.0026184 ms. 0.00571606%. Squeeze 45.8078 ms in Total FLOP per operator type: 0.76907 GFLOP. 98.5601%. Conv 0.00846444 GFLOP. 1.08476%. Mul 0.002561 GFLOP. 0.328205%. FC 0.000210112 GFLOP. 0.0269269%. Add 0.780305 GFLOP in Total Feature Memory Read per operator type: 58.5253 MB. 53.8803%. Mul 43.2855 MB. 39.8501%. Conv 5.12912 MB. 4.72204%. FC 1.6809 MB. 1.54749%. Add 108.621 MB in Total Feature Memory Written per operator type: 33.8578 MB. 54.8834%. Mul 26.9881 MB. 43.7477%. Conv 0.840448 MB. 1.36237%. Add 0.004 MB. 0.00648399%. FC 61.6904 MB in Total Parameter Memory per operator type: 15.8248 MB. 75.5403%. Conv 5.124 MB. 24.4597%. FC 0 MB. 0%. Add 0 MB. 0%. Mul 20.9488 MB in Total ``` ## EfficientNet-B1 ### Optimized ``` Main run finished. Milliseconds per iter: 71.8102. Iters per second: 13.9256 Time per operator type: 45.7915 ms. 66.3206%. Conv 17.8718 ms. 25.8841%. Sigmoid 4.44132 ms. 6.43244%. Mul 0.51001 ms. 0.738658%. AveragePool 0.233283 ms. 0.337868%. Add 0.194986 ms. 0.282402%. FC 0.00268255 ms. 0.00388519%. Squeeze 69.0456 ms in Total FLOP per operator type: 1.37105 GFLOP. 98.7673%. Conv 0.0138759 GFLOP. 0.99959%. Mul 0.002561 GFLOP. 0.184489%. FC 0.000674432 GFLOP. 0.0485847%. Add 1.38816 GFLOP in Total Feature Memory Read per operator type: 94.624 MB. 54.0789%. Mul 69.8255 MB. 39.9062%. Conv 5.39546 MB. 3.08357%. Add 5.12912 MB. 2.93136%. FC 174.974 MB in Total Feature Memory Written per operator type: 55.5035 MB. 54.555%. Mul 43.5333 MB. 42.7894%. Conv 2.69773 MB. 2.65163%. Add 0.004 MB. 0.00393165%. FC 101.739 MB in Total Parameter Memory per operator type: 25.7479 MB. 83.4024%. Conv 5.124 MB. 16.5976%. FC 0 MB. 0%. Add 0 MB. 0%. Mul 30.8719 MB in Total ``` ## EfficientNet-B2 ### Optimized ``` Main run finished. Milliseconds per iter: 92.28. Iters per second: 10.8366 Time per operator type: 61.4627 ms. 67.5845%. Conv 22.7458 ms. 25.0113%. Sigmoid 5.59931 ms. 6.15701%. Mul 0.642567 ms. 0.706568%. AveragePool 0.272795 ms. 0.299965%. Add 0.216178 ms. 0.237709%. FC 0.00268895 ms. 0.00295677%. Squeeze 90.942 ms in Total FLOP per operator type: 1.98431 GFLOP. 98.9343%. Conv 0.0177039 GFLOP. 0.882686%. Mul 0.002817 GFLOP. 0.140451%. FC 0.000853984 GFLOP. 0.0425782%. Add 2.00568 GFLOP in Total Feature Memory Read per operator type: 120.609 MB. 54.9637%. Mul 86.3512 MB. 39.3519%. Conv 6.83187 MB. 3.11341%. Add 5.64163 MB. 2.571%. FC 219.433 MB in Total Feature Memory Written per operator type: 70.8155 MB. 54.6573%. Mul 55.3273 MB. 42.7031%. Conv 3.41594 MB. 2.63651%. Add 0.004 MB. 0.00308731%. FC 129.563 MB in Total Parameter Memory per operator type: 30.4721 MB. 84.3913%. Conv 5.636 MB. 15.6087%. FC 0 MB. 0%. Add 0 MB. 0%. Mul 36.1081 MB in Total ``` ## MixNet-M ### Optimized ``` Main run finished. Milliseconds per iter: 63.1122. Iters per second: 15.8448 Time per operator type: 48.1139 ms. 75.2052%. Conv 7.1341 ms. 11.1511%. Sigmoid 2.63706 ms. 4.12189%. SpatialBN 1.73186 ms. 2.70701%. Mul 1.38707 ms. 2.16809%. Split 1.29322 ms. 2.02139%. Concat 1.00093 ms. 1.56452%. Relu 0.235309 ms. 0.367803%. Add 0.221579 ms. 0.346343%. FC 0.219315 ms. 0.342803%. AveragePool 0.00250145 ms. 0.00390993%. Squeeze 63.9768 ms in Total FLOP per operator type: 0.675273 GFLOP. 95.5827%. Conv 0.0221072 GFLOP. 3.12921%. SpatialBN 0.00538445 GFLOP. 0.762152%. Mul 0.003073 GFLOP. 0.434973%. FC 0.000642488 GFLOP. 0.0909421%. Add 0 GFLOP. 0%. Concat 0 GFLOP. 0%. Relu 0.70648 GFLOP in Total Feature Memory Read per operator type: 46.8424 MB. 30.502%. Conv 36.8626 MB. 24.0036%. Mul 22.3152 MB. 14.5309%. SpatialBN 22.1074 MB. 14.3955%. Concat 14.1496 MB. 9.21372%. Relu 6.15414 MB. 4.00735%. FC 5.1399 MB. 3.34692%. Add 153.571 MB in Total Feature Memory Written per operator type: 32.7672 MB. 28.4331%. Conv 22.1072 MB. 19.1831%. Concat 22.1072 MB. 19.1831%. SpatialBN 21.5378 MB. 18.689%. Mul 14.1496 MB. 12.2781%. Relu 2.56995 MB. 2.23003%. Add 0.004 MB. 0.00347092%. FC 115.243 MB in Total Parameter Memory per operator type: 13.7059 MB. 68.674%. Conv 6.148 MB. 30.8049%. FC 0.104 MB. 0.521097%. SpatialBN 0 MB. 0%. Add 0 MB. 0%. Concat 0 MB. 0%. Mul 0 MB. 0%. Relu 19.9579 MB in Total ``` ## TF MobileNet-V3 Large 1.0 ### Optimized ``` Main run finished. Milliseconds per iter: 22.0495. Iters per second: 45.3525 Time per operator type: 17.437 ms. 80.0087%. Conv 1.27662 ms. 5.8577%. Add 1.12759 ms. 5.17387%. Div 0.701155 ms. 3.21721%. Mul 0.562654 ms. 2.58171%. Relu 0.431144 ms. 1.97828%. Clip 0.156902 ms. 0.719936%. FC 0.0996858 ms. 0.457402%. AveragePool 0.00112455 ms. 0.00515993%. Flatten 21.7939 ms in Total FLOP per operator type: 0.43062 GFLOP. 98.1484%. Conv 0.002561 GFLOP. 0.583713%. FC 0.00210867 GFLOP. 0.480616%. Mul 0.00193868 GFLOP. 0.441871%. Add 0.00151532 GFLOP. 0.345377%. Div 0 GFLOP. 0%. Relu 0.438743 GFLOP in Total Feature Memory Read per operator type: 34.7967 MB. 43.9391%. Conv 14.496 MB. 18.3046%. Mul 9.44828 MB. 11.9307%. Add 9.26157 MB. 11.6949%. Relu 6.0614 MB. 7.65395%. Div 5.12912 MB. 6.47673%. FC 79.193 MB in Total Feature Memory Written per operator type: 17.6247 MB. 35.8656%. Conv 9.26157 MB. 18.847%. Relu 8.43469 MB. 17.1643%. Mul 7.75472 MB. 15.7806%. Add 6.06128 MB. 12.3345%. Div 0.004 MB. 0.00813985%. FC 49.1409 MB in Total Parameter Memory per operator type: 16.6851 MB. 76.5052%. Conv 5.124 MB. 23.4948%. FC 0 MB. 0%. Add 0 MB. 0%. Div 0 MB. 0%. Mul 0 MB. 0%. Relu 21.8091 MB in Total ``` ## MobileNet-V3 (RW) ### Unoptimized ``` Main run finished. Milliseconds per iter: 24.8316. Iters per second: 40.2712 Time per operator type: 15.9266 ms. 69.2624%. Conv 2.36551 ms. 10.2873%. SpatialBN 1.39102 ms. 6.04936%. Add 1.30327 ms. 5.66773%. Div 0.737014 ms. 3.20517%. Mul 0.639697 ms. 2.78195%. Relu 0.375681 ms. 1.63378%. Clip 0.153126 ms. 0.665921%. FC 0.0993787 ms. 0.432184%. AveragePool 0.0032632 ms. 0.0141912%. Squeeze 22.9946 ms in Total FLOP per operator type: 0.430616 GFLOP. 94.4041%. Conv 0.0175992 GFLOP. 3.85829%. SpatialBN 0.002561 GFLOP. 0.561449%. FC 0.00210961 GFLOP. 0.46249%. Mul 0.00173891 GFLOP. 0.381223%. Add 0.00151626 GFLOP. 0.33241%. Div 0 GFLOP. 0%. Relu 0.456141 GFLOP in Total Feature Memory Read per operator type: 34.7354 MB. 36.4363%. Conv 17.7944 MB. 18.6658%. SpatialBN 14.5035 MB. 15.2137%. Mul 9.25778 MB. 9.71113%. Relu 7.84641 MB. 8.23064%. Add 6.06516 MB. 6.36216%. Div 5.12912 MB. 5.38029%. FC 95.3317 MB in Total Feature Memory Written per operator type: 17.6246 MB. 26.7264%. Conv 17.5992 MB. 26.6878%. SpatialBN 9.25778 MB. 14.0387%. Relu 8.43843 MB. 12.7962%. Mul 6.95565 MB. 10.5477%. Add 6.06502 MB. 9.19713%. Div 0.004 MB. 0.00606568%. FC 65.9447 MB in Total Parameter Memory per operator type: 16.6778 MB. 76.1564%. Conv 5.124 MB. 23.3979%. FC 0.0976 MB. 0.445674%. SpatialBN 0 MB. 0%. Add 0 MB. 0%. Div 0 MB. 0%. Mul 0 MB. 0%. Relu 21.8994 MB in Total ``` ### Optimized ``` Main run finished. Milliseconds per iter: 22.0981. Iters per second: 45.2527 Time per operator type: 17.146 ms. 78.8965%. Conv 1.38453 ms. 6.37084%. Add 1.30991 ms. 6.02749%. Div 0.685417 ms. 3.15391%. Mul 0.532589 ms. 2.45068%. Relu 0.418263 ms. 1.92461%. Clip 0.15128 ms. 0.696106%. FC 0.102065 ms. 0.469648%. AveragePool 0.0022143 ms. 0.010189%. Squeeze 21.7323 ms in Total FLOP per operator type: 0.430616 GFLOP. 98.1927%. Conv 0.002561 GFLOP. 0.583981%. FC 0.00210961 GFLOP. 0.481051%. Mul 0.00173891 GFLOP. 0.396522%. Add 0.00151626 GFLOP. 0.34575%. Div 0 GFLOP. 0%. Relu 0.438542 GFLOP in Total Feature Memory Read per operator type: 34.7842 MB. 44.833%. Conv 14.5035 MB. 18.6934%. Mul 9.25778 MB. 11.9323%. Relu 7.84641 MB. 10.1132%. Add 6.06516 MB. 7.81733%. Div 5.12912 MB. 6.61087%. FC 77.5861 MB in Total Feature Memory Written per operator type: 17.6246 MB. 36.4556%. Conv 9.25778 MB. 19.1492%. Relu 8.43843 MB. 17.4544%. Mul 6.95565 MB. 14.3874%. Add 6.06502 MB. 12.5452%. Div 0.004 MB. 0.00827378%. FC 48.3455 MB in Total Parameter Memory per operator type: 16.6778 MB. 76.4973%. Conv 5.124 MB. 23.5027%. FC 0 MB. 0%. Add 0 MB. 0%. Div 0 MB. 0%. Mul 0 MB. 0%. Relu 21.8018 MB in Total ``` ## MnasNet-A1 ### Unoptimized ``` Main run finished. Milliseconds per iter: 30.0892. Iters per second: 33.2345 Time per operator type: 24.4656 ms. 79.0905%. Conv 4.14958 ms. 13.4144%. SpatialBN 1.60598 ms. 5.19169%. Relu 0.295219 ms. 0.95436%. Mul 0.187609 ms. 0.606486%. FC 0.120556 ms. 0.389724%. AveragePool 0.09036 ms. 0.292109%. Add 0.015727 ms. 0.050841%. Sigmoid 0.00306205 ms. 0.00989875%. Squeeze 30.9337 ms in Total FLOP per operator type: 0.620598 GFLOP. 95.6434%. Conv 0.0248873 GFLOP. 3.8355%. SpatialBN 0.002561 GFLOP. 0.394688%. FC 0.000597408 GFLOP. 0.0920695%. Mul 0.000222656 GFLOP. 0.0343146%. Add 0 GFLOP. 0%. Relu 0.648867 GFLOP in Total Feature Memory Read per operator type: 35.5457 MB. 38.4109%. Conv 25.1552 MB. 27.1829%. SpatialBN 22.5235 MB. 24.339%. Relu 5.12912 MB. 5.54256%. FC 2.40586 MB. 2.59978%. Mul 1.78125 MB. 1.92483%. Add 92.5406 MB in Total Feature Memory Written per operator type: 24.9042 MB. 32.9424%. Conv 24.8873 MB. 32.92%. SpatialBN 22.5235 MB. 29.7932%. Relu 2.38963 MB. 3.16092%. Mul 0.890624 MB. 1.17809%. Add 0.004 MB. 0.00529106%. FC 75.5993 MB in Total Parameter Memory per operator type: 10.2732 MB. 66.1459%. Conv 5.124 MB. 32.9917%. FC 0.133952 MB. 0.86247%. SpatialBN 0 MB. 0%. Add 0 MB. 0%. Mul 0 MB. 0%. Relu 15.5312 MB in Total ``` ### Optimized ``` Main run finished. Milliseconds per iter: 24.2367. Iters per second: 41.2597 Time per operator type: 22.0547 ms. 91.1375%. Conv 1.49096 ms. 6.16116%. Relu 0.253417 ms. 1.0472%. Mul 0.18506 ms. 0.76473%. FC 0.112942 ms. 0.466717%. AveragePool 0.086769 ms. 0.358559%. Add 0.0127889 ms. 0.0528479%. Sigmoid 0.0027346 ms. 0.0113003%. Squeeze 24.1994 ms in Total FLOP per operator type: 0.620598 GFLOP. 99.4581%. Conv 0.002561 GFLOP. 0.41043%. FC 0.000597408 GFLOP. 0.0957417%. Mul 0.000222656 GFLOP. 0.0356832%. Add 0 GFLOP. 0%. Relu 0.623979 GFLOP in Total Feature Memory Read per operator type: 35.6127 MB. 52.7968%. Conv 22.5235 MB. 33.3917%. Relu 5.12912 MB. 7.60406%. FC 2.40586 MB. 3.56675%. Mul 1.78125 MB. 2.64075%. Add 67.4524 MB in Total Feature Memory Written per operator type: 24.9042 MB. 49.1092%. Conv 22.5235 MB. 44.4145%. Relu 2.38963 MB. 4.71216%. Mul 0.890624 MB. 1.75624%. Add 0.004 MB. 0.00788768%. FC 50.712 MB in Total Parameter Memory per operator type: 10.2732 MB. 66.7213%. Conv 5.124 MB. 33.2787%. FC 0 MB. 0%. Add 0 MB. 0%. Mul 0 MB. 0%. Relu 15.3972 MB in Total ``` ## MnasNet-B1 ### Unoptimized ``` Main run finished. Milliseconds per iter: 28.3109. Iters per second: 35.322 Time per operator type: 29.1121 ms. 83.3081%. Conv 4.14959 ms. 11.8746%. SpatialBN 1.35823 ms. 3.88675%. Relu 0.186188 ms. 0.532802%. FC 0.116244 ms. 0.332647%. Add 0.018641 ms. 0.0533437%. AveragePool 0.0040904 ms. 0.0117052%. Squeeze 34.9451 ms in Total FLOP per operator type: 0.626272 GFLOP. 96.2088%. Conv 0.0218266 GFLOP. 3.35303%. SpatialBN 0.002561 GFLOP. 0.393424%. FC 0.000291648 GFLOP. 0.0448034%. Add 0 GFLOP. 0%. Relu 0.650951 GFLOP in Total Feature Memory Read per operator type: 34.4354 MB. 41.3788%. Conv 22.1299 MB. 26.5921%. SpatialBN 19.1923 MB. 23.0622%. Relu 5.12912 MB. 6.16333%. FC 2.33318 MB. 2.80364%. Add 83.2199 MB in Total Feature Memory Written per operator type: 21.8266 MB. 34.0955%. Conv 21.8266 MB. 34.0955%. SpatialBN 19.1923 MB. 29.9805%. Relu 1.16659 MB. 1.82234%. Add 0.004 MB. 0.00624844%. FC 64.016 MB in Total Parameter Memory per operator type: 12.2576 MB. 69.9104%. Conv 5.124 MB. 29.2245%. FC 0.15168 MB. 0.865099%. SpatialBN 0 MB. 0%. Add 0 MB. 0%. Relu 17.5332 MB in Total ``` ### Optimized ``` Main run finished. Milliseconds per iter: 26.6364. Iters per second: 37.5426 Time per operator type: 24.9888 ms. 94.0962%. Conv 1.26147 ms. 4.75011%. Relu 0.176234 ms. 0.663619%. FC 0.113309 ms. 0.426672%. Add 0.0138708 ms. 0.0522311%. AveragePool 0.00295685 ms. 0.0111341%. Squeeze 26.5566 ms in Total FLOP per operator type: 0.626272 GFLOP. 99.5466%. Conv 0.002561 GFLOP. 0.407074%. FC 0.000291648 GFLOP. 0.0463578%. Add 0 GFLOP. 0%. Relu 0.629124 GFLOP in Total Feature Memory Read per operator type: 34.5112 MB. 56.4224%. Conv 19.1923 MB. 31.3775%. Relu 5.12912 MB. 8.3856%. FC 2.33318 MB. 3.81452%. Add 61.1658 MB in Total Feature Memory Written per operator type: 21.8266 MB. 51.7346%. Conv 19.1923 MB. 45.4908%. Relu 1.16659 MB. 2.76513%. Add 0.004 MB. 0.00948104%. FC 42.1895 MB in Total Parameter Memory per operator type: 12.2576 MB. 70.5205%. Conv 5.124 MB. 29.4795%. FC 0 MB. 0%. Add 0 MB. 0%. Relu 17.3816 MB in Total ``` ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2020 Ross Wightman Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/README.md ================================================ # (Generic) EfficientNets for PyTorch A 'generic' implementation of EfficientNet, MixNet, MobileNetV3, etc. that covers most of the compute/parameter efficient architectures derived from the MobileNet V1/V2 block sequence, including those found via automated neural architecture search. All models are implemented by GenEfficientNet or MobileNetV3 classes, with string based architecture definitions to configure the block layouts (idea from [here](https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py)) ## What's New ### Aug 19, 2020 * Add updated PyTorch trained EfficientNet-B3 weights trained by myself with `timm` (82.1 top-1) * Add PyTorch trained EfficientNet-Lite0 contributed by [@hal-314](https://github.com/hal-314) (75.5 top-1) * Update ONNX and Caffe2 export / utility scripts to work with latest PyTorch / ONNX * ONNX runtime based validation script added * activations (mostly) brought in sync with `timm` equivalents ### April 5, 2020 * Add some newly trained MobileNet-V2 models trained with latest h-params, rand augment. They compare quite favourably to EfficientNet-Lite * 3.5M param MobileNet-V2 100 @ 73% * 4.5M param MobileNet-V2 110d @ 75% * 6.1M param MobileNet-V2 140 @ 76.5% * 5.8M param MobileNet-V2 120d @ 77.3% ### March 23, 2020 * Add EfficientNet-Lite models w/ weights ported from [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite) * Add PyTorch trained MobileNet-V3 Large weights with 75.77% top-1 * IMPORTANT CHANGE (if training from scratch) - weight init changed to better match Tensorflow impl, set `fix_group_fanout=False` in `initialize_weight_goog` for old behavior ### Feb 12, 2020 * Add EfficientNet-L2 and B0-B7 NoisyStudent weights ported from [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet) * Port new EfficientNet-B8 (RandAugment) weights from TF TPU, these are different than the B8 AdvProp, different input normalization. * Add RandAugment PyTorch trained EfficientNet-ES (EdgeTPU-Small) weights with 78.1 top-1. Trained by [Andrew Lavin](https://github.com/andravin) ### Jan 22, 2020 * Update weights for EfficientNet B0, B2, B3 and MixNet-XL with latest RandAugment trained weights. Trained with (https://github.com/rwightman/pytorch-image-models) * Fix torchscript compatibility for PyTorch 1.4, add torchscript support for MixedConv2d using ModuleDict * Test models, torchscript, onnx export with PyTorch 1.4 -- no issues ### Nov 22, 2019 * New top-1 high! Ported official TF EfficientNet AdvProp (https://arxiv.org/abs/1911.09665) weights and B8 model spec. Created a new set of `ap` models since they use a different preprocessing (Inception mean/std) from the original EfficientNet base/AA/RA weights. ### Nov 15, 2019 * Ported official TF MobileNet-V3 float32 large/small/minimalistic weights * Modifications to MobileNet-V3 model and components to support some additional config needed for differences between TF MobileNet-V3 and mine ### Oct 30, 2019 * Many of the models will now work with torch.jit.script, MixNet being the biggest exception * Improved interface for enabling torchscript or ONNX export compatible modes (via config) * Add JIT optimized mem-efficient Swish/Mish autograd.fn in addition to memory-efficient autgrad.fn * Activation factory to select best version of activation by name or override one globally * Add pretrained checkpoint load helper that handles input conv and classifier changes ### Oct 27, 2019 * Add CondConv EfficientNet variants ported from https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv * Add RandAug weights for TF EfficientNet B5 and B7 from https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet * Bring over MixNet-XL model and depth scaling algo from my pytorch-image-models code base * Switch activations and global pooling to modules * Add memory-efficient Swish/Mish impl * Add as_sequential() method to all models and allow as an argument in entrypoint fns * Move MobileNetV3 into own file since it has a different head * Remove ChamNet, MobileNet V2/V1 since they will likely never be used here ## Models Implemented models include: * EfficientNet NoisyStudent (B0-B7, L2) (https://arxiv.org/abs/1911.04252) * EfficientNet AdvProp (B0-B8) (https://arxiv.org/abs/1911.09665) * EfficientNet (B0-B8) (https://arxiv.org/abs/1905.11946) * EfficientNet-EdgeTPU (S, M, L) (https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html) * EfficientNet-CondConv (https://arxiv.org/abs/1904.04971) * EfficientNet-Lite (https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite) * MixNet (https://arxiv.org/abs/1907.09595) * MNASNet B1, A1 (Squeeze-Excite), and Small (https://arxiv.org/abs/1807.11626) * MobileNet-V3 (https://arxiv.org/abs/1905.02244) * FBNet-C (https://arxiv.org/abs/1812.03443) * Single-Path NAS (https://arxiv.org/abs/1904.02877) I originally implemented and trained some these models with code [here](https://github.com/rwightman/pytorch-image-models), this repository contains just the GenEfficientNet models, validation, and associated ONNX/Caffe2 export code. ## Pretrained I've managed to train several of the models to accuracies close to or above the originating papers and official impl. My training code is here: https://github.com/rwightman/pytorch-image-models |Model | Prec@1 (Err) | Prec@5 (Err) | Param#(M) | MAdds(M) | Image Scaling | Resolution | Crop | |---|---|---|---|---|---|---|---| | efficientnet_b3 | 82.240 (17.760) | 96.116 (3.884) | 12.23 | TBD | bicubic | 320 | 1.0 | | efficientnet_b3 | 82.076 (17.924) | 96.020 (3.980) | 12.23 | TBD | bicubic | 300 | 0.904 | | mixnet_xl | 81.074 (18.926) | 95.282 (4.718) | 11.90 | TBD | bicubic | 256 | 1.0 | | efficientnet_b2 | 80.612 (19.388) | 95.318 (4.682) | 9.1 | TBD | bicubic | 288 | 1.0 | | mixnet_xl | 80.476 (19.524) | 94.936 (5.064) | 11.90 | TBD | bicubic | 224 | 0.875 | | efficientnet_b2 | 80.288 (19.712) | 95.166 (4.834) | 9.1 | 1003 | bicubic | 260 | 0.890 | | mixnet_l | 78.976 (21.024 | 94.184 (5.816) | 7.33 | TBD | bicubic | 224 | 0.875 | | efficientnet_b1 | 78.692 (21.308) | 94.086 (5.914) | 7.8 | 694 | bicubic | 240 | 0.882 | | efficientnet_es | 78.066 (21.934) | 93.926 (6.074) | 5.44 | TBD | bicubic | 224 | 0.875 | | efficientnet_b0 | 77.698 (22.302) | 93.532 (6.468) | 5.3 | 390 | bicubic | 224 | 0.875 | | mobilenetv2_120d | 77.294 (22.706 | 93.502 (6.498) | 5.8 | TBD | bicubic | 224 | 0.875 | | mixnet_m | 77.256 (22.744) | 93.418 (6.582) | 5.01 | 353 | bicubic | 224 | 0.875 | | mobilenetv2_140 | 76.524 (23.476) | 92.990 (7.010) | 6.1 | TBD | bicubic | 224 | 0.875 | | mixnet_s | 75.988 (24.012) | 92.794 (7.206) | 4.13 | TBD | bicubic | 224 | 0.875 | | mobilenetv3_large_100 | 75.766 (24.234) | 92.542 (7.458) | 5.5 | TBD | bicubic | 224 | 0.875 | | mobilenetv3_rw | 75.634 (24.366) | 92.708 (7.292) | 5.5 | 219 | bicubic | 224 | 0.875 | | efficientnet_lite0 | 75.472 (24.528) | 92.520 (7.480) | 4.65 | TBD | bicubic | 224 | 0.875 | | mnasnet_a1 | 75.448 (24.552) | 92.604 (7.396) | 3.9 | 312 | bicubic | 224 | 0.875 | | fbnetc_100 | 75.124 (24.876) | 92.386 (7.614) | 5.6 | 385 | bilinear | 224 | 0.875 | | mobilenetv2_110d | 75.052 (24.948) | 92.180 (7.820) | 4.5 | TBD | bicubic | 224 | 0.875 | | mnasnet_b1 | 74.658 (25.342) | 92.114 (7.886) | 4.4 | 315 | bicubic | 224 | 0.875 | | spnasnet_100 | 74.084 (25.916) | 91.818 (8.182) | 4.4 | TBD | bilinear | 224 | 0.875 | | mobilenetv2_100 | 72.978 (27.022) | 91.016 (8.984) | 3.5 | TBD | bicubic | 224 | 0.875 | More pretrained models to come... ## Ported Weights The weights ported from Tensorflow checkpoints for the EfficientNet models do pretty much match accuracy in Tensorflow once a SAME convolution padding equivalent is added, and the same crop factors, image scaling, etc (see table) are used via cmd line args. **IMPORTANT:** * Tensorflow ported weights for EfficientNet AdvProp (AP), EfficientNet EdgeTPU, EfficientNet-CondConv, EfficientNet-Lite, and MobileNet-V3 models use Inception style (0.5, 0.5, 0.5) for mean and std. * Enabling the Tensorflow preprocessing pipeline with `--tf-preprocessing` at validation time will improve scores by 0.1-0.5%, very close to original TF impl. To run validation for tf_efficientnet_b5: `python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b5 -b 64 --img-size 456 --crop-pct 0.934 --interpolation bicubic` To run validation w/ TF preprocessing for tf_efficientnet_b5: `python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b5 -b 64 --img-size 456 --tf-preprocessing` To run validation for a model with Inception preprocessing, ie EfficientNet-B8 AdvProp: `python validate.py /path/to/imagenet/validation/ --model tf_efficientnet_b8_ap -b 48 --num-gpu 2 --img-size 672 --crop-pct 0.954 --mean 0.5 --std 0.5` |Model | Prec@1 (Err) | Prec@5 (Err) | Param # | Image Scaling | Image Size | Crop | |---|---|---|---|---|---|---| | tf_efficientnet_l2_ns *tfp | 88.352 (11.648) | 98.652 (1.348) | 480 | bicubic | 800 | N/A | | tf_efficientnet_l2_ns | TBD | TBD | 480 | bicubic | 800 | 0.961 | | tf_efficientnet_l2_ns_475 | 88.234 (11.766) | 98.546 (1.454) | 480 | bicubic | 475 | 0.936 | | tf_efficientnet_l2_ns_475 *tfp | 88.172 (11.828) | 98.566 (1.434) | 480 | bicubic | 475 | N/A | | tf_efficientnet_b7_ns *tfp | 86.844 (13.156) | 98.084 (1.916) | 66.35 | bicubic | 600 | N/A | | tf_efficientnet_b7_ns | 86.840 (13.160) | 98.094 (1.906) | 66.35 | bicubic | 600 | N/A | | tf_efficientnet_b6_ns | 86.452 (13.548) | 97.882 (2.118) | 43.04 | bicubic | 528 | N/A | | tf_efficientnet_b6_ns *tfp | 86.444 (13.556) | 97.880 (2.120) | 43.04 | bicubic | 528 | N/A | | tf_efficientnet_b5_ns *tfp | 86.064 (13.936) | 97.746 (2.254) | 30.39 | bicubic | 456 | N/A | | tf_efficientnet_b5_ns | 86.088 (13.912) | 97.752 (2.248) | 30.39 | bicubic | 456 | N/A | | tf_efficientnet_b8_ap *tfp | 85.436 (14.564) | 97.272 (2.728) | 87.4 | bicubic | 672 | N/A | | tf_efficientnet_b8 *tfp | 85.384 (14.616) | 97.394 (2.606) | 87.4 | bicubic | 672 | N/A | | tf_efficientnet_b8 | 85.370 (14.630) | 97.390 (2.610) | 87.4 | bicubic | 672 | 0.954 | | tf_efficientnet_b8_ap | 85.368 (14.632) | 97.294 (2.706) | 87.4 | bicubic | 672 | 0.954 | | tf_efficientnet_b4_ns *tfp | 85.298 (14.702) | 97.504 (2.496) | 19.34 | bicubic | 380 | N/A | | tf_efficientnet_b4_ns | 85.162 (14.838) | 97.470 (2.530) | 19.34 | bicubic | 380 | 0.922 | | tf_efficientnet_b7_ap *tfp | 85.154 (14.846) | 97.244 (2.756) | 66.35 | bicubic | 600 | N/A | | tf_efficientnet_b7_ap | 85.118 (14.882) | 97.252 (2.748) | 66.35 | bicubic | 600 | 0.949 | | tf_efficientnet_b7 *tfp | 84.940 (15.060) | 97.214 (2.786) | 66.35 | bicubic | 600 | N/A | | tf_efficientnet_b7 | 84.932 (15.068) | 97.208 (2.792) | 66.35 | bicubic | 600 | 0.949 | | tf_efficientnet_b6_ap | 84.786 (15.214) | 97.138 (2.862) | 43.04 | bicubic | 528 | 0.942 | | tf_efficientnet_b6_ap *tfp | 84.760 (15.240) | 97.124 (2.876) | 43.04 | bicubic | 528 | N/A | | tf_efficientnet_b5_ap *tfp | 84.276 (15.724) | 96.932 (3.068) | 30.39 | bicubic | 456 | N/A | | tf_efficientnet_b5_ap | 84.254 (15.746) | 96.976 (3.024) | 30.39 | bicubic | 456 | 0.934 | | tf_efficientnet_b6 *tfp | 84.140 (15.860) | 96.852 (3.148) | 43.04 | bicubic | 528 | N/A | | tf_efficientnet_b6 | 84.110 (15.890) | 96.886 (3.114) | 43.04 | bicubic | 528 | 0.942 | | tf_efficientnet_b3_ns *tfp | 84.054 (15.946) | 96.918 (3.082) | 12.23 | bicubic | 300 | N/A | | tf_efficientnet_b3_ns | 84.048 (15.952) | 96.910 (3.090) | 12.23 | bicubic | 300 | .904 | | tf_efficientnet_b5 *tfp | 83.822 (16.178) | 96.756 (3.244) | 30.39 | bicubic | 456 | N/A | | tf_efficientnet_b5 | 83.812 (16.188) | 96.748 (3.252) | 30.39 | bicubic | 456 | 0.934 | | tf_efficientnet_b4_ap *tfp | 83.278 (16.722) | 96.376 (3.624) | 19.34 | bicubic | 380 | N/A | | tf_efficientnet_b4_ap | 83.248 (16.752) | 96.388 (3.612) | 19.34 | bicubic | 380 | 0.922 | | tf_efficientnet_b4 | 83.022 (16.978) | 96.300 (3.700) | 19.34 | bicubic | 380 | 0.922 | | tf_efficientnet_b4 *tfp | 82.948 (17.052) | 96.308 (3.692) | 19.34 | bicubic | 380 | N/A | | tf_efficientnet_b2_ns *tfp | 82.436 (17.564) | 96.268 (3.732) | 9.11 | bicubic | 260 | N/A | | tf_efficientnet_b2_ns | 82.380 (17.620) | 96.248 (3.752) | 9.11 | bicubic | 260 | 0.89 | | tf_efficientnet_b3_ap *tfp | 81.882 (18.118) | 95.662 (4.338) | 12.23 | bicubic | 300 | N/A | | tf_efficientnet_b3_ap | 81.828 (18.172) | 95.624 (4.376) | 12.23 | bicubic | 300 | 0.904 | | tf_efficientnet_b3 | 81.636 (18.364) | 95.718 (4.282) | 12.23 | bicubic | 300 | 0.904 | | tf_efficientnet_b3 *tfp | 81.576 (18.424) | 95.662 (4.338) | 12.23 | bicubic | 300 | N/A | | tf_efficientnet_lite4 | 81.528 (18.472) | 95.668 (4.332) | 13.00 | bilinear | 380 | 0.92 | | tf_efficientnet_b1_ns *tfp | 81.514 (18.486) | 95.776 (4.224) | 7.79 | bicubic | 240 | N/A | | tf_efficientnet_lite4 *tfp | 81.502 (18.498) | 95.676 (4.324) | 13.00 | bilinear | 380 | N/A | | tf_efficientnet_b1_ns | 81.388 (18.612) | 95.738 (4.262) | 7.79 | bicubic | 240 | 0.88 | | tf_efficientnet_el | 80.534 (19.466) | 95.190 (4.810) | 10.59 | bicubic | 300 | 0.904 | | tf_efficientnet_el *tfp | 80.476 (19.524) | 95.200 (4.800) | 10.59 | bicubic | 300 | N/A | | tf_efficientnet_b2_ap *tfp | 80.420 (19.580) | 95.040 (4.960) | 9.11 | bicubic | 260 | N/A | | tf_efficientnet_b2_ap | 80.306 (19.694) | 95.028 (4.972) | 9.11 | bicubic | 260 | 0.890 | | tf_efficientnet_b2 *tfp | 80.188 (19.812) | 94.974 (5.026) | 9.11 | bicubic | 260 | N/A | | tf_efficientnet_b2 | 80.086 (19.914) | 94.908 (5.092) | 9.11 | bicubic | 260 | 0.890 | | tf_efficientnet_lite3 | 79.812 (20.188) | 94.914 (5.086) | 8.20 | bilinear | 300 | 0.904 | | tf_efficientnet_lite3 *tfp | 79.734 (20.266) | 94.838 (5.162) | 8.20 | bilinear | 300 | N/A | | tf_efficientnet_b1_ap *tfp | 79.532 (20.468) | 94.378 (5.622) | 7.79 | bicubic | 240 | N/A | | tf_efficientnet_cc_b1_8e *tfp | 79.464 (20.536)| 94.492 (5.508) | 39.7 | bicubic | 240 | 0.88 | | tf_efficientnet_cc_b1_8e | 79.298 (20.702) | 94.364 (5.636) | 39.7 | bicubic | 240 | 0.88 | | tf_efficientnet_b1_ap | 79.278 (20.722) | 94.308 (5.692) | 7.79 | bicubic | 240 | 0.88 | | tf_efficientnet_b1 *tfp | 79.172 (20.828) | 94.450 (5.550) | 7.79 | bicubic | 240 | N/A | | tf_efficientnet_em *tfp | 78.958 (21.042) | 94.458 (5.542) | 6.90 | bicubic | 240 | N/A | | tf_efficientnet_b0_ns *tfp | 78.806 (21.194) | 94.496 (5.504) | 5.29 | bicubic | 224 | N/A | | tf_mixnet_l *tfp | 78.846 (21.154) | 94.212 (5.788) | 7.33 | bilinear | 224 | N/A | | tf_efficientnet_b1 | 78.826 (21.174) | 94.198 (5.802) | 7.79 | bicubic | 240 | 0.88 | | tf_mixnet_l | 78.770 (21.230) | 94.004 (5.996) | 7.33 | bicubic | 224 | 0.875 | | tf_efficientnet_em | 78.742 (21.258) | 94.332 (5.668) | 6.90 | bicubic | 240 | 0.875 | | tf_efficientnet_b0_ns | 78.658 (21.342) | 94.376 (5.624) | 5.29 | bicubic | 224 | 0.875 | | tf_efficientnet_cc_b0_8e *tfp | 78.314 (21.686) | 93.790 (6.210) | 24.0 | bicubic | 224 | 0.875 | | tf_efficientnet_cc_b0_8e | 77.908 (22.092) | 93.656 (6.344) | 24.0 | bicubic | 224 | 0.875 | | tf_efficientnet_cc_b0_4e *tfp | 77.746 (22.254) | 93.552 (6.448) | 13.3 | bicubic | 224 | 0.875 | | tf_efficientnet_cc_b0_4e | 77.304 (22.696) | 93.332 (6.668) | 13.3 | bicubic | 224 | 0.875 | | tf_efficientnet_es *tfp | 77.616 (22.384) | 93.750 (6.250) | 5.44 | bicubic | 224 | N/A | | tf_efficientnet_lite2 *tfp | 77.544 (22.456) | 93.800 (6.200) | 6.09 | bilinear | 260 | N/A | | tf_efficientnet_lite2 | 77.460 (22.540) | 93.746 (6.254) | 6.09 | bicubic | 260 | 0.89 | | tf_efficientnet_b0_ap *tfp | 77.514 (22.486) | 93.576 (6.424) | 5.29 | bicubic | 224 | N/A | | tf_efficientnet_es | 77.264 (22.736) | 93.600 (6.400) | 5.44 | bicubic | 224 | N/A | | tf_efficientnet_b0 *tfp | 77.258 (22.742) | 93.478 (6.522) | 5.29 | bicubic | 224 | N/A | | tf_efficientnet_b0_ap | 77.084 (22.916) | 93.254 (6.746) | 5.29 | bicubic | 224 | 0.875 | | tf_mixnet_m *tfp | 77.072 (22.928) | 93.368 (6.632) | 5.01 | bilinear | 224 | N/A | | tf_mixnet_m | 76.950 (23.050) | 93.156 (6.844) | 5.01 | bicubic | 224 | 0.875 | | tf_efficientnet_b0 | 76.848 (23.152) | 93.228 (6.772) | 5.29 | bicubic | 224 | 0.875 | | tf_efficientnet_lite1 *tfp | 76.764 (23.236) | 93.326 (6.674) | 5.42 | bilinear | 240 | N/A | | tf_efficientnet_lite1 | 76.638 (23.362) | 93.232 (6.768) | 5.42 | bicubic | 240 | 0.882 | | tf_mixnet_s *tfp | 75.800 (24.200) | 92.788 (7.212) | 4.13 | bilinear | 224 | N/A | | tf_mobilenetv3_large_100 *tfp | 75.768 (24.232) | 92.710 (7.290) | 5.48 | bilinear | 224 | N/A | | tf_mixnet_s | 75.648 (24.352) | 92.636 (7.364) | 4.13 | bicubic | 224 | 0.875 | | tf_mobilenetv3_large_100 | 75.516 (24.484) | 92.600 (7.400) | 5.48 | bilinear | 224 | 0.875 | | tf_efficientnet_lite0 *tfp | 75.074 (24.926) | 92.314 (7.686) | 4.65 | bilinear | 224 | N/A | | tf_efficientnet_lite0 | 74.842 (25.158) | 92.170 (7.830) | 4.65 | bicubic | 224 | 0.875 | | tf_mobilenetv3_large_075 *tfp | 73.730 (26.270) | 91.616 (8.384) | 3.99 | bilinear | 224 |N/A | | tf_mobilenetv3_large_075 | 73.442 (26.558) | 91.352 (8.648) | 3.99 | bilinear | 224 | 0.875 | | tf_mobilenetv3_large_minimal_100 *tfp | 72.678 (27.322) | 90.860 (9.140) | 3.92 | bilinear | 224 | N/A | | tf_mobilenetv3_large_minimal_100 | 72.244 (27.756) | 90.636 (9.364) | 3.92 | bilinear | 224 | 0.875 | | tf_mobilenetv3_small_100 *tfp | 67.918 (32.082) | 87.958 (12.042 | 2.54 | bilinear | 224 | N/A | | tf_mobilenetv3_small_100 | 67.918 (32.082) | 87.662 (12.338) | 2.54 | bilinear | 224 | 0.875 | | tf_mobilenetv3_small_075 *tfp | 66.142 (33.858) | 86.498 (13.502) | 2.04 | bilinear | 224 | N/A | | tf_mobilenetv3_small_075 | 65.718 (34.282) | 86.136 (13.864) | 2.04 | bilinear | 224 | 0.875 | | tf_mobilenetv3_small_minimal_100 *tfp | 63.378 (36.622) | 84.802 (15.198) | 2.04 | bilinear | 224 | N/A | | tf_mobilenetv3_small_minimal_100 | 62.898 (37.102) | 84.230 (15.770) | 2.04 | bilinear | 224 | 0.875 | *tfp models validated with `tf-preprocessing` pipeline Google tf and tflite weights ported from official Tensorflow repositories * https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet * https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet * https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet ## Usage ### Environment All development and testing has been done in Conda Python 3 environments on Linux x86-64 systems, specifically Python 3.6.x, 3.7.x, 3.8.x. Users have reported that a Python 3 Anaconda install in Windows works. I have not verified this myself. PyTorch versions 1.4, 1.5, 1.6 have been tested with this code. I've tried to keep the dependencies minimal, the setup is as per the PyTorch default install instructions for Conda: ``` conda create -n torch-env conda activate torch-env conda install -c pytorch pytorch torchvision cudatoolkit=10.2 ``` ### PyTorch Hub Models can be accessed via the PyTorch Hub API ``` >>> torch.hub.list('rwightman/gen-efficientnet-pytorch') ['efficientnet_b0', ...] >>> model = torch.hub.load('rwightman/gen-efficientnet-pytorch', 'efficientnet_b0', pretrained=True) >>> model.eval() >>> output = model(torch.randn(1,3,224,224)) ``` ### Pip This package can be installed via pip. Install (after conda env/install): ``` pip install geffnet ``` Eval use: ``` >>> import geffnet >>> m = geffnet.create_model('mobilenetv3_large_100', pretrained=True) >>> m.eval() ``` Train use: ``` >>> import geffnet >>> # models can also be created by using the entrypoint directly >>> m = geffnet.efficientnet_b2(pretrained=True, drop_rate=0.25, drop_connect_rate=0.2) >>> m.train() ``` Create in a nn.Sequential container, for fast.ai, etc: ``` >>> import geffnet >>> m = geffnet.mixnet_l(pretrained=True, drop_rate=0.25, drop_connect_rate=0.2, as_sequential=True) ``` ### Exporting Scripts are included to * export models to ONNX (`onnx_export.py`) * optimized ONNX graph (`onnx_optimize.py` or `onnx_validate.py` w/ `--onnx-output-opt` arg) * validate with ONNX runtime (`onnx_validate.py`) * convert ONNX model to Caffe2 (`onnx_to_caffe.py`) * validate in Caffe2 (`caffe2_validate.py`) * benchmark in Caffe2 w/ FLOPs, parameters output (`caffe2_benchmark.py`) As an example, to export the MobileNet-V3 pretrained model and then run an Imagenet validation: ``` python onnx_export.py --model mobilenetv3_large_100 ./mobilenetv3_100.onnx python onnx_validate.py /imagenet/validation/ --onnx-input ./mobilenetv3_100.onnx ``` These scripts were tested to be working as of PyTorch 1.6 and ONNX 1.7 w/ ONNX runtime 1.4. Caffe2 compatible export now requires additional args mentioned in the export script (not needed in earlier versions). #### Export Notes 1. The TF ported weights with the 'SAME' conv padding activated cannot be exported to ONNX unless `_EXPORTABLE` flag in `config.py` is set to True. Use `config.set_exportable(True)` as in the `onnx_export.py` script. 2. TF ported models with 'SAME' padding will have the padding fixed at export time to the resolution used for export. Even though dynamic padding is supported in opset >= 11, I can't get it working. 3. ONNX optimize facility doesn't work reliably in PyTorch 1.6 / ONNX 1.7. Fortunately, the onnxruntime based inference is working very well now and includes on the fly optimization. 3. ONNX / Caffe2 export/import frequently breaks with different PyTorch and ONNX version releases. Please check their respective issue trackers before filing issues here. ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/caffe2_benchmark.py ================================================ """ Caffe2 validation script This script runs Caffe2 benchmark on exported ONNX model. It is a useful tool for reporting model FLOPS. Copyright 2020 Ross Wightman """ import argparse from caffe2.python import core, workspace, model_helper from caffe2.proto import caffe2_pb2 parser = argparse.ArgumentParser(description='Caffe2 Model Benchmark') parser.add_argument('--c2-prefix', default='', type=str, metavar='NAME', help='caffe2 model pb name prefix') parser.add_argument('--c2-init', default='', type=str, metavar='PATH', help='caffe2 model init .pb') parser.add_argument('--c2-predict', default='', type=str, metavar='PATH', help='caffe2 model predict .pb') parser.add_argument('-b', '--batch-size', default=1, type=int, metavar='N', help='mini-batch size (default: 1)') parser.add_argument('--img-size', default=224, type=int, metavar='N', help='Input image dimension, uses model default if empty') def main(): args = parser.parse_args() args.gpu_id = 0 if args.c2_prefix: args.c2_init = args.c2_prefix + '.init.pb' args.c2_predict = args.c2_prefix + '.predict.pb' model = model_helper.ModelHelper(name="le_net", init_params=False) # Bring in the init net from init_net.pb init_net_proto = caffe2_pb2.NetDef() with open(args.c2_init, "rb") as f: init_net_proto.ParseFromString(f.read()) model.param_init_net = core.Net(init_net_proto) # bring in the predict net from predict_net.pb predict_net_proto = caffe2_pb2.NetDef() with open(args.c2_predict, "rb") as f: predict_net_proto.ParseFromString(f.read()) model.net = core.Net(predict_net_proto) # CUDA performance not impressive #device_opts = core.DeviceOption(caffe2_pb2.PROTO_CUDA, args.gpu_id) #model.net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True) #model.param_init_net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True) input_blob = model.net.external_inputs[0] model.param_init_net.GaussianFill( [], input_blob.GetUnscopedName(), shape=(args.batch_size, 3, args.img_size, args.img_size), mean=0.0, std=1.0) workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net, overwrite=True) workspace.BenchmarkNet(model.net.Proto().name, 5, 20, True) if __name__ == '__main__': main() ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/caffe2_validate.py ================================================ """ Caffe2 validation script This script is created to verify exported ONNX models running in Caffe2 It utilizes the same PyTorch dataloader/processing pipeline for a fair comparison against the originals. Copyright 2020 Ross Wightman """ import argparse import numpy as np from caffe2.python import core, workspace, model_helper from caffe2.proto import caffe2_pb2 from data import create_loader, resolve_data_config, Dataset from utils import AverageMeter import time parser = argparse.ArgumentParser(description='Caffe2 ImageNet Validation') parser.add_argument('data', metavar='DIR', help='path to dataset') parser.add_argument('--c2-prefix', default='', type=str, metavar='NAME', help='caffe2 model pb name prefix') parser.add_argument('--c2-init', default='', type=str, metavar='PATH', help='caffe2 model init .pb') parser.add_argument('--c2-predict', default='', type=str, metavar='PATH', help='caffe2 model predict .pb') parser.add_argument('-j', '--workers', default=2, type=int, metavar='N', help='number of data loading workers (default: 2)') parser.add_argument('-b', '--batch-size', default=256, type=int, metavar='N', help='mini-batch size (default: 256)') parser.add_argument('--img-size', default=None, type=int, metavar='N', help='Input image dimension, uses model default if empty') parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', help='Override mean pixel value of dataset') parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', help='Override std deviation of of dataset') parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT', help='Override default crop pct of 0.875') parser.add_argument('--interpolation', default='', type=str, metavar='NAME', help='Image resize interpolation type (overrides model)') parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true', help='use tensorflow mnasnet preporcessing') parser.add_argument('--print-freq', '-p', default=10, type=int, metavar='N', help='print frequency (default: 10)') def main(): args = parser.parse_args() args.gpu_id = 0 if args.c2_prefix: args.c2_init = args.c2_prefix + '.init.pb' args.c2_predict = args.c2_prefix + '.predict.pb' model = model_helper.ModelHelper(name="validation_net", init_params=False) # Bring in the init net from init_net.pb init_net_proto = caffe2_pb2.NetDef() with open(args.c2_init, "rb") as f: init_net_proto.ParseFromString(f.read()) model.param_init_net = core.Net(init_net_proto) # bring in the predict net from predict_net.pb predict_net_proto = caffe2_pb2.NetDef() with open(args.c2_predict, "rb") as f: predict_net_proto.ParseFromString(f.read()) model.net = core.Net(predict_net_proto) data_config = resolve_data_config(None, args) loader = create_loader( Dataset(args.data, load_bytes=args.tf_preprocessing), input_size=data_config['input_size'], batch_size=args.batch_size, use_prefetcher=False, interpolation=data_config['interpolation'], mean=data_config['mean'], std=data_config['std'], num_workers=args.workers, crop_pct=data_config['crop_pct'], tensorflow_preprocessing=args.tf_preprocessing) # this is so obvious, wonderful interface input_blob = model.net.external_inputs[0] output_blob = model.net.external_outputs[0] if True: device_opts = None else: # CUDA is crashing, no idea why, awesome error message, give it a try for kicks device_opts = core.DeviceOption(caffe2_pb2.PROTO_CUDA, args.gpu_id) model.net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True) model.param_init_net.RunAllOnGPU(gpu_id=args.gpu_id, use_cudnn=True) model.param_init_net.GaussianFill( [], input_blob.GetUnscopedName(), shape=(1,) + data_config['input_size'], mean=0.0, std=1.0) workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net, overwrite=True) batch_time = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() end = time.time() for i, (input, target) in enumerate(loader): # run the net and return prediction caffe2_in = input.data.numpy() workspace.FeedBlob(input_blob, caffe2_in, device_opts) workspace.RunNet(model.net, num_iter=1) output = workspace.FetchBlob(output_blob) # measure accuracy and record loss prec1, prec5 = accuracy_np(output.data, target.numpy()) top1.update(prec1.item(), input.size(0)) top5.update(prec5.item(), input.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print('Test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s, {ms_avg:.3f} ms/sample) \t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg, ms_avg=100 * batch_time.avg / input.size(0), top1=top1, top5=top5)) print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format( top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg)) def accuracy_np(output, target): max_indices = np.argsort(output, axis=1)[:, ::-1] top5 = 100 * np.equal(max_indices[:, :5], target[:, np.newaxis]).sum(axis=1).mean() top1 = 100 * np.equal(max_indices[:, 0], target).mean() return top1, top5 if __name__ == '__main__': main() ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/__init__.py ================================================ from .gen_efficientnet import * from .mobilenetv3 import * from .model_factory import create_model from .config import is_exportable, is_scriptable, set_exportable, set_scriptable from .activations import * ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/activations/__init__.py ================================================ from geffnet import config from geffnet.activations.activations_me import * from geffnet.activations.activations_jit import * from geffnet.activations.activations import * import torch _has_silu = 'silu' in dir(torch.nn.functional) _ACT_FN_DEFAULT = dict( silu=F.silu if _has_silu else swish, swish=F.silu if _has_silu else swish, mish=mish, relu=F.relu, relu6=F.relu6, sigmoid=sigmoid, tanh=tanh, hard_sigmoid=hard_sigmoid, hard_swish=hard_swish, ) _ACT_FN_JIT = dict( silu=F.silu if _has_silu else swish_jit, swish=F.silu if _has_silu else swish_jit, mish=mish_jit, ) _ACT_FN_ME = dict( silu=F.silu if _has_silu else swish_me, swish=F.silu if _has_silu else swish_me, mish=mish_me, hard_swish=hard_swish_me, hard_sigmoid_jit=hard_sigmoid_me, ) _ACT_LAYER_DEFAULT = dict( silu=nn.SiLU if _has_silu else Swish, swish=nn.SiLU if _has_silu else Swish, mish=Mish, relu=nn.ReLU, relu6=nn.ReLU6, sigmoid=Sigmoid, tanh=Tanh, hard_sigmoid=HardSigmoid, hard_swish=HardSwish, ) _ACT_LAYER_JIT = dict( silu=nn.SiLU if _has_silu else SwishJit, swish=nn.SiLU if _has_silu else SwishJit, mish=MishJit, ) _ACT_LAYER_ME = dict( silu=nn.SiLU if _has_silu else SwishMe, swish=nn.SiLU if _has_silu else SwishMe, mish=MishMe, hard_swish=HardSwishMe, hard_sigmoid=HardSigmoidMe ) _OVERRIDE_FN = dict() _OVERRIDE_LAYER = dict() def add_override_act_fn(name, fn): global _OVERRIDE_FN _OVERRIDE_FN[name] = fn def update_override_act_fn(overrides): assert isinstance(overrides, dict) global _OVERRIDE_FN _OVERRIDE_FN.update(overrides) def clear_override_act_fn(): global _OVERRIDE_FN _OVERRIDE_FN = dict() def add_override_act_layer(name, fn): _OVERRIDE_LAYER[name] = fn def update_override_act_layer(overrides): assert isinstance(overrides, dict) global _OVERRIDE_LAYER _OVERRIDE_LAYER.update(overrides) def clear_override_act_layer(): global _OVERRIDE_LAYER _OVERRIDE_LAYER = dict() def get_act_fn(name='relu'): """ Activation Function Factory Fetching activation fns by name with this function allows export or torch script friendly functions to be returned dynamically based on current config. """ if name in _OVERRIDE_FN: return _OVERRIDE_FN[name] use_me = not (config.is_exportable() or config.is_scriptable() or config.is_no_jit()) if use_me and name in _ACT_FN_ME: # If not exporting or scripting the model, first look for a memory optimized version # activation with custom autograd, then fallback to jit scripted, then a Python or Torch builtin return _ACT_FN_ME[name] if config.is_exportable() and name in ('silu', 'swish'): # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack return swish use_jit = not (config.is_exportable() or config.is_no_jit()) # NOTE: export tracing should work with jit scripted components, but I keep running into issues if use_jit and name in _ACT_FN_JIT: # jit scripted models should be okay for export/scripting return _ACT_FN_JIT[name] return _ACT_FN_DEFAULT[name] def get_act_layer(name='relu'): """ Activation Layer Factory Fetching activation layers by name with this function allows export or torch script friendly functions to be returned dynamically based on current config. """ if name in _OVERRIDE_LAYER: return _OVERRIDE_LAYER[name] use_me = not (config.is_exportable() or config.is_scriptable() or config.is_no_jit()) if use_me and name in _ACT_LAYER_ME: return _ACT_LAYER_ME[name] if config.is_exportable() and name in ('silu', 'swish'): # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack return Swish use_jit = not (config.is_exportable() or config.is_no_jit()) # NOTE: export tracing should work with jit scripted components, but I keep running into issues if use_jit and name in _ACT_FN_JIT: # jit scripted models should be okay for export/scripting return _ACT_LAYER_JIT[name] return _ACT_LAYER_DEFAULT[name] ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/activations/activations.py ================================================ """ Activations A collection of activations fn and modules with a common interface so that they can easily be swapped. All have an `inplace` arg even if not used. Copyright 2020 Ross Wightman """ from torch import nn as nn from torch.nn import functional as F def swish(x, inplace: bool = False): """Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3) and also as Swish (https://arxiv.org/abs/1710.05941). TODO Rename to SiLU with addition to PyTorch """ return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid()) class Swish(nn.Module): def __init__(self, inplace: bool = False): super(Swish, self).__init__() self.inplace = inplace def forward(self, x): return swish(x, self.inplace) def mish(x, inplace: bool = False): """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 """ return x.mul(F.softplus(x).tanh()) class Mish(nn.Module): def __init__(self, inplace: bool = False): super(Mish, self).__init__() self.inplace = inplace def forward(self, x): return mish(x, self.inplace) def sigmoid(x, inplace: bool = False): return x.sigmoid_() if inplace else x.sigmoid() # PyTorch has this, but not with a consistent inplace argmument interface class Sigmoid(nn.Module): def __init__(self, inplace: bool = False): super(Sigmoid, self).__init__() self.inplace = inplace def forward(self, x): return x.sigmoid_() if self.inplace else x.sigmoid() def tanh(x, inplace: bool = False): return x.tanh_() if inplace else x.tanh() # PyTorch has this, but not with a consistent inplace argmument interface class Tanh(nn.Module): def __init__(self, inplace: bool = False): super(Tanh, self).__init__() self.inplace = inplace def forward(self, x): return x.tanh_() if self.inplace else x.tanh() def hard_swish(x, inplace: bool = False): inner = F.relu6(x + 3.).div_(6.) return x.mul_(inner) if inplace else x.mul(inner) class HardSwish(nn.Module): def __init__(self, inplace: bool = False): super(HardSwish, self).__init__() self.inplace = inplace def forward(self, x): return hard_swish(x, self.inplace) def hard_sigmoid(x, inplace: bool = False): if inplace: return x.add_(3.).clamp_(0., 6.).div_(6.) else: return F.relu6(x + 3.) / 6. class HardSigmoid(nn.Module): def __init__(self, inplace: bool = False): super(HardSigmoid, self).__init__() self.inplace = inplace def forward(self, x): return hard_sigmoid(x, self.inplace) ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/activations/activations_jit.py ================================================ """ Activations (jit) A collection of jit-scripted activations fn and modules with a common interface so that they can easily be swapped. All have an `inplace` arg even if not used. All jit scripted activations are lacking in-place variations on purpose, scripted kernel fusion does not currently work across in-place op boundaries, thus performance is equal to or less than the non-scripted versions if they contain in-place ops. Copyright 2020 Ross Wightman """ import torch from torch import nn as nn from torch.nn import functional as F __all__ = ['swish_jit', 'SwishJit', 'mish_jit', 'MishJit', 'hard_sigmoid_jit', 'HardSigmoidJit', 'hard_swish_jit', 'HardSwishJit'] @torch.jit.script def swish_jit(x, inplace: bool = False): """Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3) and also as Swish (https://arxiv.org/abs/1710.05941). TODO Rename to SiLU with addition to PyTorch """ return x.mul(x.sigmoid()) @torch.jit.script def mish_jit(x, _inplace: bool = False): """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 """ return x.mul(F.softplus(x).tanh()) class SwishJit(nn.Module): def __init__(self, inplace: bool = False): super(SwishJit, self).__init__() def forward(self, x): return swish_jit(x) class MishJit(nn.Module): def __init__(self, inplace: bool = False): super(MishJit, self).__init__() def forward(self, x): return mish_jit(x) @torch.jit.script def hard_sigmoid_jit(x, inplace: bool = False): # return F.relu6(x + 3.) / 6. return (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster? class HardSigmoidJit(nn.Module): def __init__(self, inplace: bool = False): super(HardSigmoidJit, self).__init__() def forward(self, x): return hard_sigmoid_jit(x) @torch.jit.script def hard_swish_jit(x, inplace: bool = False): # return x * (F.relu6(x + 3.) / 6) return x * (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster? class HardSwishJit(nn.Module): def __init__(self, inplace: bool = False): super(HardSwishJit, self).__init__() def forward(self, x): return hard_swish_jit(x) ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/activations/activations_me.py ================================================ """ Activations (memory-efficient w/ custom autograd) A collection of activations fn and modules with a common interface so that they can easily be swapped. All have an `inplace` arg even if not used. These activations are not compatible with jit scripting or ONNX export of the model, please use either the JIT or basic versions of the activations. Copyright 2020 Ross Wightman """ import torch from torch import nn as nn from torch.nn import functional as F __all__ = ['swish_me', 'SwishMe', 'mish_me', 'MishMe', 'hard_sigmoid_me', 'HardSigmoidMe', 'hard_swish_me', 'HardSwishMe'] @torch.jit.script def swish_jit_fwd(x): return x.mul(torch.sigmoid(x)) @torch.jit.script def swish_jit_bwd(x, grad_output): x_sigmoid = torch.sigmoid(x) return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid))) class SwishJitAutoFn(torch.autograd.Function): """ torch.jit.script optimised Swish w/ memory-efficient checkpoint Inspired by conversation btw Jeremy Howard & Adam Pazske https://twitter.com/jeremyphoward/status/1188251041835315200 Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3) and also as Swish (https://arxiv.org/abs/1710.05941). TODO Rename to SiLU with addition to PyTorch """ @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return swish_jit_fwd(x) @staticmethod def backward(ctx, grad_output): x = ctx.saved_tensors[0] return swish_jit_bwd(x, grad_output) def swish_me(x, inplace=False): return SwishJitAutoFn.apply(x) class SwishMe(nn.Module): def __init__(self, inplace: bool = False): super(SwishMe, self).__init__() def forward(self, x): return SwishJitAutoFn.apply(x) @torch.jit.script def mish_jit_fwd(x): return x.mul(torch.tanh(F.softplus(x))) @torch.jit.script def mish_jit_bwd(x, grad_output): x_sigmoid = torch.sigmoid(x) x_tanh_sp = F.softplus(x).tanh() return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp)) class MishJitAutoFn(torch.autograd.Function): """ Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 A memory efficient, jit scripted variant of Mish """ @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return mish_jit_fwd(x) @staticmethod def backward(ctx, grad_output): x = ctx.saved_tensors[0] return mish_jit_bwd(x, grad_output) def mish_me(x, inplace=False): return MishJitAutoFn.apply(x) class MishMe(nn.Module): def __init__(self, inplace: bool = False): super(MishMe, self).__init__() def forward(self, x): return MishJitAutoFn.apply(x) @torch.jit.script def hard_sigmoid_jit_fwd(x, inplace: bool = False): return (x + 3).clamp(min=0, max=6).div(6.) @torch.jit.script def hard_sigmoid_jit_bwd(x, grad_output): m = torch.ones_like(x) * ((x >= -3.) & (x <= 3.)) / 6. return grad_output * m class HardSigmoidJitAutoFn(torch.autograd.Function): @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return hard_sigmoid_jit_fwd(x) @staticmethod def backward(ctx, grad_output): x = ctx.saved_tensors[0] return hard_sigmoid_jit_bwd(x, grad_output) def hard_sigmoid_me(x, inplace: bool = False): return HardSigmoidJitAutoFn.apply(x) class HardSigmoidMe(nn.Module): def __init__(self, inplace: bool = False): super(HardSigmoidMe, self).__init__() def forward(self, x): return HardSigmoidJitAutoFn.apply(x) @torch.jit.script def hard_swish_jit_fwd(x): return x * (x + 3).clamp(min=0, max=6).div(6.) @torch.jit.script def hard_swish_jit_bwd(x, grad_output): m = torch.ones_like(x) * (x >= 3.) m = torch.where((x >= -3.) & (x <= 3.), x / 3. + .5, m) return grad_output * m class HardSwishJitAutoFn(torch.autograd.Function): """A memory efficient, jit-scripted HardSwish activation""" @staticmethod def forward(ctx, x): ctx.save_for_backward(x) return hard_swish_jit_fwd(x) @staticmethod def backward(ctx, grad_output): x = ctx.saved_tensors[0] return hard_swish_jit_bwd(x, grad_output) def hard_swish_me(x, inplace=False): return HardSwishJitAutoFn.apply(x) class HardSwishMe(nn.Module): def __init__(self, inplace: bool = False): super(HardSwishMe, self).__init__() def forward(self, x): return HardSwishJitAutoFn.apply(x) ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/config.py ================================================ """ Global layer config state """ from typing import Any, Optional __all__ = [ 'is_exportable', 'is_scriptable', 'is_no_jit', 'layer_config_kwargs', 'set_exportable', 'set_scriptable', 'set_no_jit', 'set_layer_config' ] # Set to True if prefer to have layers with no jit optimization (includes activations) _NO_JIT = False # Set to True if prefer to have activation layers with no jit optimization # NOTE not currently used as no difference between no_jit and no_activation jit as only layers obeying # the jit flags so far are activations. This will change as more layers are updated and/or added. _NO_ACTIVATION_JIT = False # Set to True if exporting a model with Same padding via ONNX _EXPORTABLE = False # Set to True if wanting to use torch.jit.script on a model _SCRIPTABLE = False def is_no_jit(): return _NO_JIT class set_no_jit: def __init__(self, mode: bool) -> None: global _NO_JIT self.prev = _NO_JIT _NO_JIT = mode def __enter__(self) -> None: pass def __exit__(self, *args: Any) -> bool: global _NO_JIT _NO_JIT = self.prev return False def is_exportable(): return _EXPORTABLE class set_exportable: def __init__(self, mode: bool) -> None: global _EXPORTABLE self.prev = _EXPORTABLE _EXPORTABLE = mode def __enter__(self) -> None: pass def __exit__(self, *args: Any) -> bool: global _EXPORTABLE _EXPORTABLE = self.prev return False def is_scriptable(): return _SCRIPTABLE class set_scriptable: def __init__(self, mode: bool) -> None: global _SCRIPTABLE self.prev = _SCRIPTABLE _SCRIPTABLE = mode def __enter__(self) -> None: pass def __exit__(self, *args: Any) -> bool: global _SCRIPTABLE _SCRIPTABLE = self.prev return False class set_layer_config: """ Layer config context manager that allows setting all layer config flags at once. If a flag arg is None, it will not change the current value. """ def __init__( self, scriptable: Optional[bool] = None, exportable: Optional[bool] = None, no_jit: Optional[bool] = None, no_activation_jit: Optional[bool] = None): global _SCRIPTABLE global _EXPORTABLE global _NO_JIT global _NO_ACTIVATION_JIT self.prev = _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT if scriptable is not None: _SCRIPTABLE = scriptable if exportable is not None: _EXPORTABLE = exportable if no_jit is not None: _NO_JIT = no_jit if no_activation_jit is not None: _NO_ACTIVATION_JIT = no_activation_jit def __enter__(self) -> None: pass def __exit__(self, *args: Any) -> bool: global _SCRIPTABLE global _EXPORTABLE global _NO_JIT global _NO_ACTIVATION_JIT _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT = self.prev return False def layer_config_kwargs(kwargs): """ Consume config kwargs and return contextmgr obj """ return set_layer_config( scriptable=kwargs.pop('scriptable', None), exportable=kwargs.pop('exportable', None), no_jit=kwargs.pop('no_jit', None)) ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/conv2d_layers.py ================================================ """ Conv2D w/ SAME padding, CondConv, MixedConv A collection of conv layers and padding helpers needed by EfficientNet, MixNet, and MobileNetV3 models that maintain weight compatibility with original Tensorflow models. Copyright 2020 Ross Wightman """ import collections.abc import math from functools import partial from itertools import repeat from typing import Tuple, Optional import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from .config import * # From PyTorch internals def _ntuple(n): def parse(x): if isinstance(x, collections.abc.Iterable): return x return tuple(repeat(x, n)) return parse _single = _ntuple(1) _pair = _ntuple(2) _triple = _ntuple(3) _quadruple = _ntuple(4) def _is_static_pad(kernel_size, stride=1, dilation=1, **_): return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 def _get_padding(kernel_size, stride=1, dilation=1, **_): padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 return padding def _calc_same_pad(i: int, k: int, s: int, d: int): return max((-(i // -s) - 1) * s + (k - 1) * d + 1 - i, 0) def _same_pad_arg(input_size, kernel_size, stride, dilation): ih, iw = input_size kh, kw = kernel_size pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0]) pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1]) return [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2] def _split_channels(num_chan, num_groups): split = [num_chan // num_groups for _ in range(num_groups)] split[0] += num_chan - sum(split) return split def conv2d_same( x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1), padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1): ih, iw = x.size()[-2:] kh, kw = weight.size()[-2:] pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0]) pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1]) x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) class Conv2dSame(nn.Conv2d): """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions """ # pylint: disable=unused-argument def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True): super(Conv2dSame, self).__init__( in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) def forward(self, x): return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) class Conv2dSameExport(nn.Conv2d): """ ONNX export friendly Tensorflow like 'SAME' convolution wrapper for 2D convolutions NOTE: This does not currently work with torch.jit.script """ # pylint: disable=unused-argument def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True): super(Conv2dSameExport, self).__init__( in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) self.pad = None self.pad_input_size = (0, 0) def forward(self, x): input_size = x.size()[-2:] if self.pad is None: pad_arg = _same_pad_arg(input_size, self.weight.size()[-2:], self.stride, self.dilation) self.pad = nn.ZeroPad2d(pad_arg) self.pad_input_size = input_size if self.pad is not None: x = self.pad(x) return F.conv2d( x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) def get_padding_value(padding, kernel_size, **kwargs): dynamic = False if isinstance(padding, str): # for any string padding, the padding will be calculated for you, one of three ways padding = padding.lower() if padding == 'same': # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact if _is_static_pad(kernel_size, **kwargs): # static case, no extra overhead padding = _get_padding(kernel_size, **kwargs) else: # dynamic padding padding = 0 dynamic = True elif padding == 'valid': # 'VALID' padding, same as padding=0 padding = 0 else: # Default to PyTorch style 'same'-ish symmetric padding padding = _get_padding(kernel_size, **kwargs) return padding, dynamic def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): padding = kwargs.pop('padding', '') kwargs.setdefault('bias', False) padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs) if is_dynamic: if is_exportable(): assert not is_scriptable() return Conv2dSameExport(in_chs, out_chs, kernel_size, **kwargs) else: return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs) else: return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs) class MixedConv2d(nn.ModuleDict): """ Mixed Grouped Convolution Based on MDConv and GroupedConv in MixNet impl: https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py """ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding='', dilation=1, depthwise=False, **kwargs): super(MixedConv2d, self).__init__() kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size] num_groups = len(kernel_size) in_splits = _split_channels(in_channels, num_groups) out_splits = _split_channels(out_channels, num_groups) self.in_channels = sum(in_splits) self.out_channels = sum(out_splits) for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)): conv_groups = out_ch if depthwise else 1 self.add_module( str(idx), create_conv2d_pad( in_ch, out_ch, k, stride=stride, padding=padding, dilation=dilation, groups=conv_groups, **kwargs) ) self.splits = in_splits def forward(self, x): x_split = torch.split(x, self.splits, 1) x_out = [conv(x_split[i]) for i, conv in enumerate(self.values())] x = torch.cat(x_out, 1) return x def get_condconv_initializer(initializer, num_experts, expert_shape): def condconv_initializer(weight): """CondConv initializer function.""" num_params = np.prod(expert_shape) if (len(weight.shape) != 2 or weight.shape[0] != num_experts or weight.shape[1] != num_params): raise (ValueError( 'CondConv variables must have shape [num_experts, num_params]')) for i in range(num_experts): initializer(weight[i].view(expert_shape)) return condconv_initializer class CondConv2d(nn.Module): """ Conditional Convolution Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion: https://github.com/pytorch/pytorch/issues/17983 """ __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding'] def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4): super(CondConv2d, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = _pair(stride) padding_val, is_padding_dynamic = get_padding_value( padding, kernel_size, stride=stride, dilation=dilation) self.dynamic_padding = is_padding_dynamic # if in forward to work with torchscript self.padding = _pair(padding_val) self.dilation = _pair(dilation) self.groups = groups self.num_experts = num_experts self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size weight_num_param = 1 for wd in self.weight_shape: weight_num_param *= wd self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param)) if bias: self.bias_shape = (self.out_channels,) self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels)) else: self.register_parameter('bias', None) self.reset_parameters() def reset_parameters(self): init_weight = get_condconv_initializer( partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape) init_weight(self.weight) if self.bias is not None: fan_in = np.prod(self.weight_shape[1:]) bound = 1 / math.sqrt(fan_in) init_bias = get_condconv_initializer( partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape) init_bias(self.bias) def forward(self, x, routing_weights): B, C, H, W = x.shape weight = torch.matmul(routing_weights, self.weight) new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size weight = weight.view(new_weight_shape) bias = None if self.bias is not None: bias = torch.matmul(routing_weights, self.bias) bias = bias.view(B * self.out_channels) # move batch elements with channels so each batch element can be efficiently convolved with separate kernel x = x.view(1, B * C, H, W) if self.dynamic_padding: out = conv2d_same( x, weight, bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups * B) else: out = F.conv2d( x, weight, bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups * B) out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1]) # Literal port (from TF definition) # x = torch.split(x, 1, 0) # weight = torch.split(weight, 1, 0) # if self.bias is not None: # bias = torch.matmul(routing_weights, self.bias) # bias = torch.split(bias, 1, 0) # else: # bias = [None] * B # out = [] # for xi, wi, bi in zip(x, weight, bias): # wi = wi.view(*self.weight_shape) # if bi is not None: # bi = bi.view(*self.bias_shape) # out.append(self.conv_fn( # xi, wi, bi, stride=self.stride, padding=self.padding, # dilation=self.dilation, groups=self.groups)) # out = torch.cat(out, 0) return out def select_conv2d(in_chs, out_chs, kernel_size, **kwargs): assert 'groups' not in kwargs # only use 'depthwise' bool arg if isinstance(kernel_size, list): assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently # We're going to use only lists for defining the MixedConv2d kernel groups, # ints, tuples, other iterables will continue to pass to normal conv and specify h, w. m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs) else: depthwise = kwargs.pop('depthwise', False) groups = out_chs if depthwise else 1 if 'num_experts' in kwargs and kwargs['num_experts'] > 0: m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs) else: m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs) return m ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/efficientnet_builder.py ================================================ """ EfficientNet / MobileNetV3 Blocks and Builder Copyright 2020 Ross Wightman """ import re from copy import deepcopy from .conv2d_layers import * from geffnet.activations import * __all__ = ['get_bn_args_tf', 'resolve_bn_args', 'resolve_se_args', 'resolve_act_layer', 'make_divisible', 'round_channels', 'drop_connect', 'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv', 'InvertedResidual', 'CondConvResidual', 'EdgeResidual', 'EfficientNetBuilder', 'decode_arch_def', 'initialize_weight_default', 'initialize_weight_goog', 'BN_MOMENTUM_TF_DEFAULT', 'BN_EPS_TF_DEFAULT' ] # Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per # papers and TF reference implementations. PT momentum equiv for TF decay is (1 - TF decay) # NOTE: momentum varies btw .99 and .9997 depending on source # .99 in official TF TPU impl # .9997 (/w .999 in search space) for paper # # PyTorch defaults are momentum = .1, eps = 1e-5 # BN_MOMENTUM_TF_DEFAULT = 1 - 0.99 BN_EPS_TF_DEFAULT = 1e-3 _BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT) def get_bn_args_tf(): return _BN_ARGS_TF.copy() def resolve_bn_args(kwargs): bn_args = get_bn_args_tf() if kwargs.pop('bn_tf', False) else {} bn_momentum = kwargs.pop('bn_momentum', None) if bn_momentum is not None: bn_args['momentum'] = bn_momentum bn_eps = kwargs.pop('bn_eps', None) if bn_eps is not None: bn_args['eps'] = bn_eps return bn_args _SE_ARGS_DEFAULT = dict( gate_fn=sigmoid, act_layer=None, # None == use containing block's activation layer reduce_mid=False, divisor=1) def resolve_se_args(kwargs, in_chs, act_layer=None): se_kwargs = kwargs.copy() if kwargs is not None else {} # fill in args that aren't specified with the defaults for k, v in _SE_ARGS_DEFAULT.items(): se_kwargs.setdefault(k, v) # some models, like MobilNetV3, calculate SE reduction chs from the containing block's mid_ch instead of in_ch if not se_kwargs.pop('reduce_mid'): se_kwargs['reduced_base_chs'] = in_chs # act_layer override, if it remains None, the containing block's act_layer will be used if se_kwargs['act_layer'] is None: assert act_layer is not None se_kwargs['act_layer'] = act_layer return se_kwargs def resolve_act_layer(kwargs, default='relu'): act_layer = kwargs.pop('act_layer', default) if isinstance(act_layer, str): act_layer = get_act_layer(act_layer) return act_layer def make_divisible(v: int, divisor: int = 8, min_value: int = None): min_value = min_value or divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) if new_v < 0.9 * v: # ensure round down does not go down by more than 10%. new_v += divisor return new_v def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None): """Round number of filters based on depth multiplier.""" if not multiplier: return channels channels *= multiplier return make_divisible(channels, divisor, channel_min) def drop_connect(inputs, training: bool = False, drop_connect_rate: float = 0.): """Apply drop connect.""" if not training: return inputs keep_prob = 1 - drop_connect_rate random_tensor = keep_prob + torch.rand( (inputs.size()[0], 1, 1, 1), dtype=inputs.dtype, device=inputs.device) random_tensor.floor_() # binarize output = inputs.div(keep_prob) * random_tensor return output class SqueezeExcite(nn.Module): def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None, act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1): super(SqueezeExcite, self).__init__() reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor) self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) self.act1 = act_layer(inplace=True) self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True) self.gate_fn = gate_fn def forward(self, x): x_se = x.mean((2, 3), keepdim=True) x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) x = x * self.gate_fn(x_se) return x class ConvBnAct(nn.Module): def __init__(self, in_chs, out_chs, kernel_size, stride=1, pad_type='', act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, norm_kwargs=None): super(ConvBnAct, self).__init__() assert stride in [1, 2] norm_kwargs = norm_kwargs or {} self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, padding=pad_type) self.bn1 = norm_layer(out_chs, **norm_kwargs) self.act1 = act_layer(inplace=True) def forward(self, x): x = self.conv(x) x = self.bn1(x) x = self.act1(x) return x class DepthwiseSeparableConv(nn.Module): """ DepthwiseSeparable block Used for DS convs in MobileNet-V1 and in the place of IR blocks with an expansion factor of 1.0. This is an alternative to having a IR with optional first pw conv. """ def __init__(self, in_chs, out_chs, dw_kernel_size=3, stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.): super(DepthwiseSeparableConv, self).__init__() assert stride in [1, 2] norm_kwargs = norm_kwargs or {} self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip self.drop_connect_rate = drop_connect_rate self.conv_dw = select_conv2d( in_chs, in_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True) self.bn1 = norm_layer(in_chs, **norm_kwargs) self.act1 = act_layer(inplace=True) # Squeeze-and-excitation if se_ratio is not None and se_ratio > 0.: se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs) else: self.se = nn.Identity() self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type) self.bn2 = norm_layer(out_chs, **norm_kwargs) self.act2 = act_layer(inplace=True) if pw_act else nn.Identity() def forward(self, x): residual = x x = self.conv_dw(x) x = self.bn1(x) x = self.act1(x) x = self.se(x) x = self.conv_pw(x) x = self.bn2(x) x = self.act2(x) if self.has_residual: if self.drop_connect_rate > 0.: x = drop_connect(x, self.training, self.drop_connect_rate) x += residual return x class InvertedResidual(nn.Module): """ Inverted residual block w/ optional SE""" def __init__(self, in_chs, out_chs, dw_kernel_size=3, stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, conv_kwargs=None, drop_connect_rate=0.): super(InvertedResidual, self).__init__() norm_kwargs = norm_kwargs or {} conv_kwargs = conv_kwargs or {} mid_chs: int = make_divisible(in_chs * exp_ratio) self.has_residual = (in_chs == out_chs and stride == 1) and not noskip self.drop_connect_rate = drop_connect_rate # Point-wise expansion self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs) self.bn1 = norm_layer(mid_chs, **norm_kwargs) self.act1 = act_layer(inplace=True) # Depth-wise convolution self.conv_dw = select_conv2d( mid_chs, mid_chs, dw_kernel_size, stride=stride, padding=pad_type, depthwise=True, **conv_kwargs) self.bn2 = norm_layer(mid_chs, **norm_kwargs) self.act2 = act_layer(inplace=True) # Squeeze-and-excitation if se_ratio is not None and se_ratio > 0.: se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) else: self.se = nn.Identity() # for jit.script compat # Point-wise linear projection self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs) self.bn3 = norm_layer(out_chs, **norm_kwargs) def forward(self, x): residual = x # Point-wise expansion x = self.conv_pw(x) x = self.bn1(x) x = self.act1(x) # Depth-wise convolution x = self.conv_dw(x) x = self.bn2(x) x = self.act2(x) # Squeeze-and-excitation x = self.se(x) # Point-wise linear projection x = self.conv_pwl(x) x = self.bn3(x) if self.has_residual: if self.drop_connect_rate > 0.: x = drop_connect(x, self.training, self.drop_connect_rate) x += residual return x class CondConvResidual(InvertedResidual): """ Inverted residual block w/ CondConv routing""" def __init__(self, in_chs, out_chs, dw_kernel_size=3, stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, num_experts=0, drop_connect_rate=0.): self.num_experts = num_experts conv_kwargs = dict(num_experts=self.num_experts) super(CondConvResidual, self).__init__( in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, pad_type=pad_type, act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size, pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs, norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs, drop_connect_rate=drop_connect_rate) self.routing_fn = nn.Linear(in_chs, self.num_experts) def forward(self, x): residual = x # CondConv routing pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1) routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs)) # Point-wise expansion x = self.conv_pw(x, routing_weights) x = self.bn1(x) x = self.act1(x) # Depth-wise convolution x = self.conv_dw(x, routing_weights) x = self.bn2(x) x = self.act2(x) # Squeeze-and-excitation x = self.se(x) # Point-wise linear projection x = self.conv_pwl(x, routing_weights) x = self.bn3(x) if self.has_residual: if self.drop_connect_rate > 0.: x = drop_connect(x, self.training, self.drop_connect_rate) x += residual return x class EdgeResidual(nn.Module): """ EdgeTPU Residual block with expansion convolution followed by pointwise-linear w/ stride""" def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0, stride=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1, se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.): super(EdgeResidual, self).__init__() norm_kwargs = norm_kwargs or {} mid_chs = make_divisible(fake_in_chs * exp_ratio) if fake_in_chs > 0 else make_divisible(in_chs * exp_ratio) self.has_residual = (in_chs == out_chs and stride == 1) and not noskip self.drop_connect_rate = drop_connect_rate # Expansion convolution self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type) self.bn1 = norm_layer(mid_chs, **norm_kwargs) self.act1 = act_layer(inplace=True) # Squeeze-and-excitation if se_ratio is not None and se_ratio > 0.: se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) else: self.se = nn.Identity() # Point-wise linear projection self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, stride=stride, padding=pad_type) self.bn2 = nn.BatchNorm2d(out_chs, **norm_kwargs) def forward(self, x): residual = x # Expansion convolution x = self.conv_exp(x) x = self.bn1(x) x = self.act1(x) # Squeeze-and-excitation x = self.se(x) # Point-wise linear projection x = self.conv_pwl(x) x = self.bn2(x) if self.has_residual: if self.drop_connect_rate > 0.: x = drop_connect(x, self.training, self.drop_connect_rate) x += residual return x class EfficientNetBuilder: """ Build Trunk Blocks for Efficient/Mobile Networks This ended up being somewhat of a cross between https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py and https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py """ def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None, pad_type='', act_layer=None, se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.): self.channel_multiplier = channel_multiplier self.channel_divisor = channel_divisor self.channel_min = channel_min self.pad_type = pad_type self.act_layer = act_layer self.se_kwargs = se_kwargs self.norm_layer = norm_layer self.norm_kwargs = norm_kwargs self.drop_connect_rate = drop_connect_rate # updated during build self.in_chs = None self.block_idx = 0 self.block_count = 0 def _round_channels(self, chs): return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min) def _make_block(self, ba): bt = ba.pop('block_type') ba['in_chs'] = self.in_chs ba['out_chs'] = self._round_channels(ba['out_chs']) if 'fake_in_chs' in ba and ba['fake_in_chs']: # FIXME this is a hack to work around mismatch in origin impl input filters for EdgeTPU ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs']) ba['norm_layer'] = self.norm_layer ba['norm_kwargs'] = self.norm_kwargs ba['pad_type'] = self.pad_type # block act fn overrides the model default ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer assert ba['act_layer'] is not None if bt == 'ir': ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count ba['se_kwargs'] = self.se_kwargs if ba.get('num_experts', 0) > 0: block = CondConvResidual(**ba) else: block = InvertedResidual(**ba) elif bt == 'ds' or bt == 'dsa': ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count ba['se_kwargs'] = self.se_kwargs block = DepthwiseSeparableConv(**ba) elif bt == 'er': ba['drop_connect_rate'] = self.drop_connect_rate * self.block_idx / self.block_count ba['se_kwargs'] = self.se_kwargs block = EdgeResidual(**ba) elif bt == 'cn': block = ConvBnAct(**ba) else: assert False, 'Uknkown block type (%s) while building model.' % bt self.in_chs = ba['out_chs'] # update in_chs for arg of next block return block def _make_stack(self, stack_args): blocks = [] # each stack (stage) contains a list of block arguments for i, ba in enumerate(stack_args): if i >= 1: # only the first block in any stack can have a stride > 1 ba['stride'] = 1 block = self._make_block(ba) blocks.append(block) self.block_idx += 1 # incr global idx (across all stacks) return nn.Sequential(*blocks) def __call__(self, in_chs, block_args): """ Build the blocks Args: in_chs: Number of input-channels passed to first block block_args: A list of lists, outer list defines stages, inner list contains strings defining block configuration(s) Return: List of block stacks (each stack wrapped in nn.Sequential) """ self.in_chs = in_chs self.block_count = sum([len(x) for x in block_args]) self.block_idx = 0 blocks = [] # outer list of block_args defines the stacks ('stages' by some conventions) for stack_idx, stack in enumerate(block_args): assert isinstance(stack, list) stack = self._make_stack(stack) blocks.append(stack) return blocks def _parse_ksize(ss): if ss.isdigit(): return int(ss) else: return [int(k) for k in ss.split('.')] def _decode_block_str(block_str): """ Decode block definition string Gets a list of block arg (dicts) through a string notation of arguments. E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip All args can exist in any order with the exception of the leading string which is assumed to indicate the block type. leading string - block type ( ir = InvertedResidual, ds = DepthwiseSep, dsa = DeptwhiseSep with pw act, cn = ConvBnAct) r - number of repeat blocks, k - kernel size, s - strides (1-9), e - expansion ratio, c - output channels, se - squeeze/excitation ratio n - activation fn ('re', 'r6', 'hs', or 'sw') Args: block_str: a string representation of block arguments. Returns: A list of block args (dicts) Raises: ValueError: if the string def not properly specified (TODO) """ assert isinstance(block_str, str) ops = block_str.split('_') block_type = ops[0] # take the block type off the front ops = ops[1:] options = {} noskip = False for op in ops: # string options being checked on individual basis, combine if they grow if op == 'noskip': noskip = True elif op.startswith('n'): # activation fn key = op[0] v = op[1:] if v == 're': value = get_act_layer('relu') elif v == 'r6': value = get_act_layer('relu6') elif v == 'hs': value = get_act_layer('hard_swish') elif v == 'sw': value = get_act_layer('swish') else: continue options[key] = value else: # all numeric options splits = re.split(r'(\d.*)', op) if len(splits) >= 2: key, value = splits[:2] options[key] = value # if act_layer is None, the model default (passed to model init) will be used act_layer = options['n'] if 'n' in options else None exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1 pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1 fake_in_chs = int(options['fc']) if 'fc' in options else 0 # FIXME hack to deal with in_chs issue in TPU def num_repeat = int(options['r']) # each type of block has different valid arguments, fill accordingly if block_type == 'ir': block_args = dict( block_type=block_type, dw_kernel_size=_parse_ksize(options['k']), exp_kernel_size=exp_kernel_size, pw_kernel_size=pw_kernel_size, out_chs=int(options['c']), exp_ratio=float(options['e']), se_ratio=float(options['se']) if 'se' in options else None, stride=int(options['s']), act_layer=act_layer, noskip=noskip, ) if 'cc' in options: block_args['num_experts'] = int(options['cc']) elif block_type == 'ds' or block_type == 'dsa': block_args = dict( block_type=block_type, dw_kernel_size=_parse_ksize(options['k']), pw_kernel_size=pw_kernel_size, out_chs=int(options['c']), se_ratio=float(options['se']) if 'se' in options else None, stride=int(options['s']), act_layer=act_layer, pw_act=block_type == 'dsa', noskip=block_type == 'dsa' or noskip, ) elif block_type == 'er': block_args = dict( block_type=block_type, exp_kernel_size=_parse_ksize(options['k']), pw_kernel_size=pw_kernel_size, out_chs=int(options['c']), exp_ratio=float(options['e']), fake_in_chs=fake_in_chs, se_ratio=float(options['se']) if 'se' in options else None, stride=int(options['s']), act_layer=act_layer, noskip=noskip, ) elif block_type == 'cn': block_args = dict( block_type=block_type, kernel_size=int(options['k']), out_chs=int(options['c']), stride=int(options['s']), act_layer=act_layer, ) else: assert False, 'Unknown block type (%s)' % block_type return block_args, num_repeat def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'): """ Per-stage depth scaling Scales the block repeats in each stage. This depth scaling impl maintains compatibility with the EfficientNet scaling method, while allowing sensible scaling for other models that may have multiple block arg definitions in each stage. """ # We scale the total repeat count for each stage, there may be multiple # block arg defs per stage so we need to sum. num_repeat = sum(repeats) if depth_trunc == 'round': # Truncating to int by rounding allows stages with few repeats to remain # proportionally smaller for longer. This is a good choice when stage definitions # include single repeat stages that we'd prefer to keep that way as long as possible num_repeat_scaled = max(1, round(num_repeat * depth_multiplier)) else: # The default for EfficientNet truncates repeats to int via 'ceil'. # Any multiplier > 1.0 will result in an increased depth for every stage. num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier)) # Proportionally distribute repeat count scaling to each block definition in the stage. # Allocation is done in reverse as it results in the first block being less likely to be scaled. # The first block makes less sense to repeat in most of the arch definitions. repeats_scaled = [] for r in repeats[::-1]: rs = max(1, round((r / num_repeat * num_repeat_scaled))) repeats_scaled.append(rs) num_repeat -= r num_repeat_scaled -= rs repeats_scaled = repeats_scaled[::-1] # Apply the calculated scaling to each block arg in the stage sa_scaled = [] for ba, rep in zip(stack_args, repeats_scaled): sa_scaled.extend([deepcopy(ba) for _ in range(rep)]) return sa_scaled def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1, fix_first_last=False): arch_args = [] for stack_idx, block_strings in enumerate(arch_def): assert isinstance(block_strings, list) stack_args = [] repeats = [] for block_str in block_strings: assert isinstance(block_str, str) ba, rep = _decode_block_str(block_str) if ba.get('num_experts', 0) > 0 and experts_multiplier > 1: ba['num_experts'] *= experts_multiplier stack_args.append(ba) repeats.append(rep) if fix_first_last and (stack_idx == 0 or stack_idx == len(arch_def) - 1): arch_args.append(_scale_stage_depth(stack_args, repeats, 1.0, depth_trunc)) else: arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc)) return arch_args def initialize_weight_goog(m, n='', fix_group_fanout=True): # weight init as per Tensorflow Official impl # https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py if isinstance(m, CondConv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels if fix_group_fanout: fan_out //= m.groups init_weight_fn = get_condconv_initializer( lambda w: w.data.normal_(0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) init_weight_fn(m.weight) if m.bias is not None: m.bias.data.zero_() elif isinstance(m, nn.Conv2d): fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels if fix_group_fanout: fan_out //= m.groups m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) if m.bias is not None: m.bias.data.zero_() elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1.0) m.bias.data.zero_() elif isinstance(m, nn.Linear): fan_out = m.weight.size(0) # fan-out fan_in = 0 if 'routing_fn' in n: fan_in = m.weight.size(1) init_range = 1.0 / math.sqrt(fan_in + fan_out) m.weight.data.uniform_(-init_range, init_range) m.bias.data.zero_() def initialize_weight_default(m, n=''): if isinstance(m, CondConv2d): init_fn = get_condconv_initializer(partial( nn.init.kaiming_normal_, mode='fan_out', nonlinearity='relu'), m.num_experts, m.weight_shape) init_fn(m.weight) elif isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1.0) m.bias.data.zero_() elif isinstance(m, nn.Linear): nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='linear') ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/gen_efficientnet.py ================================================ """ Generic Efficient Networks A generic MobileNet class with building blocks to support a variety of models: * EfficientNet (B0-B8, L2 + Tensorflow pretrained AutoAug/RandAug/AdvProp/NoisyStudent ports) - EfficientNet: Rethinking Model Scaling for CNNs - https://arxiv.org/abs/1905.11946 - CondConv: Conditionally Parameterized Convolutions for Efficient Inference - https://arxiv.org/abs/1904.04971 - Adversarial Examples Improve Image Recognition - https://arxiv.org/abs/1911.09665 - Self-training with Noisy Student improves ImageNet classification - https://arxiv.org/abs/1911.04252 * EfficientNet-Lite * MixNet (Small, Medium, and Large) - MixConv: Mixed Depthwise Convolutional Kernels - https://arxiv.org/abs/1907.09595 * MNasNet B1, A1 (SE), Small - MnasNet: Platform-Aware Neural Architecture Search for Mobile - https://arxiv.org/abs/1807.11626 * FBNet-C - FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable NAS - https://arxiv.org/abs/1812.03443 * Single-Path NAS Pixel1 - Single-Path NAS: Designing Hardware-Efficient ConvNets - https://arxiv.org/abs/1904.02877 * And likely more... Hacked together by / Copyright 2020 Ross Wightman """ import torch.nn as nn import torch.nn.functional as F from .config import layer_config_kwargs, is_scriptable from .conv2d_layers import select_conv2d from .helpers import load_pretrained from .efficientnet_builder import * __all__ = ['GenEfficientNet', 'mnasnet_050', 'mnasnet_075', 'mnasnet_100', 'mnasnet_b1', 'mnasnet_140', 'semnasnet_050', 'semnasnet_075', 'semnasnet_100', 'mnasnet_a1', 'semnasnet_140', 'mnasnet_small', 'mobilenetv2_100', 'mobilenetv2_140', 'mobilenetv2_110d', 'mobilenetv2_120d', 'fbnetc_100', 'spnasnet_100', 'efficientnet_b0', 'efficientnet_b1', 'efficientnet_b2', 'efficientnet_b3', 'efficientnet_b4', 'efficientnet_b5', 'efficientnet_b6', 'efficientnet_b7', 'efficientnet_b8', 'efficientnet_l2', 'efficientnet_es', 'efficientnet_em', 'efficientnet_el', 'efficientnet_cc_b0_4e', 'efficientnet_cc_b0_8e', 'efficientnet_cc_b1_8e', 'efficientnet_lite0', 'efficientnet_lite1', 'efficientnet_lite2', 'efficientnet_lite3', 'efficientnet_lite4', 'tf_efficientnet_b0', 'tf_efficientnet_b1', 'tf_efficientnet_b2', 'tf_efficientnet_b3', 'tf_efficientnet_b4', 'tf_efficientnet_b5', 'tf_efficientnet_b6', 'tf_efficientnet_b7', 'tf_efficientnet_b8', 'tf_efficientnet_b0_ap', 'tf_efficientnet_b1_ap', 'tf_efficientnet_b2_ap', 'tf_efficientnet_b3_ap', 'tf_efficientnet_b4_ap', 'tf_efficientnet_b5_ap', 'tf_efficientnet_b6_ap', 'tf_efficientnet_b7_ap', 'tf_efficientnet_b8_ap', 'tf_efficientnet_b0_ns', 'tf_efficientnet_b1_ns', 'tf_efficientnet_b2_ns', 'tf_efficientnet_b3_ns', 'tf_efficientnet_b4_ns', 'tf_efficientnet_b5_ns', 'tf_efficientnet_b6_ns', 'tf_efficientnet_b7_ns', 'tf_efficientnet_l2_ns', 'tf_efficientnet_l2_ns_475', 'tf_efficientnet_es', 'tf_efficientnet_em', 'tf_efficientnet_el', 'tf_efficientnet_cc_b0_4e', 'tf_efficientnet_cc_b0_8e', 'tf_efficientnet_cc_b1_8e', 'tf_efficientnet_lite0', 'tf_efficientnet_lite1', 'tf_efficientnet_lite2', 'tf_efficientnet_lite3', 'tf_efficientnet_lite4', 'mixnet_s', 'mixnet_m', 'mixnet_l', 'mixnet_xl', 'tf_mixnet_s', 'tf_mixnet_m', 'tf_mixnet_l'] model_urls = { 'mnasnet_050': None, 'mnasnet_075': None, 'mnasnet_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_b1-74cb7081.pth', 'mnasnet_140': None, 'mnasnet_small': None, 'semnasnet_050': None, 'semnasnet_075': None, 'semnasnet_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_a1-d9418771.pth', 'semnasnet_140': None, 'mobilenetv2_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_100_ra-b33bc2c4.pth', 'mobilenetv2_110d': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_110d_ra-77090ade.pth', 'mobilenetv2_120d': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_120d_ra-5987e2ed.pth', 'mobilenetv2_140': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_140_ra-21a4e913.pth', 'fbnetc_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetc_100-c345b898.pth', 'spnasnet_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth', 'efficientnet_b0': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth', 'efficientnet_b1': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth', 'efficientnet_b2': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth', 'efficientnet_b3': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth', 'efficientnet_b4': None, 'efficientnet_b5': None, 'efficientnet_b6': None, 'efficientnet_b7': None, 'efficientnet_b8': None, 'efficientnet_l2': None, 'efficientnet_es': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth', 'efficientnet_em': None, 'efficientnet_el': None, 'efficientnet_cc_b0_4e': None, 'efficientnet_cc_b0_8e': None, 'efficientnet_cc_b1_8e': None, 'efficientnet_lite0': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_lite0_ra-37913777.pth', 'efficientnet_lite1': None, 'efficientnet_lite2': None, 'efficientnet_lite3': None, 'efficientnet_lite4': None, 'tf_efficientnet_b0': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_aa-827b6e33.pth', 'tf_efficientnet_b1': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_aa-ea7a6ee0.pth', 'tf_efficientnet_b2': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_aa-60c94f97.pth', 'tf_efficientnet_b3': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_aa-84b4657e.pth', 'tf_efficientnet_b4': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_aa-818f208c.pth', 'tf_efficientnet_b5': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ra-9a3e5369.pth', 'tf_efficientnet_b6': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_aa-80ba17e4.pth', 'tf_efficientnet_b7': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth', 'tf_efficientnet_b8': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ra-572d5dd9.pth', 'tf_efficientnet_b0_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth', 'tf_efficientnet_b1_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ap-44ef0a3d.pth', 'tf_efficientnet_b2_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ap-2f8e7636.pth', 'tf_efficientnet_b3_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ap-aad25bdd.pth', 'tf_efficientnet_b4_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ap-dedb23e6.pth', 'tf_efficientnet_b5_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ap-9e82fae8.pth', 'tf_efficientnet_b6_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ap-4ffb161f.pth', 'tf_efficientnet_b7_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ap-ddb28fec.pth', 'tf_efficientnet_b8_ap': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ap-00e169fa.pth', 'tf_efficientnet_b0_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ns-c0e6a31c.pth', 'tf_efficientnet_b1_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ns-99dd0c41.pth', 'tf_efficientnet_b2_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ns-00306e48.pth', 'tf_efficientnet_b3_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth', 'tf_efficientnet_b4_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ns-d6313a46.pth', 'tf_efficientnet_b5_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ns-6f26d0cf.pth', 'tf_efficientnet_b6_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ns-51548356.pth', 'tf_efficientnet_b7_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth', 'tf_efficientnet_l2_ns_475': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns_475-bebbd00a.pth', 'tf_efficientnet_l2_ns': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth', 'tf_efficientnet_es': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth', 'tf_efficientnet_em': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_em-e78cfe58.pth', 'tf_efficientnet_el': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_el-5143854e.pth', 'tf_efficientnet_cc_b0_4e': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_4e-4362b6b2.pth', 'tf_efficientnet_cc_b0_8e': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_8e-66184a25.pth', 'tf_efficientnet_cc_b1_8e': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b1_8e-f7c79ae1.pth', 'tf_efficientnet_lite0': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite0-0aa007d2.pth', 'tf_efficientnet_lite1': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite1-bde8b488.pth', 'tf_efficientnet_lite2': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite2-dcccb7df.pth', 'tf_efficientnet_lite3': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth', 'tf_efficientnet_lite4': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite4-741542c3.pth', 'mixnet_s': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_s-a907afbc.pth', 'mixnet_m': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_m-4647fc68.pth', 'mixnet_l': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_l-5a9a2ed8.pth', 'mixnet_xl': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl_ra-aac3c00c.pth', 'tf_mixnet_s': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_s-89d3354b.pth', 'tf_mixnet_m': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_m-0f4d8805.pth', 'tf_mixnet_l': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_l-6c92e0c8.pth', } class GenEfficientNet(nn.Module): """ Generic EfficientNets An implementation of mobile optimized networks that covers: * EfficientNet (B0-B8, L2, CondConv, EdgeTPU) * MixNet (Small, Medium, and Large, XL) * MNASNet A1, B1, and small * FBNet C * Single-Path NAS Pixel1 """ def __init__(self, block_args, num_classes=1000, in_chans=3, num_features=1280, stem_size=32, fix_stem=False, channel_multiplier=1.0, channel_divisor=8, channel_min=None, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, weight_init='goog'): super(GenEfficientNet, self).__init__() self.drop_rate = drop_rate if not fix_stem: stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min) self.conv_stem = select_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) self.bn1 = norm_layer(stem_size, **norm_kwargs) self.act1 = act_layer(inplace=True) in_chs = stem_size builder = EfficientNetBuilder( channel_multiplier, channel_divisor, channel_min, pad_type, act_layer, se_kwargs, norm_layer, norm_kwargs, drop_connect_rate) self.blocks = nn.Sequential(*builder(in_chs, block_args)) in_chs = builder.in_chs self.conv_head = select_conv2d(in_chs, num_features, 1, padding=pad_type) self.bn2 = norm_layer(num_features, **norm_kwargs) self.act2 = act_layer(inplace=True) self.global_pool = nn.AdaptiveAvgPool2d(1) self.classifier = nn.Linear(num_features, num_classes) for n, m in self.named_modules(): if weight_init == 'goog': initialize_weight_goog(m, n) else: initialize_weight_default(m, n) def features(self, x): x = self.conv_stem(x) x = self.bn1(x) x = self.act1(x) x = self.blocks(x) x = self.conv_head(x) x = self.bn2(x) x = self.act2(x) return x def as_sequential(self): layers = [self.conv_stem, self.bn1, self.act1] layers.extend(self.blocks) layers.extend([ self.conv_head, self.bn2, self.act2, self.global_pool, nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier]) return nn.Sequential(*layers) def forward(self, x): x = self.features(x) x = self.global_pool(x) x = x.flatten(1) if self.drop_rate > 0.: x = F.dropout(x, p=self.drop_rate, training=self.training) return self.classifier(x) def _create_model(model_kwargs, variant, pretrained=False): as_sequential = model_kwargs.pop('as_sequential', False) model = GenEfficientNet(**model_kwargs) if pretrained: load_pretrained(model, model_urls[variant]) if as_sequential: model = model.as_sequential() return model def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """Creates a mnasnet-a1 model. Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet Paper: https://arxiv.org/pdf/1807.11626.pdf. Args: channel_multiplier: multiplier to number of channels per layer. """ arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s1_e1_c16_noskip'], # stage 1, 112x112 in ['ir_r2_k3_s2_e6_c24'], # stage 2, 56x56 in ['ir_r3_k5_s2_e3_c40_se0.25'], # stage 3, 28x28 in ['ir_r4_k3_s2_e6_c80'], # stage 4, 14x14in ['ir_r2_k3_s1_e6_c112_se0.25'], # stage 5, 14x14in ['ir_r3_k5_s2_e6_c160_se0.25'], # stage 6, 7x7 in ['ir_r1_k3_s1_e6_c320'], ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def), stem_size=32, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'relu'), norm_kwargs=resolve_bn_args(kwargs), **kwargs ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """Creates a mnasnet-b1 model. Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet Paper: https://arxiv.org/pdf/1807.11626.pdf. Args: channel_multiplier: multiplier to number of channels per layer. """ arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s1_c16_noskip'], # stage 1, 112x112 in ['ir_r3_k3_s2_e3_c24'], # stage 2, 56x56 in ['ir_r3_k5_s2_e3_c40'], # stage 3, 28x28 in ['ir_r3_k5_s2_e6_c80'], # stage 4, 14x14in ['ir_r2_k3_s1_e6_c96'], # stage 5, 14x14in ['ir_r4_k5_s2_e6_c192'], # stage 6, 7x7 in ['ir_r1_k3_s1_e6_c320_noskip'] ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def), stem_size=32, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'relu'), norm_kwargs=resolve_bn_args(kwargs), **kwargs ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """Creates a mnasnet-b1 model. Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet Paper: https://arxiv.org/pdf/1807.11626.pdf. Args: channel_multiplier: multiplier to number of channels per layer. """ arch_def = [ ['ds_r1_k3_s1_c8'], ['ir_r1_k3_s2_e3_c16'], ['ir_r2_k3_s2_e6_c16'], ['ir_r4_k5_s2_e6_c32_se0.25'], ['ir_r3_k3_s1_e6_c32_se0.25'], ['ir_r3_k5_s2_e6_c88_se0.25'], ['ir_r1_k3_s1_e6_c144'] ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def), stem_size=8, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'relu'), norm_kwargs=resolve_bn_args(kwargs), **kwargs ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_mobilenet_v2( variant, channel_multiplier=1.0, depth_multiplier=1.0, fix_stem_head=False, pretrained=False, **kwargs): """ Generate MobileNet-V2 network Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py Paper: https://arxiv.org/abs/1801.04381 """ arch_def = [ ['ds_r1_k3_s1_c16'], ['ir_r2_k3_s2_e6_c24'], ['ir_r3_k3_s2_e6_c32'], ['ir_r4_k3_s2_e6_c64'], ['ir_r3_k3_s1_e6_c96'], ['ir_r3_k3_s2_e6_c160'], ['ir_r1_k3_s1_e6_c320'], ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def, depth_multiplier=depth_multiplier, fix_first_last=fix_stem_head), num_features=1280 if fix_stem_head else round_channels(1280, channel_multiplier, 8, None), stem_size=32, fix_stem=fix_stem_head, channel_multiplier=channel_multiplier, norm_kwargs=resolve_bn_args(kwargs), act_layer=nn.ReLU6, **kwargs ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """ FBNet-C Paper: https://arxiv.org/abs/1812.03443 Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py NOTE: the impl above does not relate to the 'C' variant here, that was derived from paper, it was used to confirm some building block details """ arch_def = [ ['ir_r1_k3_s1_e1_c16'], ['ir_r1_k3_s2_e6_c24', 'ir_r2_k3_s1_e1_c24'], ['ir_r1_k5_s2_e6_c32', 'ir_r1_k5_s1_e3_c32', 'ir_r1_k5_s1_e6_c32', 'ir_r1_k3_s1_e6_c32'], ['ir_r1_k5_s2_e6_c64', 'ir_r1_k5_s1_e3_c64', 'ir_r2_k5_s1_e6_c64'], ['ir_r3_k5_s1_e6_c112', 'ir_r1_k5_s1_e3_c112'], ['ir_r4_k5_s2_e6_c184'], ['ir_r1_k3_s1_e6_c352'], ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def), stem_size=16, num_features=1984, # paper suggests this, but is not 100% clear channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'relu'), norm_kwargs=resolve_bn_args(kwargs), **kwargs ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """Creates the Single-Path NAS model from search targeted for Pixel1 phone. Paper: https://arxiv.org/abs/1904.02877 Args: channel_multiplier: multiplier to number of channels per layer. """ arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s1_c16_noskip'], # stage 1, 112x112 in ['ir_r3_k3_s2_e3_c24'], # stage 2, 56x56 in ['ir_r1_k5_s2_e6_c40', 'ir_r3_k3_s1_e3_c40'], # stage 3, 28x28 in ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'], # stage 4, 14x14in ['ir_r1_k5_s1_e6_c96', 'ir_r3_k5_s1_e3_c96'], # stage 5, 14x14in ['ir_r4_k5_s2_e6_c192'], # stage 6, 7x7 in ['ir_r1_k3_s1_e6_c320_noskip'] ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def), stem_size=32, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'relu'), norm_kwargs=resolve_bn_args(kwargs), **kwargs ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): """Creates an EfficientNet model. Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py Paper: https://arxiv.org/abs/1905.11946 EfficientNet params name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 'efficientnet-b8': (2.2, 3.6, 672, 0.5), Args: channel_multiplier: multiplier to number of channels per layer depth_multiplier: multiplier to number of repeats per stage """ arch_def = [ ['ds_r1_k3_s1_e1_c16_se0.25'], ['ir_r2_k3_s2_e6_c24_se0.25'], ['ir_r2_k5_s2_e6_c40_se0.25'], ['ir_r3_k3_s2_e6_c80_se0.25'], ['ir_r3_k5_s1_e6_c112_se0.25'], ['ir_r4_k5_s2_e6_c192_se0.25'], ['ir_r1_k3_s1_e6_c320_se0.25'], ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def, depth_multiplier), num_features=round_channels(1280, channel_multiplier, 8, None), stem_size=32, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'swish'), norm_kwargs=resolve_bn_args(kwargs), **kwargs, ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): arch_def = [ # NOTE `fc` is present to override a mismatch between stem channels and in chs not # present in other models ['er_r1_k3_s1_e4_c24_fc24_noskip'], ['er_r2_k3_s2_e8_c32'], ['er_r4_k3_s2_e8_c48'], ['ir_r5_k5_s2_e8_c96'], ['ir_r4_k5_s1_e8_c144'], ['ir_r2_k5_s2_e8_c192'], ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def, depth_multiplier), num_features=round_channels(1280, channel_multiplier, 8, None), stem_size=32, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'relu'), norm_kwargs=resolve_bn_args(kwargs), **kwargs, ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_efficientnet_condconv( variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs): """Creates an efficientnet-condconv model.""" arch_def = [ ['ds_r1_k3_s1_e1_c16_se0.25'], ['ir_r2_k3_s2_e6_c24_se0.25'], ['ir_r2_k5_s2_e6_c40_se0.25'], ['ir_r3_k3_s2_e6_c80_se0.25'], ['ir_r3_k5_s1_e6_c112_se0.25_cc4'], ['ir_r4_k5_s2_e6_c192_se0.25_cc4'], ['ir_r1_k3_s1_e6_c320_se0.25_cc4'], ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier), num_features=round_channels(1280, channel_multiplier, 8, None), stem_size=32, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'swish'), norm_kwargs=resolve_bn_args(kwargs), **kwargs, ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): """Creates an EfficientNet-Lite model. Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite Paper: https://arxiv.org/abs/1905.11946 EfficientNet params name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) 'efficientnet-lite0': (1.0, 1.0, 224, 0.2), 'efficientnet-lite1': (1.0, 1.1, 240, 0.2), 'efficientnet-lite2': (1.1, 1.2, 260, 0.3), 'efficientnet-lite3': (1.2, 1.4, 280, 0.3), 'efficientnet-lite4': (1.4, 1.8, 300, 0.3), Args: channel_multiplier: multiplier to number of channels per layer depth_multiplier: multiplier to number of repeats per stage """ arch_def = [ ['ds_r1_k3_s1_e1_c16'], ['ir_r2_k3_s2_e6_c24'], ['ir_r2_k5_s2_e6_c40'], ['ir_r3_k3_s2_e6_c80'], ['ir_r3_k5_s1_e6_c112'], ['ir_r4_k5_s2_e6_c192'], ['ir_r1_k3_s1_e6_c320'], ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def, depth_multiplier, fix_first_last=True), num_features=1280, stem_size=32, fix_stem=True, channel_multiplier=channel_multiplier, act_layer=nn.ReLU6, norm_kwargs=resolve_bn_args(kwargs), **kwargs, ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """Creates a MixNet Small model. Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet Paper: https://arxiv.org/abs/1907.09595 """ arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s1_e1_c16'], # relu # stage 1, 112x112 in ['ir_r1_k3_a1.1_p1.1_s2_e6_c24', 'ir_r1_k3_a1.1_p1.1_s1_e3_c24'], # relu # stage 2, 56x56 in ['ir_r1_k3.5.7_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish # stage 3, 28x28 in ['ir_r1_k3.5.7_p1.1_s2_e6_c80_se0.25_nsw', 'ir_r2_k3.5_p1.1_s1_e6_c80_se0.25_nsw'], # swish # stage 4, 14x14in ['ir_r1_k3.5.7_a1.1_p1.1_s1_e6_c120_se0.5_nsw', 'ir_r2_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish # stage 5, 14x14in ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish # 7x7 ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def), num_features=1536, stem_size=16, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'relu'), norm_kwargs=resolve_bn_args(kwargs), **kwargs ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): """Creates a MixNet Medium-Large model. Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet Paper: https://arxiv.org/abs/1907.09595 """ arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s1_e1_c24'], # relu # stage 1, 112x112 in ['ir_r1_k3.5.7_a1.1_p1.1_s2_e6_c32', 'ir_r1_k3_a1.1_p1.1_s1_e3_c32'], # relu # stage 2, 56x56 in ['ir_r1_k3.5.7.9_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish # stage 3, 28x28 in ['ir_r1_k3.5.7_s2_e6_c80_se0.25_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e6_c80_se0.25_nsw'], # swish # stage 4, 14x14in ['ir_r1_k3_s1_e6_c120_se0.5_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish # stage 5, 14x14in ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish # 7x7 ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'), num_features=1536, stem_size=24, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'relu'), norm_kwargs=resolve_bn_args(kwargs), **kwargs ) model = _create_model(model_kwargs, variant, pretrained) return model def mnasnet_050(pretrained=False, **kwargs): """ MNASNet B1, depth multiplier of 0.5. """ model = _gen_mnasnet_b1('mnasnet_050', 0.5, pretrained=pretrained, **kwargs) return model def mnasnet_075(pretrained=False, **kwargs): """ MNASNet B1, depth multiplier of 0.75. """ model = _gen_mnasnet_b1('mnasnet_075', 0.75, pretrained=pretrained, **kwargs) return model def mnasnet_100(pretrained=False, **kwargs): """ MNASNet B1, depth multiplier of 1.0. """ model = _gen_mnasnet_b1('mnasnet_100', 1.0, pretrained=pretrained, **kwargs) return model def mnasnet_b1(pretrained=False, **kwargs): """ MNASNet B1, depth multiplier of 1.0. """ return mnasnet_100(pretrained, **kwargs) def mnasnet_140(pretrained=False, **kwargs): """ MNASNet B1, depth multiplier of 1.4 """ model = _gen_mnasnet_b1('mnasnet_140', 1.4, pretrained=pretrained, **kwargs) return model def semnasnet_050(pretrained=False, **kwargs): """ MNASNet A1 (w/ SE), depth multiplier of 0.5 """ model = _gen_mnasnet_a1('semnasnet_050', 0.5, pretrained=pretrained, **kwargs) return model def semnasnet_075(pretrained=False, **kwargs): """ MNASNet A1 (w/ SE), depth multiplier of 0.75. """ model = _gen_mnasnet_a1('semnasnet_075', 0.75, pretrained=pretrained, **kwargs) return model def semnasnet_100(pretrained=False, **kwargs): """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """ model = _gen_mnasnet_a1('semnasnet_100', 1.0, pretrained=pretrained, **kwargs) return model def mnasnet_a1(pretrained=False, **kwargs): """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """ return semnasnet_100(pretrained, **kwargs) def semnasnet_140(pretrained=False, **kwargs): """ MNASNet A1 (w/ SE), depth multiplier of 1.4. """ model = _gen_mnasnet_a1('semnasnet_140', 1.4, pretrained=pretrained, **kwargs) return model def mnasnet_small(pretrained=False, **kwargs): """ MNASNet Small, depth multiplier of 1.0. """ model = _gen_mnasnet_small('mnasnet_small', 1.0, pretrained=pretrained, **kwargs) return model def mobilenetv2_100(pretrained=False, **kwargs): """ MobileNet V2 w/ 1.0 channel multiplier """ model = _gen_mobilenet_v2('mobilenetv2_100', 1.0, pretrained=pretrained, **kwargs) return model def mobilenetv2_140(pretrained=False, **kwargs): """ MobileNet V2 w/ 1.4 channel multiplier """ model = _gen_mobilenet_v2('mobilenetv2_140', 1.4, pretrained=pretrained, **kwargs) return model def mobilenetv2_110d(pretrained=False, **kwargs): """ MobileNet V2 w/ 1.1 channel, 1.2 depth multipliers""" model = _gen_mobilenet_v2( 'mobilenetv2_110d', 1.1, depth_multiplier=1.2, fix_stem_head=True, pretrained=pretrained, **kwargs) return model def mobilenetv2_120d(pretrained=False, **kwargs): """ MobileNet V2 w/ 1.2 channel, 1.4 depth multipliers """ model = _gen_mobilenet_v2( 'mobilenetv2_120d', 1.2, depth_multiplier=1.4, fix_stem_head=True, pretrained=pretrained, **kwargs) return model def fbnetc_100(pretrained=False, **kwargs): """ FBNet-C """ if pretrained: # pretrained model trained with non-default BN epsilon kwargs['bn_eps'] = BN_EPS_TF_DEFAULT model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs) return model def spnasnet_100(pretrained=False, **kwargs): """ Single-Path NAS Pixel1""" model = _gen_spnasnet('spnasnet_100', 1.0, pretrained=pretrained, **kwargs) return model def efficientnet_b0(pretrained=False, **kwargs): """ EfficientNet-B0 """ # NOTE for train set drop_rate=0.2, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def efficientnet_b1(pretrained=False, **kwargs): """ EfficientNet-B1 """ # NOTE for train set drop_rate=0.2, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) return model def efficientnet_b2(pretrained=False, **kwargs): """ EfficientNet-B2 """ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) return model def efficientnet_b3(pretrained=False, **kwargs): """ EfficientNet-B3 """ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) return model def efficientnet_b4(pretrained=False, **kwargs): """ EfficientNet-B4 """ # NOTE for train set drop_rate=0.4, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) return model def efficientnet_b5(pretrained=False, **kwargs): """ EfficientNet-B5 """ # NOTE for train set drop_rate=0.4, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) return model def efficientnet_b6(pretrained=False, **kwargs): """ EfficientNet-B6 """ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) return model def efficientnet_b7(pretrained=False, **kwargs): """ EfficientNet-B7 """ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) return model def efficientnet_b8(pretrained=False, **kwargs): """ EfficientNet-B8 """ # NOTE for train set drop_rate=0.5, drop_connect_rate=0.2 model = _gen_efficientnet( 'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) return model def efficientnet_l2(pretrained=False, **kwargs): """ EfficientNet-L2. """ # NOTE for train, drop_rate should be 0.5 model = _gen_efficientnet( 'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) return model def efficientnet_es(pretrained=False, **kwargs): """ EfficientNet-Edge Small. """ model = _gen_efficientnet_edge( 'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def efficientnet_em(pretrained=False, **kwargs): """ EfficientNet-Edge-Medium. """ model = _gen_efficientnet_edge( 'efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) return model def efficientnet_el(pretrained=False, **kwargs): """ EfficientNet-Edge-Large. """ model = _gen_efficientnet_edge( 'efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) return model def efficientnet_cc_b0_4e(pretrained=False, **kwargs): """ EfficientNet-CondConv-B0 w/ 8 Experts """ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2 model = _gen_efficientnet_condconv( 'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def efficientnet_cc_b0_8e(pretrained=False, **kwargs): """ EfficientNet-CondConv-B0 w/ 8 Experts """ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2 model = _gen_efficientnet_condconv( 'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2, pretrained=pretrained, **kwargs) return model def efficientnet_cc_b1_8e(pretrained=False, **kwargs): """ EfficientNet-CondConv-B1 w/ 8 Experts """ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2 model = _gen_efficientnet_condconv( 'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2, pretrained=pretrained, **kwargs) return model def efficientnet_lite0(pretrained=False, **kwargs): """ EfficientNet-Lite0 """ model = _gen_efficientnet_lite( 'efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def efficientnet_lite1(pretrained=False, **kwargs): """ EfficientNet-Lite1 """ model = _gen_efficientnet_lite( 'efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) return model def efficientnet_lite2(pretrained=False, **kwargs): """ EfficientNet-Lite2 """ model = _gen_efficientnet_lite( 'efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) return model def efficientnet_lite3(pretrained=False, **kwargs): """ EfficientNet-Lite3 """ model = _gen_efficientnet_lite( 'efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) return model def efficientnet_lite4(pretrained=False, **kwargs): """ EfficientNet-Lite4 """ model = _gen_efficientnet_lite( 'efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b0(pretrained=False, **kwargs): """ EfficientNet-B0 AutoAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b1(pretrained=False, **kwargs): """ EfficientNet-B1 AutoAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b2(pretrained=False, **kwargs): """ EfficientNet-B2 AutoAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b3(pretrained=False, **kwargs): """ EfficientNet-B3 AutoAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b4(pretrained=False, **kwargs): """ EfficientNet-B4 AutoAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b5(pretrained=False, **kwargs): """ EfficientNet-B5 RandAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b6(pretrained=False, **kwargs): """ EfficientNet-B6 AutoAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b7(pretrained=False, **kwargs): """ EfficientNet-B7 RandAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b8(pretrained=False, **kwargs): """ EfficientNet-B8 RandAug. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b0_ap(pretrained=False, **kwargs): """ EfficientNet-B0 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b0_ap', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b1_ap(pretrained=False, **kwargs): """ EfficientNet-B1 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b1_ap', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b2_ap(pretrained=False, **kwargs): """ EfficientNet-B2 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b2_ap', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b3_ap(pretrained=False, **kwargs): """ EfficientNet-B3 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b3_ap', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b4_ap(pretrained=False, **kwargs): """ EfficientNet-B4 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b4_ap', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b5_ap(pretrained=False, **kwargs): """ EfficientNet-B5 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b5_ap', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b6_ap(pretrained=False, **kwargs): """ EfficientNet-B6 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ # NOTE for train, drop_rate should be 0.5 kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b6_ap', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b7_ap(pretrained=False, **kwargs): """ EfficientNet-B7 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ # NOTE for train, drop_rate should be 0.5 kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b7_ap', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b8_ap(pretrained=False, **kwargs): """ EfficientNet-B8 AdvProp. Tensorflow compatible variant Paper: Adversarial Examples Improve Image Recognition (https://arxiv.org/abs/1911.09665) """ # NOTE for train, drop_rate should be 0.5 kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b8_ap', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b0_ns(pretrained=False, **kwargs): """ EfficientNet-B0 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b0_ns', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b1_ns(pretrained=False, **kwargs): """ EfficientNet-B1 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b1_ns', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b2_ns(pretrained=False, **kwargs): """ EfficientNet-B2 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b2_ns', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b3_ns(pretrained=False, **kwargs): """ EfficientNet-B3 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b3_ns', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b4_ns(pretrained=False, **kwargs): """ EfficientNet-B4 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b4_ns', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b5_ns(pretrained=False, **kwargs): """ EfficientNet-B5 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b5_ns', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b6_ns(pretrained=False, **kwargs): """ EfficientNet-B6 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ # NOTE for train, drop_rate should be 0.5 kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b6_ns', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) return model def tf_efficientnet_b7_ns(pretrained=False, **kwargs): """ EfficientNet-B7 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ # NOTE for train, drop_rate should be 0.5 kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_b7_ns', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) return model def tf_efficientnet_l2_ns_475(pretrained=False, **kwargs): """ EfficientNet-L2 NoisyStudent @ 475x475. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ # NOTE for train, drop_rate should be 0.5 kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_l2_ns_475', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) return model def tf_efficientnet_l2_ns(pretrained=False, **kwargs): """ EfficientNet-L2 NoisyStudent. Tensorflow compatible variant Paper: Self-training with Noisy Student improves ImageNet classification (https://arxiv.org/abs/1911.04252) """ # NOTE for train, drop_rate should be 0.5 kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet( 'tf_efficientnet_l2_ns', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) return model def tf_efficientnet_es(pretrained=False, **kwargs): """ EfficientNet-Edge Small. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_edge( 'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def tf_efficientnet_em(pretrained=False, **kwargs): """ EfficientNet-Edge-Medium. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_edge( 'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) return model def tf_efficientnet_el(pretrained=False, **kwargs): """ EfficientNet-Edge-Large. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_edge( 'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) return model def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs): """ EfficientNet-CondConv-B0 w/ 4 Experts """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_condconv( 'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs): """ EfficientNet-CondConv-B0 w/ 8 Experts """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_condconv( 'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs): """ EfficientNet-CondConv-B1 w/ 8 Experts """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_condconv( 'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_lite0(pretrained=False, **kwargs): """ EfficientNet-Lite0. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_lite( 'tf_efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) return model def tf_efficientnet_lite1(pretrained=False, **kwargs): """ EfficientNet-Lite1. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_lite( 'tf_efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) return model def tf_efficientnet_lite2(pretrained=False, **kwargs): """ EfficientNet-Lite2. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_lite( 'tf_efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) return model def tf_efficientnet_lite3(pretrained=False, **kwargs): """ EfficientNet-Lite3. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_lite( 'tf_efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) return model def tf_efficientnet_lite4(pretrained=False, **kwargs): """ EfficientNet-Lite4. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_efficientnet_lite( 'tf_efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) return model def mixnet_s(pretrained=False, **kwargs): """Creates a MixNet Small model. """ # NOTE for train set drop_rate=0.2 model = _gen_mixnet_s( 'mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs) return model def mixnet_m(pretrained=False, **kwargs): """Creates a MixNet Medium model. """ # NOTE for train set drop_rate=0.25 model = _gen_mixnet_m( 'mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs) return model def mixnet_l(pretrained=False, **kwargs): """Creates a MixNet Large model. """ # NOTE for train set drop_rate=0.25 model = _gen_mixnet_m( 'mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs) return model def mixnet_xl(pretrained=False, **kwargs): """Creates a MixNet Extra-Large model. Not a paper spec, experimental def by RW w/ depth scaling. """ # NOTE for train set drop_rate=0.25, drop_connect_rate=0.2 model = _gen_mixnet_m( 'mixnet_xl', channel_multiplier=1.6, depth_multiplier=1.2, pretrained=pretrained, **kwargs) return model def mixnet_xxl(pretrained=False, **kwargs): """Creates a MixNet Double Extra Large model. Not a paper spec, experimental def by RW w/ depth scaling. """ # NOTE for train set drop_rate=0.3, drop_connect_rate=0.2 model = _gen_mixnet_m( 'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs) return model def tf_mixnet_s(pretrained=False, **kwargs): """Creates a MixNet Small model. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mixnet_s( 'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs) return model def tf_mixnet_m(pretrained=False, **kwargs): """Creates a MixNet Medium model. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mixnet_m( 'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs) return model def tf_mixnet_l(pretrained=False, **kwargs): """Creates a MixNet Large model. Tensorflow compatible variant """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mixnet_m( 'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs) return model ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/helpers.py ================================================ """ Checkpoint loading / state_dict helpers Copyright 2020 Ross Wightman """ import torch import os from collections import OrderedDict try: from torch.hub import load_state_dict_from_url except ImportError: from torch.utils.model_zoo import load_url as load_state_dict_from_url def load_checkpoint(model, checkpoint_path): if checkpoint_path and os.path.isfile(checkpoint_path): print("=> Loading checkpoint '{}'".format(checkpoint_path)) checkpoint = torch.load(checkpoint_path) if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: new_state_dict = OrderedDict() for k, v in checkpoint['state_dict'].items(): if k.startswith('module'): name = k[7:] # remove `module.` else: name = k new_state_dict[name] = v model.load_state_dict(new_state_dict) else: model.load_state_dict(checkpoint) print("=> Loaded checkpoint '{}'".format(checkpoint_path)) else: print("=> Error: No checkpoint found at '{}'".format(checkpoint_path)) raise FileNotFoundError() def load_pretrained(model, url, filter_fn=None, strict=True): if not url: print("=> Warning: Pretrained model URL is empty, using random initialization.") return state_dict = load_state_dict_from_url(url, progress=False, map_location='cpu') input_conv = 'conv_stem' classifier = 'classifier' in_chans = getattr(model, input_conv).weight.shape[1] num_classes = getattr(model, classifier).weight.shape[0] input_conv_weight = input_conv + '.weight' pretrained_in_chans = state_dict[input_conv_weight].shape[1] if in_chans != pretrained_in_chans: if in_chans == 1: print('=> Converting pretrained input conv {} from {} to 1 channel'.format( input_conv_weight, pretrained_in_chans)) conv1_weight = state_dict[input_conv_weight] state_dict[input_conv_weight] = conv1_weight.sum(dim=1, keepdim=True) else: print('=> Discarding pretrained input conv {} since input channel count != {}'.format( input_conv_weight, pretrained_in_chans)) del state_dict[input_conv_weight] strict = False classifier_weight = classifier + '.weight' pretrained_num_classes = state_dict[classifier_weight].shape[0] if num_classes != pretrained_num_classes: print('=> Discarding pretrained classifier since num_classes != {}'.format(pretrained_num_classes)) del state_dict[classifier_weight] del state_dict[classifier + '.bias'] strict = False if filter_fn is not None: state_dict = filter_fn(state_dict) model.load_state_dict(state_dict, strict=strict) ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/mobilenetv3.py ================================================ """ MobileNet-V3 A PyTorch impl of MobileNet-V3, compatible with TF weights from official impl. Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244 Hacked together by / Copyright 2020 Ross Wightman """ import torch.nn as nn import torch.nn.functional as F from .activations import get_act_fn, get_act_layer, HardSwish from .config import layer_config_kwargs from .conv2d_layers import select_conv2d from .helpers import load_pretrained from .efficientnet_builder import * __all__ = ['mobilenetv3_rw', 'mobilenetv3_large_075', 'mobilenetv3_large_100', 'mobilenetv3_large_minimal_100', 'mobilenetv3_small_075', 'mobilenetv3_small_100', 'mobilenetv3_small_minimal_100', 'tf_mobilenetv3_large_075', 'tf_mobilenetv3_large_100', 'tf_mobilenetv3_large_minimal_100', 'tf_mobilenetv3_small_075', 'tf_mobilenetv3_small_100', 'tf_mobilenetv3_small_minimal_100'] model_urls = { 'mobilenetv3_rw': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth', 'mobilenetv3_large_075': None, 'mobilenetv3_large_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_large_100_ra-f55367f5.pth', 'mobilenetv3_large_minimal_100': None, 'mobilenetv3_small_075': None, 'mobilenetv3_small_100': None, 'mobilenetv3_small_minimal_100': None, 'tf_mobilenetv3_large_075': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_075-150ee8b0.pth', 'tf_mobilenetv3_large_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_100-427764d5.pth', 'tf_mobilenetv3_large_minimal_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_minimal_100-8596ae28.pth', 'tf_mobilenetv3_small_075': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_075-da427f52.pth', 'tf_mobilenetv3_small_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_100-37f49e2b.pth', 'tf_mobilenetv3_small_minimal_100': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth', } class MobileNetV3(nn.Module): """ MobileNet-V3 A this model utilizes the MobileNet-v3 specific 'efficient head', where global pooling is done before the head convolution without a final batch-norm layer before the classifier. Paper: https://arxiv.org/abs/1905.02244 """ def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True, channel_multiplier=1.0, pad_type='', act_layer=HardSwish, drop_rate=0., drop_connect_rate=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, weight_init='goog'): super(MobileNetV3, self).__init__() self.drop_rate = drop_rate stem_size = round_channels(stem_size, channel_multiplier) self.conv_stem = select_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) self.bn1 = nn.BatchNorm2d(stem_size, **norm_kwargs) self.act1 = act_layer(inplace=True) in_chs = stem_size builder = EfficientNetBuilder( channel_multiplier, pad_type=pad_type, act_layer=act_layer, se_kwargs=se_kwargs, norm_layer=norm_layer, norm_kwargs=norm_kwargs, drop_connect_rate=drop_connect_rate) self.blocks = nn.Sequential(*builder(in_chs, block_args)) in_chs = builder.in_chs self.global_pool = nn.AdaptiveAvgPool2d(1) self.conv_head = select_conv2d(in_chs, num_features, 1, padding=pad_type, bias=head_bias) self.act2 = act_layer(inplace=True) self.classifier = nn.Linear(num_features, num_classes) for m in self.modules(): if weight_init == 'goog': initialize_weight_goog(m) else: initialize_weight_default(m) def as_sequential(self): layers = [self.conv_stem, self.bn1, self.act1] layers.extend(self.blocks) layers.extend([ self.global_pool, self.conv_head, self.act2, nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier]) return nn.Sequential(*layers) def features(self, x): x = self.conv_stem(x) x = self.bn1(x) x = self.act1(x) x = self.blocks(x) x = self.global_pool(x) x = self.conv_head(x) x = self.act2(x) return x def forward(self, x): x = self.features(x) x = x.flatten(1) if self.drop_rate > 0.: x = F.dropout(x, p=self.drop_rate, training=self.training) return self.classifier(x) def _create_model(model_kwargs, variant, pretrained=False): as_sequential = model_kwargs.pop('as_sequential', False) model = MobileNetV3(**model_kwargs) if pretrained and model_urls[variant]: load_pretrained(model, model_urls[variant]) if as_sequential: model = model.as_sequential() return model def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """Creates a MobileNet-V3 model (RW variant). Paper: https://arxiv.org/abs/1905.02244 This was my first attempt at reproducing the MobileNet-V3 from paper alone. It came close to the eventual Tensorflow reference impl but has a few differences: 1. This model has no bias on the head convolution 2. This model forces no residual (noskip) on the first DWS block, this is different than MnasNet 3. This model always uses ReLU for the SE activation layer, other models in the family inherit their act layer from their parent block 4. This model does not enforce divisible by 8 limitation on the SE reduction channel count Overall the changes are fairly minor and result in a very small parameter count difference and no top-1/5 Args: channel_multiplier: multiplier to number of channels per layer. """ arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s1_e1_c16_nre_noskip'], # relu # stage 1, 112x112 in ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu # stage 2, 56x56 in ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu # stage 3, 28x28 in ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish # stage 4, 14x14in ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish # stage 5, 14x14in ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish # stage 6, 7x7 in ['cn_r1_k1_s1_c960'], # hard-swish ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def), head_bias=False, # one of my mistakes channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, 'hard_swish'), se_kwargs=dict(gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True), norm_kwargs=resolve_bn_args(kwargs), **kwargs, ) model = _create_model(model_kwargs, variant, pretrained) return model def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs): """Creates a MobileNet-V3 large/small/minimal models. Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v3.py Paper: https://arxiv.org/abs/1905.02244 Args: channel_multiplier: multiplier to number of channels per layer. """ if 'small' in variant: num_features = 1024 if 'minimal' in variant: act_layer = 'relu' arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s2_e1_c16'], # stage 1, 56x56 in ['ir_r1_k3_s2_e4.5_c24', 'ir_r1_k3_s1_e3.67_c24'], # stage 2, 28x28 in ['ir_r1_k3_s2_e4_c40', 'ir_r2_k3_s1_e6_c40'], # stage 3, 14x14 in ['ir_r2_k3_s1_e3_c48'], # stage 4, 14x14in ['ir_r3_k3_s2_e6_c96'], # stage 6, 7x7 in ['cn_r1_k1_s1_c576'], ] else: act_layer = 'hard_swish' arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s2_e1_c16_se0.25_nre'], # relu # stage 1, 56x56 in ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'], # relu # stage 2, 28x28 in ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r2_k5_s1_e6_c40_se0.25'], # hard-swish # stage 3, 14x14 in ['ir_r2_k5_s1_e3_c48_se0.25'], # hard-swish # stage 4, 14x14in ['ir_r3_k5_s2_e6_c96_se0.25'], # hard-swish # stage 6, 7x7 in ['cn_r1_k1_s1_c576'], # hard-swish ] else: num_features = 1280 if 'minimal' in variant: act_layer = 'relu' arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s1_e1_c16'], # stage 1, 112x112 in ['ir_r1_k3_s2_e4_c24', 'ir_r1_k3_s1_e3_c24'], # stage 2, 56x56 in ['ir_r3_k3_s2_e3_c40'], # stage 3, 28x28 in ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # stage 4, 14x14in ['ir_r2_k3_s1_e6_c112'], # stage 5, 14x14in ['ir_r3_k3_s2_e6_c160'], # stage 6, 7x7 in ['cn_r1_k1_s1_c960'], ] else: act_layer = 'hard_swish' arch_def = [ # stage 0, 112x112 in ['ds_r1_k3_s1_e1_c16_nre'], # relu # stage 1, 112x112 in ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu # stage 2, 56x56 in ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu # stage 3, 28x28 in ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish # stage 4, 14x14in ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish # stage 5, 14x14in ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish # stage 6, 7x7 in ['cn_r1_k1_s1_c960'], # hard-swish ] with layer_config_kwargs(kwargs): model_kwargs = dict( block_args=decode_arch_def(arch_def), num_features=num_features, stem_size=16, channel_multiplier=channel_multiplier, act_layer=resolve_act_layer(kwargs, act_layer), se_kwargs=dict( act_layer=get_act_layer('relu'), gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True, divisor=8), norm_kwargs=resolve_bn_args(kwargs), **kwargs, ) model = _create_model(model_kwargs, variant, pretrained) return model def mobilenetv3_rw(pretrained=False, **kwargs): """ MobileNet-V3 RW Attn: See note in gen function for this variant. """ # NOTE for train set drop_rate=0.2 if pretrained: # pretrained model trained with non-default BN epsilon kwargs['bn_eps'] = BN_EPS_TF_DEFAULT model = _gen_mobilenet_v3_rw('mobilenetv3_rw', 1.0, pretrained=pretrained, **kwargs) return model def mobilenetv3_large_075(pretrained=False, **kwargs): """ MobileNet V3 Large 0.75""" # NOTE for train set drop_rate=0.2 model = _gen_mobilenet_v3('mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs) return model def mobilenetv3_large_100(pretrained=False, **kwargs): """ MobileNet V3 Large 1.0 """ # NOTE for train set drop_rate=0.2 model = _gen_mobilenet_v3('mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs) return model def mobilenetv3_large_minimal_100(pretrained=False, **kwargs): """ MobileNet V3 Large (Minimalistic) 1.0 """ # NOTE for train set drop_rate=0.2 model = _gen_mobilenet_v3('mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs) return model def mobilenetv3_small_075(pretrained=False, **kwargs): """ MobileNet V3 Small 0.75 """ model = _gen_mobilenet_v3('mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs) return model def mobilenetv3_small_100(pretrained=False, **kwargs): """ MobileNet V3 Small 1.0 """ model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs) return model def mobilenetv3_small_minimal_100(pretrained=False, **kwargs): """ MobileNet V3 Small (Minimalistic) 1.0 """ model = _gen_mobilenet_v3('mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs) return model def tf_mobilenetv3_large_075(pretrained=False, **kwargs): """ MobileNet V3 Large 0.75. Tensorflow compat variant. """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mobilenet_v3('tf_mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs) return model def tf_mobilenetv3_large_100(pretrained=False, **kwargs): """ MobileNet V3 Large 1.0. Tensorflow compat variant. """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mobilenet_v3('tf_mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs) return model def tf_mobilenetv3_large_minimal_100(pretrained=False, **kwargs): """ MobileNet V3 Large Minimalistic 1.0. Tensorflow compat variant. """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mobilenet_v3('tf_mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs) return model def tf_mobilenetv3_small_075(pretrained=False, **kwargs): """ MobileNet V3 Small 0.75. Tensorflow compat variant. """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mobilenet_v3('tf_mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs) return model def tf_mobilenetv3_small_100(pretrained=False, **kwargs): """ MobileNet V3 Small 1.0. Tensorflow compat variant.""" kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mobilenet_v3('tf_mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs) return model def tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs): """ MobileNet V3 Small Minimalistic 1.0. Tensorflow compat variant. """ kwargs['bn_eps'] = BN_EPS_TF_DEFAULT kwargs['pad_type'] = 'same' model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs) return model ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/model_factory.py ================================================ from .config import set_layer_config from .helpers import load_checkpoint from .gen_efficientnet import * from .mobilenetv3 import * def create_model( model_name='mnasnet_100', pretrained=None, num_classes=1000, in_chans=3, checkpoint_path='', **kwargs): model_kwargs = dict(num_classes=num_classes, in_chans=in_chans, pretrained=pretrained, **kwargs) if model_name in globals(): create_fn = globals()[model_name] model = create_fn(**model_kwargs) else: raise RuntimeError('Unknown model (%s)' % model_name) if checkpoint_path and not pretrained: load_checkpoint(model, checkpoint_path) return model ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/geffnet/version.py ================================================ __version__ = '1.0.2' ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/hubconf.py ================================================ dependencies = ['torch', 'math'] from geffnet import efficientnet_b0 from geffnet import efficientnet_b1 from geffnet import efficientnet_b2 from geffnet import efficientnet_b3 from geffnet import efficientnet_es from geffnet import efficientnet_lite0 from geffnet import mixnet_s from geffnet import mixnet_m from geffnet import mixnet_l from geffnet import mixnet_xl from geffnet import mobilenetv2_100 from geffnet import mobilenetv2_110d from geffnet import mobilenetv2_120d from geffnet import mobilenetv2_140 from geffnet import mobilenetv3_large_100 from geffnet import mobilenetv3_rw from geffnet import mnasnet_a1 from geffnet import mnasnet_b1 from geffnet import fbnetc_100 from geffnet import spnasnet_100 from geffnet import tf_efficientnet_b0 from geffnet import tf_efficientnet_b1 from geffnet import tf_efficientnet_b2 from geffnet import tf_efficientnet_b3 from geffnet import tf_efficientnet_b4 from geffnet import tf_efficientnet_b5 from geffnet import tf_efficientnet_b6 from geffnet import tf_efficientnet_b7 from geffnet import tf_efficientnet_b8 from geffnet import tf_efficientnet_b0_ap from geffnet import tf_efficientnet_b1_ap from geffnet import tf_efficientnet_b2_ap from geffnet import tf_efficientnet_b3_ap from geffnet import tf_efficientnet_b4_ap from geffnet import tf_efficientnet_b5_ap from geffnet import tf_efficientnet_b6_ap from geffnet import tf_efficientnet_b7_ap from geffnet import tf_efficientnet_b8_ap from geffnet import tf_efficientnet_b0_ns from geffnet import tf_efficientnet_b1_ns from geffnet import tf_efficientnet_b2_ns from geffnet import tf_efficientnet_b3_ns from geffnet import tf_efficientnet_b4_ns from geffnet import tf_efficientnet_b5_ns from geffnet import tf_efficientnet_b6_ns from geffnet import tf_efficientnet_b7_ns from geffnet import tf_efficientnet_l2_ns_475 from geffnet import tf_efficientnet_l2_ns from geffnet import tf_efficientnet_es from geffnet import tf_efficientnet_em from geffnet import tf_efficientnet_el from geffnet import tf_efficientnet_cc_b0_4e from geffnet import tf_efficientnet_cc_b0_8e from geffnet import tf_efficientnet_cc_b1_8e from geffnet import tf_efficientnet_lite0 from geffnet import tf_efficientnet_lite1 from geffnet import tf_efficientnet_lite2 from geffnet import tf_efficientnet_lite3 from geffnet import tf_efficientnet_lite4 from geffnet import tf_mixnet_s from geffnet import tf_mixnet_m from geffnet import tf_mixnet_l from geffnet import tf_mobilenetv3_large_075 from geffnet import tf_mobilenetv3_large_100 from geffnet import tf_mobilenetv3_large_minimal_100 from geffnet import tf_mobilenetv3_small_075 from geffnet import tf_mobilenetv3_small_100 from geffnet import tf_mobilenetv3_small_minimal_100 ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/onnx_export.py ================================================ """ ONNX export script Export PyTorch models as ONNX graphs. This export script originally started as an adaptation of code snippets found at https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html The default parameters work with PyTorch 1.6 and ONNX 1.7 and produce an optimal ONNX graph for hosting in the ONNX runtime (see onnx_validate.py). To export an ONNX model compatible with caffe2 (see caffe2_benchmark.py and caffe2_validate.py), the --keep-init and --aten-fallback flags are currently required. Older versions of PyTorch/ONNX (tested PyTorch 1.4, ONNX 1.5) do not need extra flags for caffe2 compatibility, but they produce a model that isn't as fast running on ONNX runtime. Most new release of PyTorch and ONNX cause some sort of breakage in the export / usage of ONNX models. Please do your research and search ONNX and PyTorch issue tracker before asking me. Thanks. Copyright 2020 Ross Wightman """ import argparse import torch import numpy as np import onnx import geffnet parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation') parser.add_argument('output', metavar='ONNX_FILE', help='output model filename') parser.add_argument('--model', '-m', metavar='MODEL', default='mobilenetv3_large_100', help='model architecture (default: mobilenetv3_large_100)') parser.add_argument('--opset', type=int, default=10, help='ONNX opset to use (default: 10)') parser.add_argument('--keep-init', action='store_true', default=False, help='Keep initializers as input. Needed for Caffe2 compatible export in newer PyTorch/ONNX.') parser.add_argument('--aten-fallback', action='store_true', default=False, help='Fallback to ATEN ops. Helps fix AdaptiveAvgPool issue with Caffe2 in newer PyTorch/ONNX.') parser.add_argument('--dynamic-size', action='store_true', default=False, help='Export model width dynamic width/height. Not recommended for "tf" models with SAME padding.') parser.add_argument('-b', '--batch-size', default=1, type=int, metavar='N', help='mini-batch size (default: 1)') parser.add_argument('--img-size', default=None, type=int, metavar='N', help='Input image dimension, uses model default if empty') parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', help='Override mean pixel value of dataset') parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', help='Override std deviation of of dataset') parser.add_argument('--num-classes', type=int, default=1000, help='Number classes in dataset') parser.add_argument('--checkpoint', default='', type=str, metavar='PATH', help='path to checkpoint (default: none)') def main(): args = parser.parse_args() args.pretrained = True if args.checkpoint: args.pretrained = False print("==> Creating PyTorch {} model".format(args.model)) # NOTE exportable=True flag disables autofn/jit scripted activations and uses Conv2dSameExport layers # for models using SAME padding model = geffnet.create_model( args.model, num_classes=args.num_classes, in_chans=3, pretrained=args.pretrained, checkpoint_path=args.checkpoint, exportable=True) model.eval() example_input = torch.randn((args.batch_size, 3, args.img_size or 224, args.img_size or 224), requires_grad=True) # Run model once before export trace, sets padding for models with Conv2dSameExport. This means # that the padding for models with Conv2dSameExport (most models with tf_ prefix) is fixed for # the input img_size specified in this script. # Opset >= 11 should allow for dynamic padding, however I cannot get it to work due to # issues in the tracing of the dynamic padding or errors attempting to export the model after jit # scripting it (an approach that should work). Perhaps in a future PyTorch or ONNX versions... model(example_input) print("==> Exporting model to ONNX format at '{}'".format(args.output)) input_names = ["input0"] output_names = ["output0"] dynamic_axes = {'input0': {0: 'batch'}, 'output0': {0: 'batch'}} if args.dynamic_size: dynamic_axes['input0'][2] = 'height' dynamic_axes['input0'][3] = 'width' if args.aten_fallback: export_type = torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK else: export_type = torch.onnx.OperatorExportTypes.ONNX torch_out = torch.onnx._export( model, example_input, args.output, export_params=True, verbose=True, input_names=input_names, output_names=output_names, keep_initializers_as_inputs=args.keep_init, dynamic_axes=dynamic_axes, opset_version=args.opset, operator_export_type=export_type) print("==> Loading and checking exported model from '{}'".format(args.output)) onnx_model = onnx.load(args.output) onnx.checker.check_model(onnx_model) # assuming throw on error print("==> Passed") if args.keep_init and args.aten_fallback: import caffe2.python.onnx.backend as onnx_caffe2 # Caffe2 loading only works properly in newer PyTorch/ONNX combos when # keep_initializers_as_inputs and aten_fallback are set to True. print("==> Loading model into Caffe2 backend and comparing forward pass.".format(args.output)) caffe2_backend = onnx_caffe2.prepare(onnx_model) B = {onnx_model.graph.input[0].name: x.data.numpy()} c2_out = caffe2_backend.run(B)[0] np.testing.assert_almost_equal(torch_out.data.numpy(), c2_out, decimal=5) print("==> Passed") if __name__ == '__main__': main() ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/onnx_optimize.py ================================================ """ ONNX optimization script Run ONNX models through the optimizer to prune unneeded nodes, fuse batchnorm layers into conv, etc. NOTE: This isn't working consistently in recent PyTorch/ONNX combos (ie PyTorch 1.6 and ONNX 1.7), it seems time to switch to using the onnxruntime online optimizer (can also be saved for offline). Copyright 2020 Ross Wightman """ import argparse import warnings import onnx from onnx import optimizer parser = argparse.ArgumentParser(description="Optimize ONNX model") parser.add_argument("model", help="The ONNX model") parser.add_argument("--output", required=True, help="The optimized model output filename") def traverse_graph(graph, prefix=''): content = [] indent = prefix + ' ' graphs = [] num_nodes = 0 for node in graph.node: pn, gs = onnx.helper.printable_node(node, indent, subgraphs=True) assert isinstance(gs, list) content.append(pn) graphs.extend(gs) num_nodes += 1 for g in graphs: g_count, g_str = traverse_graph(g) content.append('\n' + g_str) num_nodes += g_count return num_nodes, '\n'.join(content) def main(): args = parser.parse_args() onnx_model = onnx.load(args.model) num_original_nodes, original_graph_str = traverse_graph(onnx_model.graph) # Optimizer passes to perform passes = [ #'eliminate_deadend', 'eliminate_identity', 'eliminate_nop_dropout', 'eliminate_nop_pad', 'eliminate_nop_transpose', 'eliminate_unused_initializer', 'extract_constant_to_initializer', 'fuse_add_bias_into_conv', 'fuse_bn_into_conv', 'fuse_consecutive_concats', 'fuse_consecutive_reduce_unsqueeze', 'fuse_consecutive_squeezes', 'fuse_consecutive_transposes', #'fuse_matmul_add_bias_into_gemm', 'fuse_pad_into_conv', #'fuse_transpose_into_gemm', #'lift_lexical_references', ] # Apply the optimization on the original serialized model # WARNING I've had issues with optimizer in recent versions of PyTorch / ONNX causing # 'duplicate definition of name' errors, see: https://github.com/onnx/onnx/issues/2401 # It may be better to rely on onnxruntime optimizations, see onnx_validate.py script. warnings.warn("I've had issues with optimizer in recent versions of PyTorch / ONNX." "Try onnxruntime optimization if this doesn't work.") optimized_model = optimizer.optimize(onnx_model, passes) num_optimized_nodes, optimzied_graph_str = traverse_graph(optimized_model.graph) print('==> The model after optimization:\n{}\n'.format(optimzied_graph_str)) print('==> The optimized model has {} nodes, the original had {}.'.format(num_optimized_nodes, num_original_nodes)) # Save the ONNX model onnx.save(optimized_model, args.output) if __name__ == "__main__": main() ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/onnx_to_caffe.py ================================================ import argparse import onnx from caffe2.python.onnx.backend import Caffe2Backend parser = argparse.ArgumentParser(description="Convert ONNX to Caffe2") parser.add_argument("model", help="The ONNX model") parser.add_argument("--c2-prefix", required=True, help="The output file prefix for the caffe2 model init and predict file. ") def main(): args = parser.parse_args() onnx_model = onnx.load(args.model) caffe2_init, caffe2_predict = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model) caffe2_init_str = caffe2_init.SerializeToString() with open(args.c2_prefix + '.init.pb', "wb") as f: f.write(caffe2_init_str) caffe2_predict_str = caffe2_predict.SerializeToString() with open(args.c2_prefix + '.predict.pb', "wb") as f: f.write(caffe2_predict_str) if __name__ == "__main__": main() ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/onnx_validate.py ================================================ """ ONNX-runtime validation script This script was created to verify accuracy and performance of exported ONNX models running with the onnxruntime. It utilizes the PyTorch dataloader/processing pipeline for a fair comparison against the originals. Copyright 2020 Ross Wightman """ import argparse import numpy as np import onnxruntime from data import create_loader, resolve_data_config, Dataset from utils import AverageMeter import time parser = argparse.ArgumentParser(description='Caffe2 ImageNet Validation') parser.add_argument('data', metavar='DIR', help='path to dataset') parser.add_argument('--onnx-input', default='', type=str, metavar='PATH', help='path to onnx model/weights file') parser.add_argument('--onnx-output-opt', default='', type=str, metavar='PATH', help='path to output optimized onnx graph') parser.add_argument('--profile', action='store_true', default=False, help='Enable profiler output.') parser.add_argument('-j', '--workers', default=2, type=int, metavar='N', help='number of data loading workers (default: 2)') parser.add_argument('-b', '--batch-size', default=256, type=int, metavar='N', help='mini-batch size (default: 256)') parser.add_argument('--img-size', default=None, type=int, metavar='N', help='Input image dimension, uses model default if empty') parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', help='Override mean pixel value of dataset') parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', help='Override std deviation of of dataset') parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT', help='Override default crop pct of 0.875') parser.add_argument('--interpolation', default='', type=str, metavar='NAME', help='Image resize interpolation type (overrides model)') parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true', help='use tensorflow mnasnet preporcessing') parser.add_argument('--print-freq', '-p', default=10, type=int, metavar='N', help='print frequency (default: 10)') def main(): args = parser.parse_args() args.gpu_id = 0 # Set graph optimization level sess_options = onnxruntime.SessionOptions() sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL if args.profile: sess_options.enable_profiling = True if args.onnx_output_opt: sess_options.optimized_model_filepath = args.onnx_output_opt session = onnxruntime.InferenceSession(args.onnx_input, sess_options) data_config = resolve_data_config(None, args) loader = create_loader( Dataset(args.data, load_bytes=args.tf_preprocessing), input_size=data_config['input_size'], batch_size=args.batch_size, use_prefetcher=False, interpolation=data_config['interpolation'], mean=data_config['mean'], std=data_config['std'], num_workers=args.workers, crop_pct=data_config['crop_pct'], tensorflow_preprocessing=args.tf_preprocessing) input_name = session.get_inputs()[0].name batch_time = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() end = time.time() for i, (input, target) in enumerate(loader): # run the net and return prediction output = session.run([], {input_name: input.data.numpy()}) output = output[0] # measure accuracy and record loss prec1, prec5 = accuracy_np(output, target.numpy()) top1.update(prec1.item(), input.size(0)) top5.update(prec5.item(), input.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print('Test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s, {ms_avg:.3f} ms/sample) \t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg, ms_avg=100 * batch_time.avg / input.size(0), top1=top1, top5=top5)) print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format( top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg)) def accuracy_np(output, target): max_indices = np.argsort(output, axis=1)[:, ::-1] top5 = 100 * np.equal(max_indices[:, :5], target[:, np.newaxis]).sum(axis=1).mean() top1 = 100 * np.equal(max_indices[:, 0], target).mean() return top1, top5 if __name__ == '__main__': main() ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/requirements.txt ================================================ torch>=1.2.0 torchvision>=0.4.0 ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/setup.py ================================================ """ Setup """ from setuptools import setup, find_packages from codecs import open from os import path here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, 'README.md'), encoding='utf-8') as f: long_description = f.read() exec(open('geffnet/version.py').read()) setup( name='geffnet', version=__version__, description='(Generic) EfficientNets for PyTorch', long_description=long_description, long_description_content_type='text/markdown', url='https://github.com/rwightman/gen-efficientnet-pytorch', author='Ross Wightman', author_email='hello@rwightman.com', classifiers=[ # How mature is this project? Common values are # 3 - Alpha # 4 - Beta # 5 - Production/Stable 'Development Status :: 3 - Alpha', 'Intended Audience :: Education', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Software Development', 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules', ], # Note that this is a string of words separated by whitespace, not a list. keywords='pytorch pretrained models efficientnet mixnet mobilenetv3 mnasnet', packages=find_packages(exclude=['data']), install_requires=['torch >= 1.4', 'torchvision'], python_requires='>=3.6', ) ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/utils.py ================================================ import os class AverageMeter: """Computes and stores the average and current value""" def __init__(self): self.reset() def reset(self): self.val = 0 self.avg = 0 self.sum = 0 self.count = 0 def update(self, val, n=1): self.val = val self.sum += val * n self.count += n self.avg = self.sum / self.count def accuracy(output, target, topk=(1,)): """Computes the precision@k for the specified values of k""" maxk = max(topk) batch_size = target.size(0) _, pred = output.topk(maxk, 1, True, True) pred = pred.t() correct = pred.eq(target.view(1, -1).expand_as(pred)) res = [] for k in topk: correct_k = correct[:k].reshape(-1).float().sum(0) res.append(correct_k.mul_(100.0 / batch_size)) return res def get_outdir(path, *paths, inc=False): outdir = os.path.join(path, *paths) if not os.path.exists(outdir): os.makedirs(outdir) elif inc: count = 1 outdir_inc = outdir + '-' + str(count) while os.path.exists(outdir_inc): count = count + 1 outdir_inc = outdir + '-' + str(count) assert count < 100 outdir = outdir_inc os.makedirs(outdir) return outdir ================================================ FILE: annotator/normalbae/models/submodules/efficientnet_repo/validate.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function import argparse import time import torch import torch.nn as nn import torch.nn.parallel from contextlib import suppress import geffnet from data import Dataset, create_loader, resolve_data_config from utils import accuracy, AverageMeter has_native_amp = False try: if getattr(torch.cuda.amp, 'autocast') is not None: has_native_amp = True except AttributeError: pass torch.backends.cudnn.benchmark = True parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation') parser.add_argument('data', metavar='DIR', help='path to dataset') parser.add_argument('--model', '-m', metavar='MODEL', default='spnasnet1_00', help='model architecture (default: dpn92)') parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers (default: 2)') parser.add_argument('-b', '--batch-size', default=256, type=int, metavar='N', help='mini-batch size (default: 256)') parser.add_argument('--img-size', default=None, type=int, metavar='N', help='Input image dimension, uses model default if empty') parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', help='Override mean pixel value of dataset') parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', help='Override std deviation of of dataset') parser.add_argument('--crop-pct', type=float, default=None, metavar='PCT', help='Override default crop pct of 0.875') parser.add_argument('--interpolation', default='', type=str, metavar='NAME', help='Image resize interpolation type (overrides model)') parser.add_argument('--num-classes', type=int, default=1000, help='Number classes in dataset') parser.add_argument('--print-freq', '-p', default=10, type=int, metavar='N', help='print frequency (default: 10)') parser.add_argument('--checkpoint', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model') parser.add_argument('--torchscript', dest='torchscript', action='store_true', help='convert model torchscript for inference') parser.add_argument('--num-gpu', type=int, default=1, help='Number of GPUS to use') parser.add_argument('--tf-preprocessing', dest='tf_preprocessing', action='store_true', help='use tensorflow mnasnet preporcessing') parser.add_argument('--no-cuda', dest='no_cuda', action='store_true', help='') parser.add_argument('--channels-last', action='store_true', default=False, help='Use channels_last memory layout') parser.add_argument('--amp', action='store_true', default=False, help='Use native Torch AMP mixed precision.') def main(): args = parser.parse_args() if not args.checkpoint and not args.pretrained: args.pretrained = True amp_autocast = suppress # do nothing if args.amp: if not has_native_amp: print("Native Torch AMP is not available (requires torch >= 1.6), using FP32.") else: amp_autocast = torch.cuda.amp.autocast # create model model = geffnet.create_model( args.model, num_classes=args.num_classes, in_chans=3, pretrained=args.pretrained, checkpoint_path=args.checkpoint, scriptable=args.torchscript) if args.channels_last: model = model.to(memory_format=torch.channels_last) if args.torchscript: torch.jit.optimized_execution(True) model = torch.jit.script(model) print('Model %s created, param count: %d' % (args.model, sum([m.numel() for m in model.parameters()]))) data_config = resolve_data_config(model, args) criterion = nn.CrossEntropyLoss() if not args.no_cuda: if args.num_gpu > 1: model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda() else: model = model.cuda() criterion = criterion.cuda() loader = create_loader( Dataset(args.data, load_bytes=args.tf_preprocessing), input_size=data_config['input_size'], batch_size=args.batch_size, use_prefetcher=not args.no_cuda, interpolation=data_config['interpolation'], mean=data_config['mean'], std=data_config['std'], num_workers=args.workers, crop_pct=data_config['crop_pct'], tensorflow_preprocessing=args.tf_preprocessing) batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() model.eval() end = time.time() with torch.no_grad(): for i, (input, target) in enumerate(loader): if not args.no_cuda: target = target.cuda() input = input.cuda() if args.channels_last: input = input.contiguous(memory_format=torch.channels_last) # compute output with amp_autocast(): output = model(input) loss = criterion(output, target) # measure accuracy and record loss prec1, prec5 = accuracy(output.data, target, topk=(1, 5)) losses.update(loss.item(), input.size(0)) top1.update(prec1.item(), input.size(0)) top5.update(prec5.item(), input.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: print('Test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s) \t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format( i, len(loader), batch_time=batch_time, rate_avg=input.size(0) / batch_time.avg, loss=losses, top1=top1, top5=top5)) print(' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'.format( top1=top1, top1a=100-top1.avg, top5=top5, top5a=100.-top5.avg)) if __name__ == '__main__': main() ================================================ FILE: annotator/normalbae/models/submodules/encoder.py ================================================ import os import torch import torch.nn as nn import torch.nn.functional as F class Encoder(nn.Module): def __init__(self): super(Encoder, self).__init__() basemodel_name = 'tf_efficientnet_b5_ap' print('Loading base model ()...'.format(basemodel_name), end='') repo_path = os.path.join(os.path.dirname(__file__), 'efficientnet_repo') basemodel = torch.hub.load(repo_path, basemodel_name, pretrained=False, source='local') print('Done.') # Remove last layer print('Removing last two layers (global_pool & classifier).') basemodel.global_pool = nn.Identity() basemodel.classifier = nn.Identity() self.original_model = basemodel def forward(self, x): features = [x] for k, v in self.original_model._modules.items(): if (k == 'blocks'): for ki, vi in v._modules.items(): features.append(vi(features[-1])) else: features.append(v(features[-1])) return features ================================================ FILE: annotator/normalbae/models/submodules/submodules.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F ######################################################################################################################## # Upsample + BatchNorm class UpSampleBN(nn.Module): def __init__(self, skip_input, output_features): super(UpSampleBN, self).__init__() self._net = nn.Sequential(nn.Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1), nn.BatchNorm2d(output_features), nn.LeakyReLU(), nn.Conv2d(output_features, output_features, kernel_size=3, stride=1, padding=1), nn.BatchNorm2d(output_features), nn.LeakyReLU()) def forward(self, x, concat_with): up_x = F.interpolate(x, size=[concat_with.size(2), concat_with.size(3)], mode='bilinear', align_corners=True) f = torch.cat([up_x, concat_with], dim=1) return self._net(f) # Upsample + GroupNorm + Weight Standardization class UpSampleGN(nn.Module): def __init__(self, skip_input, output_features): super(UpSampleGN, self).__init__() self._net = nn.Sequential(Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1), nn.GroupNorm(8, output_features), nn.LeakyReLU(), Conv2d(output_features, output_features, kernel_size=3, stride=1, padding=1), nn.GroupNorm(8, output_features), nn.LeakyReLU()) def forward(self, x, concat_with): up_x = F.interpolate(x, size=[concat_with.size(2), concat_with.size(3)], mode='bilinear', align_corners=True) f = torch.cat([up_x, concat_with], dim=1) return self._net(f) # Conv2d with weight standardization class Conv2d(nn.Conv2d): def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True): super(Conv2d, self).__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) def forward(self, x): weight = self.weight weight_mean = weight.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True) weight = weight - weight_mean std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + 1e-5 weight = weight / std.expand_as(weight) return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) # normalize def norm_normalize(norm_out): min_kappa = 0.01 norm_x, norm_y, norm_z, kappa = torch.split(norm_out, 1, dim=1) norm = torch.sqrt(norm_x ** 2.0 + norm_y ** 2.0 + norm_z ** 2.0) + 1e-10 kappa = F.elu(kappa) + 1.0 + min_kappa final_out = torch.cat([norm_x / norm, norm_y / norm, norm_z / norm, kappa], dim=1) return final_out # uncertainty-guided sampling (only used during training) @torch.no_grad() def sample_points(init_normal, gt_norm_mask, sampling_ratio, beta): device = init_normal.device B, _, H, W = init_normal.shape N = int(sampling_ratio * H * W) beta = beta # uncertainty map uncertainty_map = -1 * init_normal[:, 3, :, :] # B, H, W # gt_invalid_mask (B, H, W) if gt_norm_mask is not None: gt_invalid_mask = F.interpolate(gt_norm_mask.float(), size=[H, W], mode='nearest') gt_invalid_mask = gt_invalid_mask[:, 0, :, :] < 0.5 uncertainty_map[gt_invalid_mask] = -1e4 # (B, H*W) _, idx = uncertainty_map.view(B, -1).sort(1, descending=True) # importance sampling if int(beta * N) > 0: importance = idx[:, :int(beta * N)] # B, beta*N # remaining remaining = idx[:, int(beta * N):] # B, H*W - beta*N # coverage num_coverage = N - int(beta * N) if num_coverage <= 0: samples = importance else: coverage_list = [] for i in range(B): idx_c = torch.randperm(remaining.size()[1]) # shuffles "H*W - beta*N" coverage_list.append(remaining[i, :][idx_c[:num_coverage]].view(1, -1)) # 1, N-beta*N coverage = torch.cat(coverage_list, dim=0) # B, N-beta*N samples = torch.cat((importance, coverage), dim=1) # B, N else: # remaining remaining = idx[:, :] # B, H*W # coverage num_coverage = N coverage_list = [] for i in range(B): idx_c = torch.randperm(remaining.size()[1]) # shuffles "H*W - beta*N" coverage_list.append(remaining[i, :][idx_c[:num_coverage]].view(1, -1)) # 1, N-beta*N coverage = torch.cat(coverage_list, dim=0) # B, N-beta*N samples = coverage # point coordinates rows_int = samples // W # 0 for first row, H-1 for last row rows_float = rows_int / float(H-1) # 0 to 1.0 rows_float = (rows_float * 2.0) - 1.0 # -1.0 to 1.0 cols_int = samples % W # 0 for first column, W-1 for last column cols_float = cols_int / float(W-1) # 0 to 1.0 cols_float = (cols_float * 2.0) - 1.0 # -1.0 to 1.0 point_coords = torch.zeros(B, 1, N, 2) point_coords[:, 0, :, 0] = cols_float # x coord point_coords[:, 0, :, 1] = rows_float # y coord point_coords = point_coords.to(device) return point_coords, rows_int, cols_int ================================================ FILE: annotator/normaldsine/LICENSE ================================================ DSINE SOFTWARE LICENCE AGREEMENT WE (Imperial College of Science, Technology and Medicine, (“Imperial College London”)) ARE WILLING TO LICENSE THIS SOFTWARE TO YOU (a licensee “You”) ONLY ON THE CONDITION THAT YOU ACCEPT ALL OF THE TERMS CONTAINED IN THE FOLLOWING AGREEMENT. PLEASE READ THE AGREEMENT CAREFULLY BEFORE DOWNLOADING THE SOFTWARE. BY EXERCISING THE OPTION TO DOWNLOAD THE SOFTWARE YOU AGREE TO BE BOUND BY THE TERMS OF THE AGREEMENT. SOFTWARE LICENCE AGREEMENT (EXCLUDING BSD COMPONENTS) 1. This Agreement pertains to a worldwide, non-exclusive, temporary, fully paid-up, royalty free, non-transferable, non-sub- licensable licence (the “Licence”) to use the elastic fusion source code, including any modification, part or derivative (the “Software”). Ownership and Licence. Your rights to use and download the Software onto your computer, and all other copies that You are authorised to make, are specified in this Agreement. However, we (or our licensors) retain all rights, including but not limited to all copyright and other intellectual property rights anywhere in the world, in the Software not expressly granted to You in this Agreement. 2. Permitted use of the Licence: (a) You may download and install the Software onto one computer or server for use in accordance with Clause 2(b) of this Agreement provided that You ensure that the Software is not accessible by other users unless they have themselves accepted the terms of this licence agreement. (b) You may use the Software solely for non-commercial, internal or academic research purposes and only in accordance with the terms of this Agreement. You may not use the Software for commercial purposes, including but not limited to (1) integration of all or part of the source code or the Software into a product for sale or licence by or on behalf of You to third parties or (2) use of the Software or any derivative of it for research to develop software products for sale or licence to a third party or (3) use of the Software or any derivative of it for research to develop non-software products for sale or licence to a third party, or (4) use of the Software to provide any service to an external organisation for which payment is received. Should You wish to use the Software for commercial purposes, You shall email researchcontracts.engineering@imperial.ac.uk . (c) Right to Copy. You may copy the Software for back-up and archival purposes, provided that each copy is kept in your possession and provided You reproduce our copyright notice (set out in Schedule 1) on each copy. (d) Transfer and sub-licensing. You may not rent, lend, or lease the Software and You may not transmit, transfer or sub-license this licence to use the Software or any of your rights or obligations under this Agreement to another party. (e) Identity of Licensee. The licence granted herein is personal to You. You shall not permit any third party to access, modify or otherwise use the Software nor shall You access modify or otherwise use the Software on behalf of any third party. If You wish to obtain a licence for mutiple users or a site licence for the Software please contact us at researchcontracts.engineering@imperial.ac.uk . (f) Publications and presentations. You may make public, results or data obtained from, dependent on or arising from research carried out using the Software, provided that any such presentation or publication identifies the Software as the source of the results or the data, including the Copyright Notice given in each element of the Software, and stating that the Software has been made available for use by You under licence from Imperial College London and You provide a copy of any such publication to Imperial College London. 3. Prohibited Uses. You may not, without written permission from us at researchcontracts.engineering@imperial.ac.uk : (a) Use, copy, modify, merge, or transfer copies of the Software or any documentation provided by us which relates to the Software except as provided in this Agreement; (b) Use any back-up or archival copies of the Software (or allow anyone else to use such copies) for any purpose other than to replace the original copy in the event it is destroyed or becomes defective; or (c) Disassemble, decompile or "unlock", reverse translate, or in any manner decode the Software for any reason. 4. Warranty Disclaimer (a) Disclaimer. The Software has been developed for research purposes only. You acknowledge that we are providing the Software to You under this licence agreement free of charge and on condition that the disclaimer set out below shall apply. We do not represent or warrant that the Software as to: (i) the quality, accuracy or reliability of the Software; (ii) the suitability of the Software for any particular use or for use under any specific conditions; and (iii) whether use of the Software will infringe third-party rights. You acknowledge that You have reviewed and evaluated the Software to determine that it meets your needs and that You assume all responsibility and liability for determining the suitability of the Software as fit for your particular purposes and requirements. Subject to Clause 4(b), we exclude and expressly disclaim all express and implied representations, warranties, conditions and terms not stated herein (including the implied conditions or warranties of satisfactory quality, merchantable quality, merchantability and fitness for purpose). (b) Savings. Some jurisdictions may imply warranties, conditions or terms or impose obligations upon us which cannot, in whole or in part, be excluded, restricted or modified or otherwise do not allow the exclusion of implied warranties, conditions or terms, in which case the above warranty disclaimer and exclusion will only apply to You to the extent permitted in the relevant jurisdiction and does not in any event exclude any implied warranties, conditions or terms which may not under applicable law be excluded. (c) Imperial College London disclaims all responsibility for the use which is made of the Software and any liability for the outcomes arising from using the Software. 5. Limitation of Liability (a) You acknowledge that we are providing the Software to You under this licence agreement free of charge and on condition that the limitation of liability set out below shall apply. Accordingly, subject to Clause 5(b), we exclude all liability whether in contract, tort, negligence or otherwise, in respect of the Software and/or any related documentation provided to You by us including, but not limited to, liability for loss or corruption of data, loss of contracts, loss of income, loss of profits, loss of cover and any consequential or indirect loss or damage of any kind arising out of or in connection with this licence agreement, however caused. This exclusion shall apply even if we have been advised of the possibility of such loss or damage. (b) You agree to indemnify Imperial College London and hold it harmless from and against any and all claims, damages and liabilities asserted by third parties (including claims for negligence) which arise directly or indirectly from the use of the Software or any derivative of it or the sale of any products based on the Software. You undertake to make no liability claim against any employee, student, agent or appointee of Imperial College London, in connection with this Licence or the Software. (c) Nothing in this Agreement shall have the effect of excluding or limiting our statutory liability. (d) Some jurisdictions do not allow these limitations or exclusions either wholly or in part, and, to that extent, they may not apply to you. Nothing in this licence agreement will affect your statutory rights or other relevant statutory provisions which cannot be excluded, restricted or modified, and its terms and conditions must be read and construed subject to any such statutory rights and/or provisions. 6. Confidentiality. You agree not to disclose any confidential information provided to You by us pursuant to this Agreement to any third party without our prior written consent. The obligations in this Clause 6 shall survive the termination of this Agreement for any reason. 7. Termination. (a) We may terminate this licence agreement and your right to use the Software at any time with immediate effect upon written notice to You. (b) This licence agreement and your right to use the Software automatically terminate if You: (i) fail to comply with any provisions of this Agreement; or (ii) destroy the copies of the Software in your possession, or voluntarily return the Software to us. (c) Upon termination You will destroy all copies of the Software. (d) Otherwise, the restrictions on your rights to use the Software will expire 10 (ten) years after first use of the Software under this licence agreement. 8. Miscellaneous Provisions. (a) This Agreement will be governed by and construed in accordance with the substantive laws of England and Wales whose courts shall have exclusive jurisdiction over all disputes which may arise between us. (b) This is the entire agreement between us relating to the Software, and supersedes any prior purchase order, communications, advertising or representations concerning the Software. (c) No change or modification of this Agreement will be valid unless it is in writing, and is signed by us. (d) The unenforceability or invalidity of any part of this Agreement will not affect the enforceability or validity of the remaining parts. BSD Elements of the Software For BSD elements of the Software, the following terms shall apply: Copyright as indicated in the header of the individual element of the Software. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SCHEDULE 1 The Software DSINE is a framework for estimating surface normals from a single image. It is based on the techniques described in the following publication: • Gwangbin Bae, Andrew J. Davison. Rethinking Inductive Biases for Surface Normal Estimation. CVPR, 2024 _________________________ Acknowledgments If you use the software, you should reference the following paper in any publication: • Gwangbin Bae, Andrew J. Davison. Rethinking Inductive Biases for Surface Normal Estimation. CVPR, 2024 ================================================ FILE: annotator/normaldsine/__init__.py ================================================ import os import torch import torch.nn.functional as F import numpy as np from einops import rearrange from modules import devices from annotator.annotator_path import models_path import torchvision.transforms as transforms import dsine.utils.utils as utils from dsine.models.dsine import DSINE from scripts.utils import resize_image_with_pad class NormalDsineDetector: model_dir = os.path.join(models_path, "normal_dsine") def __init__(self): self.model = None self.device = devices.get_device_for("controlnet") def load_model(self): remote_model_path = "https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/resolve/main/Annotators/dsine.pt" modelpath = os.path.join(self.model_dir, "dsine.pt") if not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) model = DSINE() model.pixel_coords = model.pixel_coords.to(self.device) model = utils.load_checkpoint(modelpath, model) model.eval() self.model = model.to(self.device) self.norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, input_image, new_fov=60.0, iterations=5, resulotion=512): if self.model is None: self.load_model() self.model.to(self.device) self.model.num_iter = iterations orig_H, orig_W = input_image.shape[:2] l, r, t, b = utils.pad_input(orig_H, orig_W) input_image, remove_pad = resize_image_with_pad(input_image, resulotion) assert input_image.ndim == 3 image_normal = input_image with torch.no_grad(): image_normal = torch.from_numpy(image_normal).float().to(self.device) image_normal = image_normal / 255.0 image_normal = rearrange(image_normal, 'h w c -> 1 c h w') image_normal = self.norm(image_normal) intrins = utils.get_intrins_from_fov(new_fov=new_fov, H=orig_H, W=orig_W, device=self.device).unsqueeze(0) intrins[:, 0, 2] += l intrins[:, 1, 2] += t normal = self.model(image_normal, intrins=intrins)[-1] normal = normal[:, :, t:t+orig_H, l:l+orig_W] normal = ((normal + 1) * 0.5).clip(0, 1) normal = rearrange(normal[0], 'c h w -> h w c').cpu().numpy() normal_image = (normal * 255.0).clip(0, 255).astype(np.uint8) return remove_pad(normal_image) ================================================ FILE: annotator/oneformer/LICENSE ================================================ MIT License Copyright (c) 2022 Caroline Chan Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/oneformer/__init__.py ================================================ import os from modules import devices from annotator.annotator_path import models_path from .api import make_detectron2_model, semantic_run class OneformerDetector: model_dir = os.path.join(models_path, "oneformer") configs = { "coco": { "name": "150_16_swin_l_oneformer_coco_100ep.pth", "config": 'configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml' }, "ade20k": { "name": "250_16_swin_l_oneformer_ade20k_160k.pth", "config": 'configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml' } } def __init__(self, config): self.model = None self.metadata = None self.config = config self.device = devices.get_device_for("controlnet") def load_model(self): remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/" + self.config["name"] modelpath = os.path.join(self.model_dir, self.config["name"]) if not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) config = os.path.join(os.path.dirname(__file__), self.config["config"]) model, self.metadata = make_detectron2_model(config, modelpath) self.model = model def unload_model(self): if self.model is not None: self.model.model.cpu() def __call__(self, img): if self.model is None: self.load_model() self.model.model.to(self.device) return semantic_run(img, self.model, self.metadata) ================================================ FILE: annotator/oneformer/api.py ================================================ import os os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" import torch from annotator.oneformer.detectron2.config import get_cfg from annotator.oneformer.detectron2.projects.deeplab import add_deeplab_config from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.oneformer import ( add_oneformer_config, add_common_config, add_swin_config, add_dinat_config, ) from annotator.oneformer.oneformer.demo.defaults import DefaultPredictor from annotator.oneformer.oneformer.demo.visualizer import Visualizer, ColorMode def make_detectron2_model(config_path, ckpt_path): cfg = get_cfg() add_deeplab_config(cfg) add_common_config(cfg) add_swin_config(cfg) add_oneformer_config(cfg) add_dinat_config(cfg) cfg.merge_from_file(config_path) cfg.MODEL.WEIGHTS = ckpt_path cfg.freeze() metadata = MetadataCatalog.get(cfg.DATASETS.TEST_PANOPTIC[0] if len(cfg.DATASETS.TEST_PANOPTIC) else "__unused") return DefaultPredictor(cfg), metadata def semantic_run(img, predictor, metadata): predictions = predictor(img[:, :, ::-1], "semantic") # Predictor of OneFormer must use BGR image !!! visualizer_map = Visualizer(img, is_img=False, metadata=metadata, instance_mode=ColorMode.IMAGE) out_map = visualizer_map.draw_sem_seg(predictions["sem_seg"].argmax(dim=0).cpu(), alpha=1, is_text=False).get_image() return out_map ================================================ FILE: annotator/oneformer/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml ================================================ MODEL: BACKBONE: FREEZE_AT: 0 NAME: "build_resnet_backbone" WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used DATASETS: TRAIN: ("ade20k_panoptic_train",) TEST_PANOPTIC: ("ade20k_panoptic_val",) TEST_INSTANCE: ("ade20k_instance_val",) TEST_SEMANTIC: ("ade20k_sem_seg_val",) SOLVER: IMS_PER_BATCH: 16 BASE_LR: 0.0001 MAX_ITER: 160000 WARMUP_FACTOR: 1.0 WARMUP_ITERS: 0 WEIGHT_DECAY: 0.05 OPTIMIZER: "ADAMW" LR_SCHEDULER_NAME: "WarmupPolyLR" BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True CLIP_TYPE: "full_model" CLIP_VALUE: 0.01 NORM_TYPE: 2.0 AMP: ENABLED: True INPUT: MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"] MIN_SIZE_TRAIN_SAMPLING: "choice" MIN_SIZE_TEST: 512 MAX_SIZE_TRAIN: 2048 MAX_SIZE_TEST: 2048 CROP: ENABLED: True TYPE: "absolute" SIZE: (512, 512) SINGLE_CATEGORY_MAX_AREA: 1.0 COLOR_AUG_SSD: True SIZE_DIVISIBILITY: 512 # used in dataset mapper FORMAT: "RGB" DATASET_MAPPER_NAME: "oneformer_unified" MAX_SEQ_LEN: 77 TASK_SEQ_LEN: 77 TASK_PROB: SEMANTIC: 0.33 INSTANCE: 0.66 TEST: EVAL_PERIOD: 5000 AUG: ENABLED: False MIN_SIZES: [256, 384, 512, 640, 768, 896] MAX_SIZE: 3584 FLIP: True DATALOADER: FILTER_EMPTY_ANNOTATIONS: True NUM_WORKERS: 4 VERSION: 2 ================================================ FILE: annotator/oneformer/configs/ade20k/oneformer_R50_bs16_160k.yaml ================================================ _BASE_: Base-ADE20K-UnifiedSegmentation.yaml MODEL: META_ARCHITECTURE: "OneFormer" SEM_SEG_HEAD: NAME: "OneFormerHead" IGNORE_VALUE: 255 NUM_CLASSES: 150 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 ONE_FORMER: TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 2.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 CONTRASTIVE_WEIGHT: 0.5 CONTRASTIVE_TEMPERATURE: 0.07 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 150 USE_TASK_NORM: True NHEADS: 8 DROPOUT: 0.1 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 CLASS_DEC_LAYERS: 2 DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 TEXT_ENCODER: WIDTH: 256 CONTEXT_LENGTH: 77 NUM_LAYERS: 6 VOCAB_SIZE: 49408 PROJ_NUM_LAYERS: 2 N_CTX: 16 TEST: SEMANTIC_ON: True INSTANCE_ON: True PANOPTIC_ON: True OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.8 TASK: "panoptic" TEST: DETECTIONS_PER_IMAGE: 150 ================================================ FILE: annotator/oneformer/configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml ================================================ _BASE_: oneformer_R50_bs16_160k.yaml MODEL: BACKBONE: NAME: "D2SwinTransformer" SWIN: EMBED_DIM: 192 DEPTHS: [2, 2, 18, 2] NUM_HEADS: [6, 12, 24, 48] WINDOW_SIZE: 12 APE: False DROP_PATH_RATE: 0.3 PATCH_NORM: True PRETRAIN_IMG_SIZE: 384 WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] ONE_FORMER: NUM_OBJECT_QUERIES: 250 INPUT: MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] MIN_SIZE_TRAIN_SAMPLING: "choice" MIN_SIZE_TEST: 640 MAX_SIZE_TRAIN: 2560 MAX_SIZE_TEST: 2560 CROP: ENABLED: True TYPE: "absolute" SIZE: (640, 640) SINGLE_CATEGORY_MAX_AREA: 1.0 COLOR_AUG_SSD: True SIZE_DIVISIBILITY: 640 # used in dataset mapper FORMAT: "RGB" TEST: DETECTIONS_PER_IMAGE: 250 EVAL_PERIOD: 5000 AUG: ENABLED: False MIN_SIZES: [320, 480, 640, 800, 960, 1120] MAX_SIZE: 4480 FLIP: True ================================================ FILE: annotator/oneformer/configs/coco/Base-COCO-UnifiedSegmentation.yaml ================================================ MODEL: BACKBONE: FREEZE_AT: 0 NAME: "build_resnet_backbone" WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used DATASETS: TRAIN: ("coco_2017_train_panoptic_with_sem_seg",) TEST_PANOPTIC: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well TEST_INSTANCE: ("coco_2017_val",) TEST_SEMANTIC: ("coco_2017_val_panoptic_with_sem_seg",) SOLVER: IMS_PER_BATCH: 16 BASE_LR: 0.0001 STEPS: (327778, 355092) MAX_ITER: 368750 WARMUP_FACTOR: 1.0 WARMUP_ITERS: 10 WEIGHT_DECAY: 0.05 OPTIMIZER: "ADAMW" BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True CLIP_TYPE: "full_model" CLIP_VALUE: 0.01 NORM_TYPE: 2.0 AMP: ENABLED: True INPUT: IMAGE_SIZE: 1024 MIN_SCALE: 0.1 MAX_SCALE: 2.0 FORMAT: "RGB" DATASET_MAPPER_NAME: "coco_unified_lsj" MAX_SEQ_LEN: 77 TASK_SEQ_LEN: 77 TASK_PROB: SEMANTIC: 0.33 INSTANCE: 0.66 TEST: EVAL_PERIOD: 5000 DATALOADER: FILTER_EMPTY_ANNOTATIONS: True NUM_WORKERS: 4 VERSION: 2 ================================================ FILE: annotator/oneformer/configs/coco/oneformer_R50_bs16_50ep.yaml ================================================ _BASE_: Base-COCO-UnifiedSegmentation.yaml MODEL: META_ARCHITECTURE: "OneFormer" SEM_SEG_HEAD: NAME: "OneFormerHead" IGNORE_VALUE: 255 NUM_CLASSES: 133 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder" IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 ONE_FORMER: TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder" TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 2.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 CONTRASTIVE_WEIGHT: 0.5 CONTRASTIVE_TEMPERATURE: 0.07 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 150 USE_TASK_NORM: True NHEADS: 8 DROPOUT: 0.1 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 CLASS_DEC_LAYERS: 2 DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 TEXT_ENCODER: WIDTH: 256 CONTEXT_LENGTH: 77 NUM_LAYERS: 6 VOCAB_SIZE: 49408 PROJ_NUM_LAYERS: 2 N_CTX: 16 TEST: SEMANTIC_ON: True INSTANCE_ON: True PANOPTIC_ON: True DETECTION_ON: False OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.8 TASK: "panoptic" TEST: DETECTIONS_PER_IMAGE: 150 ================================================ FILE: annotator/oneformer/configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml ================================================ _BASE_: oneformer_R50_bs16_50ep.yaml MODEL: BACKBONE: NAME: "D2SwinTransformer" SWIN: EMBED_DIM: 192 DEPTHS: [2, 2, 18, 2] NUM_HEADS: [6, 12, 24, 48] WINDOW_SIZE: 12 APE: False DROP_PATH_RATE: 0.3 PATCH_NORM: True PRETRAIN_IMG_SIZE: 384 WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] ONE_FORMER: NUM_OBJECT_QUERIES: 150 SOLVER: STEPS: (655556, 735184) MAX_ITER: 737500 AMP: ENABLED: False TEST: DETECTIONS_PER_IMAGE: 150 ================================================ FILE: annotator/oneformer/detectron2/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .utils.env import setup_environment setup_environment() # This line will be programatically read/write by setup.py. # Leave them at the bottom of this file and don't touch them. __version__ = "0.6" ================================================ FILE: annotator/oneformer/detectron2/checkpoint/__init__.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. # File: from . import catalog as _UNUSED # register the handler from .detection_checkpoint import DetectionCheckpointer from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer __all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"] ================================================ FILE: annotator/oneformer/detectron2/checkpoint/c2_model_loading.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import logging import re from typing import Dict, List import torch from tabulate import tabulate def convert_basic_c2_names(original_keys): """ Apply some basic name conversion to names in C2 weights. It only deals with typical backbone models. Args: original_keys (list[str]): Returns: list[str]: The same number of strings matching those in original_keys. """ layer_keys = copy.deepcopy(original_keys) layer_keys = [ {"pred_b": "linear_b", "pred_w": "linear_w"}.get(k, k) for k in layer_keys ] # some hard-coded mappings layer_keys = [k.replace("_", ".") for k in layer_keys] layer_keys = [re.sub("\\.b$", ".bias", k) for k in layer_keys] layer_keys = [re.sub("\\.w$", ".weight", k) for k in layer_keys] # Uniform both bn and gn names to "norm" layer_keys = [re.sub("bn\\.s$", "norm.weight", k) for k in layer_keys] layer_keys = [re.sub("bn\\.bias$", "norm.bias", k) for k in layer_keys] layer_keys = [re.sub("bn\\.rm", "norm.running_mean", k) for k in layer_keys] layer_keys = [re.sub("bn\\.running.mean$", "norm.running_mean", k) for k in layer_keys] layer_keys = [re.sub("bn\\.riv$", "norm.running_var", k) for k in layer_keys] layer_keys = [re.sub("bn\\.running.var$", "norm.running_var", k) for k in layer_keys] layer_keys = [re.sub("bn\\.gamma$", "norm.weight", k) for k in layer_keys] layer_keys = [re.sub("bn\\.beta$", "norm.bias", k) for k in layer_keys] layer_keys = [re.sub("gn\\.s$", "norm.weight", k) for k in layer_keys] layer_keys = [re.sub("gn\\.bias$", "norm.bias", k) for k in layer_keys] # stem layer_keys = [re.sub("^res\\.conv1\\.norm\\.", "conv1.norm.", k) for k in layer_keys] # to avoid mis-matching with "conv1" in other components (e.g. detection head) layer_keys = [re.sub("^conv1\\.", "stem.conv1.", k) for k in layer_keys] # layer1-4 is used by torchvision, however we follow the C2 naming strategy (res2-5) # layer_keys = [re.sub("^res2.", "layer1.", k) for k in layer_keys] # layer_keys = [re.sub("^res3.", "layer2.", k) for k in layer_keys] # layer_keys = [re.sub("^res4.", "layer3.", k) for k in layer_keys] # layer_keys = [re.sub("^res5.", "layer4.", k) for k in layer_keys] # blocks layer_keys = [k.replace(".branch1.", ".shortcut.") for k in layer_keys] layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys] layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys] layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys] # DensePose substitutions layer_keys = [re.sub("^body.conv.fcn", "body_conv_fcn", k) for k in layer_keys] layer_keys = [k.replace("AnnIndex.lowres", "ann_index_lowres") for k in layer_keys] layer_keys = [k.replace("Index.UV.lowres", "index_uv_lowres") for k in layer_keys] layer_keys = [k.replace("U.lowres", "u_lowres") for k in layer_keys] layer_keys = [k.replace("V.lowres", "v_lowres") for k in layer_keys] return layer_keys def convert_c2_detectron_names(weights): """ Map Caffe2 Detectron weight names to Detectron2 names. Args: weights (dict): name -> tensor Returns: dict: detectron2 names -> tensor dict: detectron2 names -> C2 names """ logger = logging.getLogger(__name__) logger.info("Renaming Caffe2 weights ......") original_keys = sorted(weights.keys()) layer_keys = copy.deepcopy(original_keys) layer_keys = convert_basic_c2_names(layer_keys) # -------------------------------------------------------------------------- # RPN hidden representation conv # -------------------------------------------------------------------------- # FPN case # In the C2 model, the RPN hidden layer conv is defined for FPN level 2 and then # shared for all other levels, hence the appearance of "fpn2" layer_keys = [ k.replace("conv.rpn.fpn2", "proposal_generator.rpn_head.conv") for k in layer_keys ] # Non-FPN case layer_keys = [k.replace("conv.rpn", "proposal_generator.rpn_head.conv") for k in layer_keys] # -------------------------------------------------------------------------- # RPN box transformation conv # -------------------------------------------------------------------------- # FPN case (see note above about "fpn2") layer_keys = [ k.replace("rpn.bbox.pred.fpn2", "proposal_generator.rpn_head.anchor_deltas") for k in layer_keys ] layer_keys = [ k.replace("rpn.cls.logits.fpn2", "proposal_generator.rpn_head.objectness_logits") for k in layer_keys ] # Non-FPN case layer_keys = [ k.replace("rpn.bbox.pred", "proposal_generator.rpn_head.anchor_deltas") for k in layer_keys ] layer_keys = [ k.replace("rpn.cls.logits", "proposal_generator.rpn_head.objectness_logits") for k in layer_keys ] # -------------------------------------------------------------------------- # Fast R-CNN box head # -------------------------------------------------------------------------- layer_keys = [re.sub("^bbox\\.pred", "bbox_pred", k) for k in layer_keys] layer_keys = [re.sub("^cls\\.score", "cls_score", k) for k in layer_keys] layer_keys = [re.sub("^fc6\\.", "box_head.fc1.", k) for k in layer_keys] layer_keys = [re.sub("^fc7\\.", "box_head.fc2.", k) for k in layer_keys] # 4conv1fc head tensor names: head_conv1_w, head_conv1_gn_s layer_keys = [re.sub("^head\\.conv", "box_head.conv", k) for k in layer_keys] # -------------------------------------------------------------------------- # FPN lateral and output convolutions # -------------------------------------------------------------------------- def fpn_map(name): """ Look for keys with the following patterns: 1) Starts with "fpn.inner." Example: "fpn.inner.res2.2.sum.lateral.weight" Meaning: These are lateral pathway convolutions 2) Starts with "fpn.res" Example: "fpn.res2.2.sum.weight" Meaning: These are FPN output convolutions """ splits = name.split(".") norm = ".norm" if "norm" in splits else "" if name.startswith("fpn.inner."): # splits example: ['fpn', 'inner', 'res2', '2', 'sum', 'lateral', 'weight'] stage = int(splits[2][len("res") :]) return "fpn_lateral{}{}.{}".format(stage, norm, splits[-1]) elif name.startswith("fpn.res"): # splits example: ['fpn', 'res2', '2', 'sum', 'weight'] stage = int(splits[1][len("res") :]) return "fpn_output{}{}.{}".format(stage, norm, splits[-1]) return name layer_keys = [fpn_map(k) for k in layer_keys] # -------------------------------------------------------------------------- # Mask R-CNN mask head # -------------------------------------------------------------------------- # roi_heads.StandardROIHeads case layer_keys = [k.replace(".[mask].fcn", "mask_head.mask_fcn") for k in layer_keys] layer_keys = [re.sub("^\\.mask\\.fcn", "mask_head.mask_fcn", k) for k in layer_keys] layer_keys = [k.replace("mask.fcn.logits", "mask_head.predictor") for k in layer_keys] # roi_heads.Res5ROIHeads case layer_keys = [k.replace("conv5.mask", "mask_head.deconv") for k in layer_keys] # -------------------------------------------------------------------------- # Keypoint R-CNN head # -------------------------------------------------------------------------- # interestingly, the keypoint head convs have blob names that are simply "conv_fcnX" layer_keys = [k.replace("conv.fcn", "roi_heads.keypoint_head.conv_fcn") for k in layer_keys] layer_keys = [ k.replace("kps.score.lowres", "roi_heads.keypoint_head.score_lowres") for k in layer_keys ] layer_keys = [k.replace("kps.score.", "roi_heads.keypoint_head.score.") for k in layer_keys] # -------------------------------------------------------------------------- # Done with replacements # -------------------------------------------------------------------------- assert len(set(layer_keys)) == len(layer_keys) assert len(original_keys) == len(layer_keys) new_weights = {} new_keys_to_original_keys = {} for orig, renamed in zip(original_keys, layer_keys): new_keys_to_original_keys[renamed] = orig if renamed.startswith("bbox_pred.") or renamed.startswith("mask_head.predictor."): # remove the meaningless prediction weight for background class new_start_idx = 4 if renamed.startswith("bbox_pred.") else 1 new_weights[renamed] = weights[orig][new_start_idx:] logger.info( "Remove prediction weight for background class in {}. The shape changes from " "{} to {}.".format( renamed, tuple(weights[orig].shape), tuple(new_weights[renamed].shape) ) ) elif renamed.startswith("cls_score."): # move weights of bg class from original index 0 to last index logger.info( "Move classification weights for background class in {} from index 0 to " "index {}.".format(renamed, weights[orig].shape[0] - 1) ) new_weights[renamed] = torch.cat([weights[orig][1:], weights[orig][:1]]) else: new_weights[renamed] = weights[orig] return new_weights, new_keys_to_original_keys # Note the current matching is not symmetric. # it assumes model_state_dict will have longer names. def align_and_update_state_dicts(model_state_dict, ckpt_state_dict, c2_conversion=True): """ Match names between the two state-dict, and returns a new chkpt_state_dict with names converted to match model_state_dict with heuristics. The returned dict can be later loaded with fvcore checkpointer. If `c2_conversion==True`, `ckpt_state_dict` is assumed to be a Caffe2 model and will be renamed at first. Strategy: suppose that the models that we will create will have prefixes appended to each of its keys, for example due to an extra level of nesting that the original pre-trained weights from ImageNet won't contain. For example, model.state_dict() might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains res2.conv1.weight. We thus want to match both parameters together. For that, we look for each model weight, look among all loaded keys if there is one that is a suffix of the current weight name, and use it if that's the case. If multiple matches exist, take the one with longest size of the corresponding name. For example, for the same model as before, the pretrained weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, we want to match backbone[0].body.conv1.weight to conv1.weight, and backbone[0].body.res2.conv1.weight to res2.conv1.weight. """ model_keys = sorted(model_state_dict.keys()) if c2_conversion: ckpt_state_dict, original_keys = convert_c2_detectron_names(ckpt_state_dict) # original_keys: the name in the original dict (before renaming) else: original_keys = {x: x for x in ckpt_state_dict.keys()} ckpt_keys = sorted(ckpt_state_dict.keys()) def match(a, b): # Matched ckpt_key should be a complete (starts with '.') suffix. # For example, roi_heads.mesh_head.whatever_conv1 does not match conv1, # but matches whatever_conv1 or mesh_head.whatever_conv1. return a == b or a.endswith("." + b) # get a matrix of string matches, where each (i, j) entry correspond to the size of the # ckpt_key string, if it matches match_matrix = [len(j) if match(i, j) else 0 for i in model_keys for j in ckpt_keys] match_matrix = torch.as_tensor(match_matrix).view(len(model_keys), len(ckpt_keys)) # use the matched one with longest size in case of multiple matches max_match_size, idxs = match_matrix.max(1) # remove indices that correspond to no-match idxs[max_match_size == 0] = -1 logger = logging.getLogger(__name__) # matched_pairs (matched checkpoint key --> matched model key) matched_keys = {} result_state_dict = {} for idx_model, idx_ckpt in enumerate(idxs.tolist()): if idx_ckpt == -1: continue key_model = model_keys[idx_model] key_ckpt = ckpt_keys[idx_ckpt] value_ckpt = ckpt_state_dict[key_ckpt] shape_in_model = model_state_dict[key_model].shape if shape_in_model != value_ckpt.shape: logger.warning( "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format( key_ckpt, value_ckpt.shape, key_model, shape_in_model ) ) logger.warning( "{} will not be loaded. Please double check and see if this is desired.".format( key_ckpt ) ) continue assert key_model not in result_state_dict result_state_dict[key_model] = value_ckpt if key_ckpt in matched_keys: # already added to matched_keys logger.error( "Ambiguity found for {} in checkpoint!" "It matches at least two keys in the model ({} and {}).".format( key_ckpt, key_model, matched_keys[key_ckpt] ) ) raise ValueError("Cannot match one checkpoint key to multiple keys in the model.") matched_keys[key_ckpt] = key_model # logging: matched_model_keys = sorted(matched_keys.values()) if len(matched_model_keys) == 0: logger.warning("No weights in checkpoint matched with model.") return ckpt_state_dict common_prefix = _longest_common_prefix(matched_model_keys) rev_matched_keys = {v: k for k, v in matched_keys.items()} original_keys = {k: original_keys[rev_matched_keys[k]] for k in matched_model_keys} model_key_groups = _group_keys_by_module(matched_model_keys, original_keys) table = [] memo = set() for key_model in matched_model_keys: if key_model in memo: continue if key_model in model_key_groups: group = model_key_groups[key_model] memo |= set(group) shapes = [tuple(model_state_dict[k].shape) for k in group] table.append( ( _longest_common_prefix([k[len(common_prefix) :] for k in group]) + "*", _group_str([original_keys[k] for k in group]), " ".join([str(x).replace(" ", "") for x in shapes]), ) ) else: key_checkpoint = original_keys[key_model] shape = str(tuple(model_state_dict[key_model].shape)) table.append((key_model[len(common_prefix) :], key_checkpoint, shape)) table_str = tabulate( table, tablefmt="pipe", headers=["Names in Model", "Names in Checkpoint", "Shapes"] ) logger.info( "Following weights matched with " + (f"submodule {common_prefix[:-1]}" if common_prefix else "model") + ":\n" + table_str ) unmatched_ckpt_keys = [k for k in ckpt_keys if k not in set(matched_keys.keys())] for k in unmatched_ckpt_keys: result_state_dict[k] = ckpt_state_dict[k] return result_state_dict def _group_keys_by_module(keys: List[str], original_names: Dict[str, str]): """ Params in the same submodule are grouped together. Args: keys: names of all parameters original_names: mapping from parameter name to their name in the checkpoint Returns: dict[name -> all other names in the same group] """ def _submodule_name(key): pos = key.rfind(".") if pos < 0: return None prefix = key[: pos + 1] return prefix all_submodules = [_submodule_name(k) for k in keys] all_submodules = [x for x in all_submodules if x] all_submodules = sorted(all_submodules, key=len) ret = {} for prefix in all_submodules: group = [k for k in keys if k.startswith(prefix)] if len(group) <= 1: continue original_name_lcp = _longest_common_prefix_str([original_names[k] for k in group]) if len(original_name_lcp) == 0: # don't group weights if original names don't share prefix continue for k in group: if k in ret: continue ret[k] = group return ret def _longest_common_prefix(names: List[str]) -> str: """ ["abc.zfg", "abc.zef"] -> "abc." """ names = [n.split(".") for n in names] m1, m2 = min(names), max(names) ret = [a for a, b in zip(m1, m2) if a == b] ret = ".".join(ret) + "." if len(ret) else "" return ret def _longest_common_prefix_str(names: List[str]) -> str: m1, m2 = min(names), max(names) lcp = [] for a, b in zip(m1, m2): if a == b: lcp.append(a) else: break lcp = "".join(lcp) return lcp def _group_str(names: List[str]) -> str: """ Turn "common1", "common2", "common3" into "common{1,2,3}" """ lcp = _longest_common_prefix_str(names) rest = [x[len(lcp) :] for x in names] rest = "{" + ",".join(rest) + "}" ret = lcp + rest # add some simplification for BN specifically ret = ret.replace("bn_{beta,running_mean,running_var,gamma}", "bn_*") ret = ret.replace("bn_beta,bn_running_mean,bn_running_var,bn_gamma", "bn_*") return ret ================================================ FILE: annotator/oneformer/detectron2/checkpoint/catalog.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging from annotator.oneformer.detectron2.utils.file_io import PathHandler, PathManager class ModelCatalog(object): """ Store mappings from names to third-party models. """ S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron" # MSRA models have STRIDE_IN_1X1=True. False otherwise. # NOTE: all BN models here have fused BN into an affine layer. # As a result, you should only load them to a model with "FrozenBN". # Loading them to a model with regular BN or SyncBN is wrong. # Even when loaded to FrozenBN, it is still different from affine by an epsilon, # which should be negligible for training. # NOTE: all models here uses PIXEL_STD=[1,1,1] # NOTE: Most of the BN models here are no longer used. We use the # re-converted pre-trained models under detectron2 model zoo instead. C2_IMAGENET_MODELS = { "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl", "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl", "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl", } C2_DETECTRON_PATH_FORMAT = ( "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl" # noqa B950 ) C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival" C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival" # format: {model_name} -> part of the url C2_DETECTRON_MODELS = { "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW", # noqa B950 "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I", # noqa B950 "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7", # noqa B950 "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ", # noqa B950 "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB", # noqa B950 "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC", # noqa B950 "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT", # noqa B950 "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI", # noqa B950 "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q", # noqa B950 "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao", # noqa B950 "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L", # noqa B950 "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179", # noqa B950 "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2", # noqa B950 } @staticmethod def get(name): if name.startswith("Caffe2Detectron/COCO"): return ModelCatalog._get_c2_detectron_baseline(name) if name.startswith("ImageNetPretrained/"): return ModelCatalog._get_c2_imagenet_pretrained(name) raise RuntimeError("model not present in the catalog: {}".format(name)) @staticmethod def _get_c2_imagenet_pretrained(name): prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX name = name[len("ImageNetPretrained/") :] name = ModelCatalog.C2_IMAGENET_MODELS[name] url = "/".join([prefix, name]) return url @staticmethod def _get_c2_detectron_baseline(name): name = name[len("Caffe2Detectron/COCO/") :] url = ModelCatalog.C2_DETECTRON_MODELS[name] if "keypoint_rcnn" in name: dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS else: dataset = ModelCatalog.C2_DATASET_COCO if "35998355/rpn_R-50-C4_1x" in name: # this one model is somehow different from others .. type = "rpn" else: type = "generalized_rcnn" # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`. url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format( prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset ) return url class ModelCatalogHandler(PathHandler): """ Resolve URL like catalog://. """ PREFIX = "catalog://" def _get_supported_prefixes(self): return [self.PREFIX] def _get_local_path(self, path, **kwargs): logger = logging.getLogger(__name__) catalog_path = ModelCatalog.get(path[len(self.PREFIX) :]) logger.info("Catalog entry {} points to {}".format(path, catalog_path)) return PathManager.get_local_path(catalog_path, **kwargs) def _open(self, path, mode="r", **kwargs): return PathManager.open(self._get_local_path(path), mode, **kwargs) PathManager.register_handler(ModelCatalogHandler()) ================================================ FILE: annotator/oneformer/detectron2/checkpoint/detection_checkpoint.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import os import pickle from urllib.parse import parse_qs, urlparse import torch from fvcore.common.checkpoint import Checkpointer from torch.nn.parallel import DistributedDataParallel import annotator.oneformer.detectron2.utils.comm as comm from annotator.oneformer.detectron2.utils.file_io import PathManager from .c2_model_loading import align_and_update_state_dicts class DetectionCheckpointer(Checkpointer): """ Same as :class:`Checkpointer`, but is able to: 1. handle models in detectron & detectron2 model zoo, and apply conversions for legacy models. 2. correctly load checkpoints that are only available on the master worker """ def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables): is_main_process = comm.is_main_process() super().__init__( model, save_dir, save_to_disk=is_main_process if save_to_disk is None else save_to_disk, **checkpointables, ) self.path_manager = PathManager self._parsed_url_during_load = None def load(self, path, *args, **kwargs): assert self._parsed_url_during_load is None need_sync = False logger = logging.getLogger(__name__) logger.info("[DetectionCheckpointer] Loading from {} ...".format(path)) if path and isinstance(self.model, DistributedDataParallel): path = self.path_manager.get_local_path(path) has_file = os.path.isfile(path) all_has_file = comm.all_gather(has_file) if not all_has_file[0]: raise OSError(f"File {path} not found on main worker.") if not all(all_has_file): logger.warning( f"Not all workers can read checkpoint {path}. " "Training may fail to fully resume." ) # TODO: broadcast the checkpoint file contents from main # worker, and load from it instead. need_sync = True if not has_file: path = None # don't load if not readable if path: parsed_url = urlparse(path) self._parsed_url_during_load = parsed_url path = parsed_url._replace(query="").geturl() # remove query from filename path = self.path_manager.get_local_path(path) self.logger.setLevel('CRITICAL') ret = super().load(path, *args, **kwargs) if need_sync: logger.info("Broadcasting model states from main worker ...") self.model._sync_params_and_buffers() self._parsed_url_during_load = None # reset to None return ret def _load_file(self, filename): if filename.endswith(".pkl"): with PathManager.open(filename, "rb") as f: data = pickle.load(f, encoding="latin1") if "model" in data and "__author__" in data: # file is in Detectron2 model zoo format self.logger.info("Reading a file from '{}'".format(data["__author__"])) return data else: # assume file is from Caffe2 / Detectron1 model zoo if "blobs" in data: # Detection models have "blobs", but ImageNet models don't data = data["blobs"] data = {k: v for k, v in data.items() if not k.endswith("_momentum")} return {"model": data, "__author__": "Caffe2", "matching_heuristics": True} elif filename.endswith(".pyth"): # assume file is from pycls; no one else seems to use the ".pyth" extension with PathManager.open(filename, "rb") as f: data = torch.load(f) assert ( "model_state" in data ), f"Cannot load .pyth file {filename}; pycls checkpoints must contain 'model_state'." model_state = { k: v for k, v in data["model_state"].items() if not k.endswith("num_batches_tracked") } return {"model": model_state, "__author__": "pycls", "matching_heuristics": True} loaded = self._torch_load(filename) if "model" not in loaded: loaded = {"model": loaded} assert self._parsed_url_during_load is not None, "`_load_file` must be called inside `load`" parsed_url = self._parsed_url_during_load queries = parse_qs(parsed_url.query) if queries.pop("matching_heuristics", "False") == ["True"]: loaded["matching_heuristics"] = True if len(queries) > 0: raise ValueError( f"Unsupported query remaining: f{queries}, orginal filename: {parsed_url.geturl()}" ) return loaded def _torch_load(self, f): return super()._load_file(f) def _load_model(self, checkpoint): if checkpoint.get("matching_heuristics", False): self._convert_ndarray_to_tensor(checkpoint["model"]) # convert weights by name-matching heuristics checkpoint["model"] = align_and_update_state_dicts( self.model.state_dict(), checkpoint["model"], c2_conversion=checkpoint.get("__author__", None) == "Caffe2", ) # for non-caffe2 models, use standard ways to load it incompatible = super()._load_model(checkpoint) model_buffers = dict(self.model.named_buffers(recurse=False)) for k in ["pixel_mean", "pixel_std"]: # Ignore missing key message about pixel_mean/std. # Though they may be missing in old checkpoints, they will be correctly # initialized from config anyway. if k in model_buffers: try: incompatible.missing_keys.remove(k) except ValueError: pass for k in incompatible.unexpected_keys[:]: # Ignore unexpected keys about cell anchors. They exist in old checkpoints # but now they are non-persistent buffers and will not be in new checkpoints. if "anchor_generator.cell_anchors" in k: incompatible.unexpected_keys.remove(k) return incompatible ================================================ FILE: annotator/oneformer/detectron2/config/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .compat import downgrade_config, upgrade_config from .config import CfgNode, get_cfg, global_cfg, set_global_cfg, configurable from .instantiate import instantiate from .lazy import LazyCall, LazyConfig __all__ = [ "CfgNode", "get_cfg", "global_cfg", "set_global_cfg", "downgrade_config", "upgrade_config", "configurable", "instantiate", "LazyCall", "LazyConfig", ] from annotator.oneformer.detectron2.utils.env import fixup_module_metadata fixup_module_metadata(__name__, globals(), __all__) del fixup_module_metadata ================================================ FILE: annotator/oneformer/detectron2/config/compat.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. """ Backward compatibility of configs. Instructions to bump version: + It's not needed to bump version if new keys are added. It's only needed when backward-incompatible changes happen (i.e., some existing keys disappear, or the meaning of a key changes) + To bump version, do the following: 1. Increment _C.VERSION in defaults.py 2. Add a converter in this file. Each ConverterVX has a function "upgrade" which in-place upgrades config from X-1 to X, and a function "downgrade" which in-place downgrades config from X to X-1 In each function, VERSION is left unchanged. Each converter assumes that its input has the relevant keys (i.e., the input is not a partial config). 3. Run the tests (test_config.py) to make sure the upgrade & downgrade functions are consistent. """ import logging from typing import List, Optional, Tuple from .config import CfgNode as CN from .defaults import _C __all__ = ["upgrade_config", "downgrade_config"] def upgrade_config(cfg: CN, to_version: Optional[int] = None) -> CN: """ Upgrade a config from its current version to a newer version. Args: cfg (CfgNode): to_version (int): defaults to the latest version. """ cfg = cfg.clone() if to_version is None: to_version = _C.VERSION assert cfg.VERSION <= to_version, "Cannot upgrade from v{} to v{}!".format( cfg.VERSION, to_version ) for k in range(cfg.VERSION, to_version): converter = globals()["ConverterV" + str(k + 1)] converter.upgrade(cfg) cfg.VERSION = k + 1 return cfg def downgrade_config(cfg: CN, to_version: int) -> CN: """ Downgrade a config from its current version to an older version. Args: cfg (CfgNode): to_version (int): Note: A general downgrade of arbitrary configs is not always possible due to the different functionalities in different versions. The purpose of downgrade is only to recover the defaults in old versions, allowing it to load an old partial yaml config. Therefore, the implementation only needs to fill in the default values in the old version when a general downgrade is not possible. """ cfg = cfg.clone() assert cfg.VERSION >= to_version, "Cannot downgrade from v{} to v{}!".format( cfg.VERSION, to_version ) for k in range(cfg.VERSION, to_version, -1): converter = globals()["ConverterV" + str(k)] converter.downgrade(cfg) cfg.VERSION = k - 1 return cfg def guess_version(cfg: CN, filename: str) -> int: """ Guess the version of a partial config where the VERSION field is not specified. Returns the version, or the latest if cannot make a guess. This makes it easier for users to migrate. """ logger = logging.getLogger(__name__) def _has(name: str) -> bool: cur = cfg for n in name.split("."): if n not in cur: return False cur = cur[n] return True # Most users' partial configs have "MODEL.WEIGHT", so guess on it ret = None if _has("MODEL.WEIGHT") or _has("TEST.AUG_ON"): ret = 1 if ret is not None: logger.warning("Config '{}' has no VERSION. Assuming it to be v{}.".format(filename, ret)) else: ret = _C.VERSION logger.warning( "Config '{}' has no VERSION. Assuming it to be compatible with latest v{}.".format( filename, ret ) ) return ret def _rename(cfg: CN, old: str, new: str) -> None: old_keys = old.split(".") new_keys = new.split(".") def _set(key_seq: List[str], val: str) -> None: cur = cfg for k in key_seq[:-1]: if k not in cur: cur[k] = CN() cur = cur[k] cur[key_seq[-1]] = val def _get(key_seq: List[str]) -> CN: cur = cfg for k in key_seq: cur = cur[k] return cur def _del(key_seq: List[str]) -> None: cur = cfg for k in key_seq[:-1]: cur = cur[k] del cur[key_seq[-1]] if len(cur) == 0 and len(key_seq) > 1: _del(key_seq[:-1]) _set(new_keys, _get(old_keys)) _del(old_keys) class _RenameConverter: """ A converter that handles simple rename. """ RENAME: List[Tuple[str, str]] = [] # list of tuples of (old name, new name) @classmethod def upgrade(cls, cfg: CN) -> None: for old, new in cls.RENAME: _rename(cfg, old, new) @classmethod def downgrade(cls, cfg: CN) -> None: for old, new in cls.RENAME[::-1]: _rename(cfg, new, old) class ConverterV1(_RenameConverter): RENAME = [("MODEL.RPN_HEAD.NAME", "MODEL.RPN.HEAD_NAME")] class ConverterV2(_RenameConverter): """ A large bulk of rename, before public release. """ RENAME = [ ("MODEL.WEIGHT", "MODEL.WEIGHTS"), ("MODEL.PANOPTIC_FPN.SEMANTIC_LOSS_SCALE", "MODEL.SEM_SEG_HEAD.LOSS_WEIGHT"), ("MODEL.PANOPTIC_FPN.RPN_LOSS_SCALE", "MODEL.RPN.LOSS_WEIGHT"), ("MODEL.PANOPTIC_FPN.INSTANCE_LOSS_SCALE", "MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT"), ("MODEL.PANOPTIC_FPN.COMBINE_ON", "MODEL.PANOPTIC_FPN.COMBINE.ENABLED"), ( "MODEL.PANOPTIC_FPN.COMBINE_OVERLAP_THRESHOLD", "MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH", ), ( "MODEL.PANOPTIC_FPN.COMBINE_STUFF_AREA_LIMIT", "MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT", ), ( "MODEL.PANOPTIC_FPN.COMBINE_INSTANCES_CONFIDENCE_THRESHOLD", "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH", ), ("MODEL.ROI_HEADS.SCORE_THRESH", "MODEL.ROI_HEADS.SCORE_THRESH_TEST"), ("MODEL.ROI_HEADS.NMS", "MODEL.ROI_HEADS.NMS_THRESH_TEST"), ("MODEL.RETINANET.INFERENCE_SCORE_THRESHOLD", "MODEL.RETINANET.SCORE_THRESH_TEST"), ("MODEL.RETINANET.INFERENCE_TOPK_CANDIDATES", "MODEL.RETINANET.TOPK_CANDIDATES_TEST"), ("MODEL.RETINANET.INFERENCE_NMS_THRESHOLD", "MODEL.RETINANET.NMS_THRESH_TEST"), ("TEST.DETECTIONS_PER_IMG", "TEST.DETECTIONS_PER_IMAGE"), ("TEST.AUG_ON", "TEST.AUG.ENABLED"), ("TEST.AUG_MIN_SIZES", "TEST.AUG.MIN_SIZES"), ("TEST.AUG_MAX_SIZE", "TEST.AUG.MAX_SIZE"), ("TEST.AUG_FLIP", "TEST.AUG.FLIP"), ] @classmethod def upgrade(cls, cfg: CN) -> None: super().upgrade(cfg) if cfg.MODEL.META_ARCHITECTURE == "RetinaNet": _rename( cfg, "MODEL.RETINANET.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS" ) _rename(cfg, "MODEL.RETINANET.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES") del cfg["MODEL"]["RPN"]["ANCHOR_SIZES"] del cfg["MODEL"]["RPN"]["ANCHOR_ASPECT_RATIOS"] else: _rename(cfg, "MODEL.RPN.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS") _rename(cfg, "MODEL.RPN.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES") del cfg["MODEL"]["RETINANET"]["ANCHOR_SIZES"] del cfg["MODEL"]["RETINANET"]["ANCHOR_ASPECT_RATIOS"] del cfg["MODEL"]["RETINANET"]["ANCHOR_STRIDES"] @classmethod def downgrade(cls, cfg: CN) -> None: super().downgrade(cfg) _rename(cfg, "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS", "MODEL.RPN.ANCHOR_ASPECT_RATIOS") _rename(cfg, "MODEL.ANCHOR_GENERATOR.SIZES", "MODEL.RPN.ANCHOR_SIZES") cfg.MODEL.RETINANET.ANCHOR_ASPECT_RATIOS = cfg.MODEL.RPN.ANCHOR_ASPECT_RATIOS cfg.MODEL.RETINANET.ANCHOR_SIZES = cfg.MODEL.RPN.ANCHOR_SIZES cfg.MODEL.RETINANET.ANCHOR_STRIDES = [] # this is not used anywhere in any version ================================================ FILE: annotator/oneformer/detectron2/config/config.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import functools import inspect import logging from fvcore.common.config import CfgNode as _CfgNode from annotator.oneformer.detectron2.utils.file_io import PathManager class CfgNode(_CfgNode): """ The same as `fvcore.common.config.CfgNode`, but different in: 1. Use unsafe yaml loading by default. Note that this may lead to arbitrary code execution: you must not load a config file from untrusted sources before manually inspecting the content of the file. 2. Support config versioning. When attempting to merge an old config, it will convert the old config automatically. .. automethod:: clone .. automethod:: freeze .. automethod:: defrost .. automethod:: is_frozen .. automethod:: load_yaml_with_base .. automethod:: merge_from_list .. automethod:: merge_from_other_cfg """ @classmethod def _open_cfg(cls, filename): return PathManager.open(filename, "r") # Note that the default value of allow_unsafe is changed to True def merge_from_file(self, cfg_filename: str, allow_unsafe: bool = True) -> None: """ Load content from the given config file and merge it into self. Args: cfg_filename: config filename allow_unsafe: allow unsafe yaml syntax """ assert PathManager.isfile(cfg_filename), f"Config file '{cfg_filename}' does not exist!" loaded_cfg = self.load_yaml_with_base(cfg_filename, allow_unsafe=allow_unsafe) loaded_cfg = type(self)(loaded_cfg) # defaults.py needs to import CfgNode from .defaults import _C latest_ver = _C.VERSION assert ( latest_ver == self.VERSION ), "CfgNode.merge_from_file is only allowed on a config object of latest version!" logger = logging.getLogger(__name__) loaded_ver = loaded_cfg.get("VERSION", None) if loaded_ver is None: from .compat import guess_version loaded_ver = guess_version(loaded_cfg, cfg_filename) assert loaded_ver <= self.VERSION, "Cannot merge a v{} config into a v{} config.".format( loaded_ver, self.VERSION ) if loaded_ver == self.VERSION: self.merge_from_other_cfg(loaded_cfg) else: # compat.py needs to import CfgNode from .compat import upgrade_config, downgrade_config logger.warning( "Loading an old v{} config file '{}' by automatically upgrading to v{}. " "See docs/CHANGELOG.md for instructions to update your files.".format( loaded_ver, cfg_filename, self.VERSION ) ) # To convert, first obtain a full config at an old version old_self = downgrade_config(self, to_version=loaded_ver) old_self.merge_from_other_cfg(loaded_cfg) new_config = upgrade_config(old_self) self.clear() self.update(new_config) def dump(self, *args, **kwargs): """ Returns: str: a yaml string representation of the config """ # to make it show up in docs return super().dump(*args, **kwargs) global_cfg = CfgNode() def get_cfg() -> CfgNode: """ Get a copy of the default config. Returns: a detectron2 CfgNode instance. """ from .defaults import _C return _C.clone() def set_global_cfg(cfg: CfgNode) -> None: """ Let the global config point to the given cfg. Assume that the given "cfg" has the key "KEY", after calling `set_global_cfg(cfg)`, the key can be accessed by: :: from annotator.oneformer.detectron2.config import global_cfg print(global_cfg.KEY) By using a hacky global config, you can access these configs anywhere, without having to pass the config object or the values deep into the code. This is a hacky feature introduced for quick prototyping / research exploration. """ global global_cfg global_cfg.clear() global_cfg.update(cfg) def configurable(init_func=None, *, from_config=None): """ Decorate a function or a class's __init__ method so that it can be called with a :class:`CfgNode` object using a :func:`from_config` function that translates :class:`CfgNode` to arguments. Examples: :: # Usage 1: Decorator on __init__: class A: @configurable def __init__(self, a, b=2, c=3): pass @classmethod def from_config(cls, cfg): # 'cfg' must be the first argument # Returns kwargs to be passed to __init__ return {"a": cfg.A, "b": cfg.B} a1 = A(a=1, b=2) # regular construction a2 = A(cfg) # construct with a cfg a3 = A(cfg, b=3, c=4) # construct with extra overwrite # Usage 2: Decorator on any function. Needs an extra from_config argument: @configurable(from_config=lambda cfg: {"a: cfg.A, "b": cfg.B}) def a_func(a, b=2, c=3): pass a1 = a_func(a=1, b=2) # regular call a2 = a_func(cfg) # call with a cfg a3 = a_func(cfg, b=3, c=4) # call with extra overwrite Args: init_func (callable): a class's ``__init__`` method in usage 1. The class must have a ``from_config`` classmethod which takes `cfg` as the first argument. from_config (callable): the from_config function in usage 2. It must take `cfg` as its first argument. """ if init_func is not None: assert ( inspect.isfunction(init_func) and from_config is None and init_func.__name__ == "__init__" ), "Incorrect use of @configurable. Check API documentation for examples." @functools.wraps(init_func) def wrapped(self, *args, **kwargs): try: from_config_func = type(self).from_config except AttributeError as e: raise AttributeError( "Class with @configurable must have a 'from_config' classmethod." ) from e if not inspect.ismethod(from_config_func): raise TypeError("Class with @configurable must have a 'from_config' classmethod.") if _called_with_cfg(*args, **kwargs): explicit_args = _get_args_from_config(from_config_func, *args, **kwargs) init_func(self, **explicit_args) else: init_func(self, *args, **kwargs) return wrapped else: if from_config is None: return configurable # @configurable() is made equivalent to @configurable assert inspect.isfunction( from_config ), "from_config argument of configurable must be a function!" def wrapper(orig_func): @functools.wraps(orig_func) def wrapped(*args, **kwargs): if _called_with_cfg(*args, **kwargs): explicit_args = _get_args_from_config(from_config, *args, **kwargs) return orig_func(**explicit_args) else: return orig_func(*args, **kwargs) wrapped.from_config = from_config return wrapped return wrapper def _get_args_from_config(from_config_func, *args, **kwargs): """ Use `from_config` to obtain explicit arguments. Returns: dict: arguments to be used for cls.__init__ """ signature = inspect.signature(from_config_func) if list(signature.parameters.keys())[0] != "cfg": if inspect.isfunction(from_config_func): name = from_config_func.__name__ else: name = f"{from_config_func.__self__}.from_config" raise TypeError(f"{name} must take 'cfg' as the first argument!") support_var_arg = any( param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD] for param in signature.parameters.values() ) if support_var_arg: # forward all arguments to from_config, if from_config accepts them ret = from_config_func(*args, **kwargs) else: # forward supported arguments to from_config supported_arg_names = set(signature.parameters.keys()) extra_kwargs = {} for name in list(kwargs.keys()): if name not in supported_arg_names: extra_kwargs[name] = kwargs.pop(name) ret = from_config_func(*args, **kwargs) # forward the other arguments to __init__ ret.update(extra_kwargs) return ret def _called_with_cfg(*args, **kwargs): """ Returns: bool: whether the arguments contain CfgNode and should be considered forwarded to from_config. """ from omegaconf import DictConfig if len(args) and isinstance(args[0], (_CfgNode, DictConfig)): return True if isinstance(kwargs.pop("cfg", None), (_CfgNode, DictConfig)): return True # `from_config`'s first argument is forced to be "cfg". # So the above check covers all cases. return False ================================================ FILE: annotator/oneformer/detectron2/config/defaults.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .config import CfgNode as CN # NOTE: given the new config system # (https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html), # we will stop adding new functionalities to default CfgNode. # ----------------------------------------------------------------------------- # Convention about Training / Test specific parameters # ----------------------------------------------------------------------------- # Whenever an argument can be either used for training or for testing, the # corresponding name will be post-fixed by a _TRAIN for a training parameter, # or _TEST for a test-specific parameter. # For example, the number of images during training will be # IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be # IMAGES_PER_BATCH_TEST # ----------------------------------------------------------------------------- # Config definition # ----------------------------------------------------------------------------- _C = CN() # The version number, to upgrade from old configs to new ones if any # changes happen. It's recommended to keep a VERSION in your config file. _C.VERSION = 2 _C.MODEL = CN() _C.MODEL.LOAD_PROPOSALS = False _C.MODEL.MASK_ON = False _C.MODEL.KEYPOINT_ON = False _C.MODEL.DEVICE = "cuda" _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" # Path (a file path, or URL like detectron2://.., https://..) to a checkpoint file # to be loaded to the model. You can find available models in the model zoo. _C.MODEL.WEIGHTS = "" # Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR). # To train on images of different number of channels, just set different mean & std. # Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675] _C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675] # When using pre-trained models in Detectron1 or any MSRA models, # std has been absorbed into its conv1 weights, so the std needs to be set 1. # Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std) _C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0] # ----------------------------------------------------------------------------- # INPUT # ----------------------------------------------------------------------------- _C.INPUT = CN() # By default, {MIN,MAX}_SIZE options are used in transforms.ResizeShortestEdge. # Please refer to ResizeShortestEdge for detailed definition. # Size of the smallest side of the image during training _C.INPUT.MIN_SIZE_TRAIN = (800,) # Sample size of smallest side by choice or random selection from range give by # INPUT.MIN_SIZE_TRAIN _C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice" # Maximum size of the side of the image during training _C.INPUT.MAX_SIZE_TRAIN = 1333 # Size of the smallest side of the image during testing. Set to zero to disable resize in testing. _C.INPUT.MIN_SIZE_TEST = 800 # Maximum size of the side of the image during testing _C.INPUT.MAX_SIZE_TEST = 1333 # Mode for flipping images used in data augmentation during training # choose one of ["horizontal, "vertical", "none"] _C.INPUT.RANDOM_FLIP = "horizontal" # `True` if cropping is used for data augmentation during training _C.INPUT.CROP = CN({"ENABLED": False}) # Cropping type. See documentation of `detectron2.data.transforms.RandomCrop` for explanation. _C.INPUT.CROP.TYPE = "relative_range" # Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of # pixels if CROP.TYPE is "absolute" _C.INPUT.CROP.SIZE = [0.9, 0.9] # Whether the model needs RGB, YUV, HSV etc. # Should be one of the modes defined here, as we use PIL to read the image: # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes # with BGR being the one exception. One can set image format to BGR, we will # internally use RGB for conversion and flip the channels over _C.INPUT.FORMAT = "BGR" # The ground truth mask format that the model will use. # Mask R-CNN supports either "polygon" or "bitmask" as ground truth. _C.INPUT.MASK_FORMAT = "polygon" # alternative: "bitmask" # ----------------------------------------------------------------------------- # Dataset # ----------------------------------------------------------------------------- _C.DATASETS = CN() # List of the dataset names for training. Must be registered in DatasetCatalog # Samples from these datasets will be merged and used as one dataset. _C.DATASETS.TRAIN = () # List of the pre-computed proposal files for training, which must be consistent # with datasets listed in DATASETS.TRAIN. _C.DATASETS.PROPOSAL_FILES_TRAIN = () # Number of top scoring precomputed proposals to keep for training _C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000 # List of the dataset names for testing. Must be registered in DatasetCatalog _C.DATASETS.TEST = () # List of the pre-computed proposal files for test, which must be consistent # with datasets listed in DATASETS.TEST. _C.DATASETS.PROPOSAL_FILES_TEST = () # Number of top scoring precomputed proposals to keep for test _C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000 # ----------------------------------------------------------------------------- # DataLoader # ----------------------------------------------------------------------------- _C.DATALOADER = CN() # Number of data loading threads _C.DATALOADER.NUM_WORKERS = 4 # If True, each batch should contain only images for which the aspect ratio # is compatible. This groups portrait images together, and landscape images # are not batched with portrait images. _C.DATALOADER.ASPECT_RATIO_GROUPING = True # Options: TrainingSampler, RepeatFactorTrainingSampler _C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler" # Repeat threshold for RepeatFactorTrainingSampler _C.DATALOADER.REPEAT_THRESHOLD = 0.0 # Tf True, when working on datasets that have instance annotations, the # training dataloader will filter out images without associated annotations _C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True # ---------------------------------------------------------------------------- # # Backbone options # ---------------------------------------------------------------------------- # _C.MODEL.BACKBONE = CN() _C.MODEL.BACKBONE.NAME = "build_resnet_backbone" # Freeze the first several stages so they are not trained. # There are 5 stages in ResNet. The first is a convolution, and the following # stages are each group of residual blocks. _C.MODEL.BACKBONE.FREEZE_AT = 2 # ---------------------------------------------------------------------------- # # FPN options # ---------------------------------------------------------------------------- # _C.MODEL.FPN = CN() # Names of the input feature maps to be used by FPN # They must have contiguous power of 2 strides # e.g., ["res2", "res3", "res4", "res5"] _C.MODEL.FPN.IN_FEATURES = [] _C.MODEL.FPN.OUT_CHANNELS = 256 # Options: "" (no norm), "GN" _C.MODEL.FPN.NORM = "" # Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg" _C.MODEL.FPN.FUSE_TYPE = "sum" # ---------------------------------------------------------------------------- # # Proposal generator options # ---------------------------------------------------------------------------- # _C.MODEL.PROPOSAL_GENERATOR = CN() # Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals" _C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN" # Proposal height and width both need to be greater than MIN_SIZE # (a the scale used during training or inference) _C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0 # ---------------------------------------------------------------------------- # # Anchor generator options # ---------------------------------------------------------------------------- # _C.MODEL.ANCHOR_GENERATOR = CN() # The generator can be any name in the ANCHOR_GENERATOR registry _C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator" # Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input. # Format: list[list[float]]. SIZES[i] specifies the list of sizes to use for # IN_FEATURES[i]; len(SIZES) must be equal to len(IN_FEATURES) or 1. # When len(SIZES) == 1, SIZES[0] is used for all IN_FEATURES. _C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]] # Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect # ratios are generated by an anchor generator. # Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W) # to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true, # or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used # for all IN_FEATURES. _C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]] # Anchor angles. # list[list[float]], the angle in degrees, for each input feature map. # ANGLES[i] specifies the list of angles for IN_FEATURES[i]. _C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]] # Relative offset between the center of the first anchor and the top-left corner of the image # Value has to be in [0, 1). Recommend to use 0.5, which means half stride. # The value is not expected to affect model accuracy. _C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0 # ---------------------------------------------------------------------------- # # RPN options # ---------------------------------------------------------------------------- # _C.MODEL.RPN = CN() _C.MODEL.RPN.HEAD_NAME = "StandardRPNHead" # used by RPN_HEAD_REGISTRY # Names of the input feature maps to be used by RPN # e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN _C.MODEL.RPN.IN_FEATURES = ["res4"] # Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels # Set to -1 or a large value, e.g. 100000, to disable pruning anchors _C.MODEL.RPN.BOUNDARY_THRESH = -1 # IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD] # Minimum overlap required between an anchor and ground-truth box for the # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD # ==> positive RPN example: 1) # Maximum overlap allowed between an anchor and ground-truth box for the # (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD # ==> negative RPN example: 0) # Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD) # are ignored (-1) _C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7] _C.MODEL.RPN.IOU_LABELS = [0, -1, 1] # Number of regions per image used to train RPN _C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 # Target fraction of foreground (positive) examples per RPN minibatch _C.MODEL.RPN.POSITIVE_FRACTION = 0.5 # Options are: "smooth_l1", "giou", "diou", "ciou" _C.MODEL.RPN.BBOX_REG_LOSS_TYPE = "smooth_l1" _C.MODEL.RPN.BBOX_REG_LOSS_WEIGHT = 1.0 # Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets _C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0) # The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1. _C.MODEL.RPN.SMOOTH_L1_BETA = 0.0 _C.MODEL.RPN.LOSS_WEIGHT = 1.0 # Number of top scoring RPN proposals to keep before applying NMS # When FPN is used, this is *per FPN level* (not total) _C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000 _C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000 # Number of top scoring RPN proposals to keep after applying NMS # When FPN is used, this limit is applied per level and then again to the union # of proposals from all levels # NOTE: When FPN is used, the meaning of this config is different from Detectron1. # It means per-batch topk in Detectron1, but per-image topk here. # See the "find_top_rpn_proposals" function for details. _C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000 _C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000 # NMS threshold used on RPN proposals _C.MODEL.RPN.NMS_THRESH = 0.7 # Set this to -1 to use the same number of output channels as input channels. _C.MODEL.RPN.CONV_DIMS = [-1] # ---------------------------------------------------------------------------- # # ROI HEADS options # ---------------------------------------------------------------------------- # _C.MODEL.ROI_HEADS = CN() _C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads" # Number of foreground classes _C.MODEL.ROI_HEADS.NUM_CLASSES = 80 # Names of the input feature maps to be used by ROI heads # Currently all heads (box, mask, ...) use the same input feature map list # e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN _C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"] # IOU overlap ratios [IOU_THRESHOLD] # Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD) # Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD) _C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5] _C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1] # RoI minibatch size *per image* (number of regions of interest [ROIs]) during training # Total number of RoIs per training minibatch = # ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH # E.g., a common configuration is: 512 * 16 = 8192 _C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0) _C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 # Only used on test mode # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to # balance obtaining high recall with not having too many low precision # detections that will slow down inference post processing steps (like NMS) # A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down # inference. _C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05 # Overlap threshold used for non-maximum suppression (suppress boxes with # IoU >= this threshold) _C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5 # If True, augment proposals with ground-truth boxes before sampling proposals to # train ROI heads. _C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True # ---------------------------------------------------------------------------- # # Box Head # ---------------------------------------------------------------------------- # _C.MODEL.ROI_BOX_HEAD = CN() # C4 don't use head name option # Options for non-C4 models: FastRCNNConvFCHead, _C.MODEL.ROI_BOX_HEAD.NAME = "" # Options are: "smooth_l1", "giou", "diou", "ciou" _C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE = "smooth_l1" # The final scaling coefficient on the box regression loss, used to balance the magnitude of its # gradients with other losses in the model. See also `MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT`. _C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT = 1.0 # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets # These are empirically chosen to approximately lead to unit variance targets _C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0) # The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1. _C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0 _C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 # Type of pooling operation applied to the incoming feature map for each RoI _C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2" _C.MODEL.ROI_BOX_HEAD.NUM_FC = 0 # Hidden layer dimension for FC layers in the RoI box head _C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024 _C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0 # Channel dimension for Conv layers in the RoI box head _C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256 # Normalization method for the convolution layers. # Options: "" (no norm), "GN", "SyncBN". _C.MODEL.ROI_BOX_HEAD.NORM = "" # Whether to use class agnostic for bbox regression _C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False # If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes. _C.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False # Federated loss can be used to improve the training of LVIS _C.MODEL.ROI_BOX_HEAD.USE_FED_LOSS = False # Sigmoid cross entrophy is used with federated loss _C.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE = False # The power value applied to image_count when calcualting frequency weight _C.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER = 0.5 # Number of classes to keep in total _C.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES = 50 # ---------------------------------------------------------------------------- # # Cascaded Box Head # ---------------------------------------------------------------------------- # _C.MODEL.ROI_BOX_CASCADE_HEAD = CN() # The number of cascade stages is implicitly defined by the length of the following two configs. _C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = ( (10.0, 10.0, 5.0, 5.0), (20.0, 20.0, 10.0, 10.0), (30.0, 30.0, 15.0, 15.0), ) _C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7) # ---------------------------------------------------------------------------- # # Mask Head # ---------------------------------------------------------------------------- # _C.MODEL.ROI_MASK_HEAD = CN() _C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead" _C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0 _C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0 # The number of convs in the mask head _C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256 # Normalization method for the convolution layers. # Options: "" (no norm), "GN", "SyncBN". _C.MODEL.ROI_MASK_HEAD.NORM = "" # Whether to use class agnostic for mask prediction _C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False # Type of pooling operation applied to the incoming feature map for each RoI _C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2" # ---------------------------------------------------------------------------- # # Keypoint Head # ---------------------------------------------------------------------------- # _C.MODEL.ROI_KEYPOINT_HEAD = CN() _C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead" _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14 _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0 _C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8)) _C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17 # 17 is the number of keypoints in COCO. # Images with too few (or no) keypoints are excluded from training. _C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1 # Normalize by the total number of visible keypoints in the minibatch if True. # Otherwise, normalize by the total number of keypoints that could ever exist # in the minibatch. # The keypoint softmax loss is only calculated on visible keypoints. # Since the number of visible keypoints can vary significantly between # minibatches, this has the effect of up-weighting the importance of # minibatches with few visible keypoints. (Imagine the extreme case of # only one visible keypoint versus N: in the case of N, each one # contributes 1/N to the gradient compared to the single keypoint # determining the gradient direction). Instead, we can normalize the # loss by the total number of keypoints, if it were the case that all # keypoints were visible in a full minibatch. (Returning to the example, # this means that the one visible keypoint contributes as much as each # of the N keypoints.) _C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True # Multi-task loss weight to use for keypoints # Recommended values: # - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True # - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False _C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0 # Type of pooling operation applied to the incoming feature map for each RoI _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2" # ---------------------------------------------------------------------------- # # Semantic Segmentation Head # ---------------------------------------------------------------------------- # _C.MODEL.SEM_SEG_HEAD = CN() _C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead" _C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"] # Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for # the correposnding pixel. _C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255 # Number of classes in the semantic segmentation head _C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54 # Number of channels in the 3x3 convs inside semantic-FPN heads. _C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128 # Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride. _C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4 # Normalization method for the convolution layers. Options: "" (no norm), "GN". _C.MODEL.SEM_SEG_HEAD.NORM = "GN" _C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0 _C.MODEL.PANOPTIC_FPN = CN() # Scaling of all losses from instance detection / segmentation head. _C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0 # options when combining instance & semantic segmentation outputs _C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True}) # "COMBINE.ENABLED" is deprecated & not used _C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5 _C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096 _C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5 # ---------------------------------------------------------------------------- # # RetinaNet Head # ---------------------------------------------------------------------------- # _C.MODEL.RETINANET = CN() # This is the number of foreground classes. _C.MODEL.RETINANET.NUM_CLASSES = 80 _C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"] # Convolutions to use in the cls and bbox tower # NOTE: this doesn't include the last conv for logits _C.MODEL.RETINANET.NUM_CONVS = 4 # IoU overlap ratio [bg, fg] for labeling anchors. # Anchors with < bg are labeled negative (0) # Anchors with >= bg and < fg are ignored (-1) # Anchors with >= fg are labeled positive (1) _C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5] _C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1] # Prior prob for rare case (i.e. foreground) at the beginning of training. # This is used to set the bias for the logits layer of the classifier subnet. # This improves training stability in the case of heavy class imbalance. _C.MODEL.RETINANET.PRIOR_PROB = 0.01 # Inference cls score threshold, only anchors with score > INFERENCE_TH are # considered for inference (to improve speed) _C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05 # Select topk candidates before NMS _C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000 _C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5 # Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets _C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0) # Loss parameters _C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0 _C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25 _C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1 # Options are: "smooth_l1", "giou", "diou", "ciou" _C.MODEL.RETINANET.BBOX_REG_LOSS_TYPE = "smooth_l1" # One of BN, SyncBN, FrozenBN, GN # Only supports GN until unshared norm is implemented _C.MODEL.RETINANET.NORM = "" # ---------------------------------------------------------------------------- # # ResNe[X]t options (ResNets = {ResNet, ResNeXt} # Note that parts of a resnet may be used for both the backbone and the head # These options apply to both # ---------------------------------------------------------------------------- # _C.MODEL.RESNETS = CN() _C.MODEL.RESNETS.DEPTH = 50 _C.MODEL.RESNETS.OUT_FEATURES = ["res4"] # res4 for C4 backbone, res2..5 for FPN backbone # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt _C.MODEL.RESNETS.NUM_GROUPS = 1 # Options: FrozenBN, GN, "SyncBN", "BN" _C.MODEL.RESNETS.NORM = "FrozenBN" # Baseline width of each group. # Scaling this parameters will scale the width of all bottleneck layers. _C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 # Place the stride 2 conv on the 1x1 filter # Use True only for the original MSRA ResNet; use False for C2 and Torch models _C.MODEL.RESNETS.STRIDE_IN_1X1 = True # Apply dilation in stage "res5" _C.MODEL.RESNETS.RES5_DILATION = 1 # Output width of res2. Scaling this parameters will scale the width of all 1x1 convs in ResNet # For R18 and R34, this needs to be set to 64 _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 # Apply Deformable Convolution in stages # Specify if apply deform_conv on Res2, Res3, Res4, Res5 _C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False] # Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168); # Use False for DeformableV1. _C.MODEL.RESNETS.DEFORM_MODULATED = False # Number of groups in deformable conv. _C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1 # ---------------------------------------------------------------------------- # # Solver # ---------------------------------------------------------------------------- # _C.SOLVER = CN() # Options: WarmupMultiStepLR, WarmupCosineLR. # See detectron2/solver/build.py for definition. _C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR" _C.SOLVER.MAX_ITER = 40000 _C.SOLVER.BASE_LR = 0.001 # The end lr, only used by WarmupCosineLR _C.SOLVER.BASE_LR_END = 0.0 _C.SOLVER.MOMENTUM = 0.9 _C.SOLVER.NESTEROV = False _C.SOLVER.WEIGHT_DECAY = 0.0001 # The weight decay that's applied to parameters of normalization layers # (typically the affine transformation) _C.SOLVER.WEIGHT_DECAY_NORM = 0.0 _C.SOLVER.GAMMA = 0.1 # The iteration number to decrease learning rate by GAMMA. _C.SOLVER.STEPS = (30000,) # Number of decays in WarmupStepWithFixedGammaLR schedule _C.SOLVER.NUM_DECAYS = 3 _C.SOLVER.WARMUP_FACTOR = 1.0 / 1000 _C.SOLVER.WARMUP_ITERS = 1000 _C.SOLVER.WARMUP_METHOD = "linear" # Whether to rescale the interval for the learning schedule after warmup _C.SOLVER.RESCALE_INTERVAL = False # Save a checkpoint after every this number of iterations _C.SOLVER.CHECKPOINT_PERIOD = 5000 # Number of images per batch across all machines. This is also the number # of training images per step (i.e. per iteration). If we use 16 GPUs # and IMS_PER_BATCH = 32, each GPU will see 2 images per batch. # May be adjusted automatically if REFERENCE_WORLD_SIZE is set. _C.SOLVER.IMS_PER_BATCH = 16 # The reference number of workers (GPUs) this config is meant to train with. # It takes no effect when set to 0. # With a non-zero value, it will be used by DefaultTrainer to compute a desired # per-worker batch size, and then scale the other related configs (total batch size, # learning rate, etc) to match the per-worker batch size. # See documentation of `DefaultTrainer.auto_scale_workers` for details: _C.SOLVER.REFERENCE_WORLD_SIZE = 0 # Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for # biases. This is not useful (at least for recent models). You should avoid # changing these and they exist only to reproduce Detectron v1 training if # desired. _C.SOLVER.BIAS_LR_FACTOR = 1.0 _C.SOLVER.WEIGHT_DECAY_BIAS = None # None means following WEIGHT_DECAY # Gradient clipping _C.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False}) # Type of gradient clipping, currently 2 values are supported: # - "value": the absolute values of elements of each gradients are clipped # - "norm": the norm of the gradient for each parameter is clipped thus # affecting all elements in the parameter _C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value" # Maximum absolute value used for clipping gradients _C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0 # Floating point number p for L-p norm to be used with the "norm" # gradient clipping type; for L-inf, please specify .inf _C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0 # Enable automatic mixed precision for training # Note that this does not change model's inference behavior. # To use AMP in inference, run inference under autocast() _C.SOLVER.AMP = CN({"ENABLED": False}) # ---------------------------------------------------------------------------- # # Specific test options # ---------------------------------------------------------------------------- # _C.TEST = CN() # For end-to-end tests to verify the expected accuracy. # Each item is [task, metric, value, tolerance] # e.g.: [['bbox', 'AP', 38.5, 0.2]] _C.TEST.EXPECTED_RESULTS = [] # The period (in terms of steps) to evaluate the model during training. # Set to 0 to disable. _C.TEST.EVAL_PERIOD = 0 # The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval # When empty, it will use the defaults in COCO. # Otherwise it should be a list[float] with the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. _C.TEST.KEYPOINT_OKS_SIGMAS = [] # Maximum number of detections to return per image during inference (100 is # based on the limit established for the COCO dataset). _C.TEST.DETECTIONS_PER_IMAGE = 100 _C.TEST.AUG = CN({"ENABLED": False}) _C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200) _C.TEST.AUG.MAX_SIZE = 4000 _C.TEST.AUG.FLIP = True _C.TEST.PRECISE_BN = CN({"ENABLED": False}) _C.TEST.PRECISE_BN.NUM_ITER = 200 # ---------------------------------------------------------------------------- # # Misc options # ---------------------------------------------------------------------------- # # Directory where output files are written _C.OUTPUT_DIR = "./output" # Set seed to negative to fully randomize everything. # Set seed to positive to use a fixed seed. Note that a fixed seed increases # reproducibility but does not guarantee fully deterministic behavior. # Disabling all parallelism further increases reproducibility. _C.SEED = -1 # Benchmark different cudnn algorithms. # If input images have very different sizes, this option will have large overhead # for about 10k iterations. It usually hurts total time, but can benefit for certain models. # If input images have the same or similar sizes, benchmark is often helpful. _C.CUDNN_BENCHMARK = False # The period (in terms of steps) for minibatch visualization at train time. # Set to 0 to disable. _C.VIS_PERIOD = 0 # global config is for quick hack purposes. # You can set them in command line or config files, # and access it with: # # from annotator.oneformer.detectron2.config import global_cfg # print(global_cfg.HACK) # # Do not commit any configs into it. _C.GLOBAL = CN() _C.GLOBAL.HACK = 1.0 ================================================ FILE: annotator/oneformer/detectron2/config/instantiate.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import collections.abc as abc import dataclasses import logging from typing import Any from annotator.oneformer.detectron2.utils.registry import _convert_target_to_string, locate __all__ = ["dump_dataclass", "instantiate"] def dump_dataclass(obj: Any): """ Dump a dataclass recursively into a dict that can be later instantiated. Args: obj: a dataclass object Returns: dict """ assert dataclasses.is_dataclass(obj) and not isinstance( obj, type ), "dump_dataclass() requires an instance of a dataclass." ret = {"_target_": _convert_target_to_string(type(obj))} for f in dataclasses.fields(obj): v = getattr(obj, f.name) if dataclasses.is_dataclass(v): v = dump_dataclass(v) if isinstance(v, (list, tuple)): v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v] ret[f.name] = v return ret def instantiate(cfg): """ Recursively instantiate objects defined in dictionaries by "_target_" and arguments. Args: cfg: a dict-like object with "_target_" that defines the caller, and other keys that define the arguments Returns: object instantiated by cfg """ from omegaconf import ListConfig, DictConfig, OmegaConf if isinstance(cfg, ListConfig): lst = [instantiate(x) for x in cfg] return ListConfig(lst, flags={"allow_objects": True}) if isinstance(cfg, list): # Specialize for list, because many classes take # list[objects] as arguments, such as ResNet, DatasetMapper return [instantiate(x) for x in cfg] # If input is a DictConfig backed by dataclasses (i.e. omegaconf's structured config), # instantiate it to the actual dataclass. if isinstance(cfg, DictConfig) and dataclasses.is_dataclass(cfg._metadata.object_type): return OmegaConf.to_object(cfg) if isinstance(cfg, abc.Mapping) and "_target_" in cfg: # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all, # but faster: https://github.com/facebookresearch/hydra/issues/1200 cfg = {k: instantiate(v) for k, v in cfg.items()} cls = cfg.pop("_target_") cls = instantiate(cls) if isinstance(cls, str): cls_name = cls cls = locate(cls_name) assert cls is not None, cls_name else: try: cls_name = cls.__module__ + "." + cls.__qualname__ except Exception: # target could be anything, so the above could fail cls_name = str(cls) assert callable(cls), f"_target_ {cls} does not define a callable object" try: return cls(**cfg) except TypeError: logger = logging.getLogger(__name__) logger.error(f"Error when instantiating {cls_name}!") raise return cfg # return as-is if don't know what to do ================================================ FILE: annotator/oneformer/detectron2/config/lazy.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import ast import builtins import collections.abc as abc import importlib import inspect import logging import os import uuid from contextlib import contextmanager from copy import deepcopy from dataclasses import is_dataclass from typing import List, Tuple, Union import yaml from omegaconf import DictConfig, ListConfig, OmegaConf, SCMode from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.registry import _convert_target_to_string __all__ = ["LazyCall", "LazyConfig"] class LazyCall: """ Wrap a callable so that when it's called, the call will not be executed, but returns a dict that describes the call. LazyCall object has to be called with only keyword arguments. Positional arguments are not yet supported. Examples: :: from annotator.oneformer.detectron2.config import instantiate, LazyCall layer_cfg = LazyCall(nn.Conv2d)(in_channels=32, out_channels=32) layer_cfg.out_channels = 64 # can edit it afterwards layer = instantiate(layer_cfg) """ def __init__(self, target): if not (callable(target) or isinstance(target, (str, abc.Mapping))): raise TypeError( f"target of LazyCall must be a callable or defines a callable! Got {target}" ) self._target = target def __call__(self, **kwargs): if is_dataclass(self._target): # omegaconf object cannot hold dataclass type # https://github.com/omry/omegaconf/issues/784 target = _convert_target_to_string(self._target) else: target = self._target kwargs["_target_"] = target return DictConfig(content=kwargs, flags={"allow_objects": True}) def _visit_dict_config(cfg, func): """ Apply func recursively to all DictConfig in cfg. """ if isinstance(cfg, DictConfig): func(cfg) for v in cfg.values(): _visit_dict_config(v, func) elif isinstance(cfg, ListConfig): for v in cfg: _visit_dict_config(v, func) def _validate_py_syntax(filename): # see also https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py with PathManager.open(filename, "r") as f: content = f.read() try: ast.parse(content) except SyntaxError as e: raise SyntaxError(f"Config file {filename} has syntax error!") from e def _cast_to_config(obj): # if given a dict, return DictConfig instead if isinstance(obj, dict): return DictConfig(obj, flags={"allow_objects": True}) return obj _CFG_PACKAGE_NAME = "detectron2._cfg_loader" """ A namespace to put all imported config into. """ def _random_package_name(filename): # generate a random package name when loading config files return _CFG_PACKAGE_NAME + str(uuid.uuid4())[:4] + "." + os.path.basename(filename) @contextmanager def _patch_import(): """ Enhance relative import statements in config files, so that they: 1. locate files purely based on relative location, regardless of packages. e.g. you can import file without having __init__ 2. do not cache modules globally; modifications of module states has no side effect 3. support other storage system through PathManager, so config files can be in the cloud 4. imported dict are turned into omegaconf.DictConfig automatically """ old_import = builtins.__import__ def find_relative_file(original_file, relative_import_path, level): # NOTE: "from . import x" is not handled. Because then it's unclear # if such import should produce `x` as a python module or DictConfig. # This can be discussed further if needed. relative_import_err = """ Relative import of directories is not allowed within config files. Within a config file, relative import can only import other config files. """.replace( "\n", " " ) if not len(relative_import_path): raise ImportError(relative_import_err) cur_file = os.path.dirname(original_file) for _ in range(level - 1): cur_file = os.path.dirname(cur_file) cur_name = relative_import_path.lstrip(".") for part in cur_name.split("."): cur_file = os.path.join(cur_file, part) if not cur_file.endswith(".py"): cur_file += ".py" if not PathManager.isfile(cur_file): cur_file_no_suffix = cur_file[: -len(".py")] if PathManager.isdir(cur_file_no_suffix): raise ImportError(f"Cannot import from {cur_file_no_suffix}." + relative_import_err) else: raise ImportError( f"Cannot import name {relative_import_path} from " f"{original_file}: {cur_file} does not exist." ) return cur_file def new_import(name, globals=None, locals=None, fromlist=(), level=0): if ( # Only deal with relative imports inside config files level != 0 and globals is not None and (globals.get("__package__", "") or "").startswith(_CFG_PACKAGE_NAME) ): cur_file = find_relative_file(globals["__file__"], name, level) _validate_py_syntax(cur_file) spec = importlib.machinery.ModuleSpec( _random_package_name(cur_file), None, origin=cur_file ) module = importlib.util.module_from_spec(spec) module.__file__ = cur_file with PathManager.open(cur_file) as f: content = f.read() exec(compile(content, cur_file, "exec"), module.__dict__) for name in fromlist: # turn imported dict into DictConfig automatically val = _cast_to_config(module.__dict__[name]) module.__dict__[name] = val return module return old_import(name, globals, locals, fromlist=fromlist, level=level) builtins.__import__ = new_import yield new_import builtins.__import__ = old_import class LazyConfig: """ Provide methods to save, load, and overrides an omegaconf config object which may contain definition of lazily-constructed objects. """ @staticmethod def load_rel(filename: str, keys: Union[None, str, Tuple[str, ...]] = None): """ Similar to :meth:`load()`, but load path relative to the caller's source file. This has the same functionality as a relative import, except that this method accepts filename as a string, so more characters are allowed in the filename. """ caller_frame = inspect.stack()[1] caller_fname = caller_frame[0].f_code.co_filename assert caller_fname != "", "load_rel Unable to find caller" caller_dir = os.path.dirname(caller_fname) filename = os.path.join(caller_dir, filename) return LazyConfig.load(filename, keys) @staticmethod def load(filename: str, keys: Union[None, str, Tuple[str, ...]] = None): """ Load a config file. Args: filename: absolute path or relative path w.r.t. the current working directory keys: keys to load and return. If not given, return all keys (whose values are config objects) in a dict. """ has_keys = keys is not None filename = filename.replace("/./", "/") # redundant if os.path.splitext(filename)[1] not in [".py", ".yaml", ".yml"]: raise ValueError(f"Config file {filename} has to be a python or yaml file.") if filename.endswith(".py"): _validate_py_syntax(filename) with _patch_import(): # Record the filename module_namespace = { "__file__": filename, "__package__": _random_package_name(filename), } with PathManager.open(filename) as f: content = f.read() # Compile first with filename to: # 1. make filename appears in stacktrace # 2. make load_rel able to find its parent's (possibly remote) location exec(compile(content, filename, "exec"), module_namespace) ret = module_namespace else: with PathManager.open(filename) as f: obj = yaml.unsafe_load(f) ret = OmegaConf.create(obj, flags={"allow_objects": True}) if has_keys: if isinstance(keys, str): return _cast_to_config(ret[keys]) else: return tuple(_cast_to_config(ret[a]) for a in keys) else: if filename.endswith(".py"): # when not specified, only load those that are config objects ret = DictConfig( { name: _cast_to_config(value) for name, value in ret.items() if isinstance(value, (DictConfig, ListConfig, dict)) and not name.startswith("_") }, flags={"allow_objects": True}, ) return ret @staticmethod def save(cfg, filename: str): """ Save a config object to a yaml file. Note that when the config dictionary contains complex objects (e.g. lambda), it can't be saved to yaml. In that case we will print an error and attempt to save to a pkl file instead. Args: cfg: an omegaconf config object filename: yaml file name to save the config file """ logger = logging.getLogger(__name__) try: cfg = deepcopy(cfg) except Exception: pass else: # if it's deep-copyable, then... def _replace_type_by_name(x): if "_target_" in x and callable(x._target_): try: x._target_ = _convert_target_to_string(x._target_) except AttributeError: pass # not necessary, but makes yaml looks nicer _visit_dict_config(cfg, _replace_type_by_name) save_pkl = False try: dict = OmegaConf.to_container( cfg, # Do not resolve interpolation when saving, i.e. do not turn ${a} into # actual values when saving. resolve=False, # Save structures (dataclasses) in a format that can be instantiated later. # Without this option, the type information of the dataclass will be erased. structured_config_mode=SCMode.INSTANTIATE, ) dumped = yaml.dump(dict, default_flow_style=None, allow_unicode=True, width=9999) with PathManager.open(filename, "w") as f: f.write(dumped) try: _ = yaml.unsafe_load(dumped) # test that it is loadable except Exception: logger.warning( "The config contains objects that cannot serialize to a valid yaml. " f"{filename} is human-readable but cannot be loaded." ) save_pkl = True except Exception: logger.exception("Unable to serialize the config to yaml. Error:") save_pkl = True if save_pkl: new_filename = filename + ".pkl" # try: # # retry by pickle # with PathManager.open(new_filename, "wb") as f: # cloudpickle.dump(cfg, f) # logger.warning(f"Config is saved using cloudpickle at {new_filename}.") # except Exception: # pass @staticmethod def apply_overrides(cfg, overrides: List[str]): """ In-place override contents of cfg. Args: cfg: an omegaconf config object overrides: list of strings in the format of "a=b" to override configs. See https://hydra.cc/docs/next/advanced/override_grammar/basic/ for syntax. Returns: the cfg object """ def safe_update(cfg, key, value): parts = key.split(".") for idx in range(1, len(parts)): prefix = ".".join(parts[:idx]) v = OmegaConf.select(cfg, prefix, default=None) if v is None: break if not OmegaConf.is_config(v): raise KeyError( f"Trying to update key {key}, but {prefix} " f"is not a config, but has type {type(v)}." ) OmegaConf.update(cfg, key, value, merge=True) try: from hydra.core.override_parser.overrides_parser import OverridesParser has_hydra = True except ImportError: has_hydra = False if has_hydra: parser = OverridesParser.create() overrides = parser.parse_overrides(overrides) for o in overrides: key = o.key_or_group value = o.value() if o.is_delete(): # TODO support this raise NotImplementedError("deletion is not yet a supported override") safe_update(cfg, key, value) else: # Fallback. Does not support all the features and error checking like hydra. for o in overrides: key, value = o.split("=") try: value = eval(value, {}) except NameError: pass safe_update(cfg, key, value) return cfg # @staticmethod # def to_py(cfg, prefix: str = "cfg."): # """ # Try to convert a config object into Python-like psuedo code. # # Note that perfect conversion is not always possible. So the returned # results are mainly meant to be human-readable, and not meant to be executed. # # Args: # cfg: an omegaconf config object # prefix: root name for the resulting code (default: "cfg.") # # # Returns: # str of formatted Python code # """ # import black # # cfg = OmegaConf.to_container(cfg, resolve=True) # # def _to_str(obj, prefix=None, inside_call=False): # if prefix is None: # prefix = [] # if isinstance(obj, abc.Mapping) and "_target_" in obj: # # Dict representing a function call # target = _convert_target_to_string(obj.pop("_target_")) # args = [] # for k, v in sorted(obj.items()): # args.append(f"{k}={_to_str(v, inside_call=True)}") # args = ", ".join(args) # call = f"{target}({args})" # return "".join(prefix) + call # elif isinstance(obj, abc.Mapping) and not inside_call: # # Dict that is not inside a call is a list of top-level config objects that we # # render as one object per line with dot separated prefixes # key_list = [] # for k, v in sorted(obj.items()): # if isinstance(v, abc.Mapping) and "_target_" not in v: # key_list.append(_to_str(v, prefix=prefix + [k + "."])) # else: # key = "".join(prefix) + k # key_list.append(f"{key}={_to_str(v)}") # return "\n".join(key_list) # elif isinstance(obj, abc.Mapping): # # Dict that is inside a call is rendered as a regular dict # return ( # "{" # + ",".join( # f"{repr(k)}: {_to_str(v, inside_call=inside_call)}" # for k, v in sorted(obj.items()) # ) # + "}" # ) # elif isinstance(obj, list): # return "[" + ",".join(_to_str(x, inside_call=inside_call) for x in obj) + "]" # else: # return repr(obj) # # py_str = _to_str(cfg, prefix=[prefix]) # try: # return black.format_str(py_str, mode=black.Mode()) # except black.InvalidInput: # return py_str ================================================ FILE: annotator/oneformer/detectron2/data/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from . import transforms # isort:skip from .build import ( build_batch_data_loader, build_detection_test_loader, build_detection_train_loader, get_detection_dataset_dicts, load_proposals_into_dataset, print_instances_class_histogram, ) from .catalog import DatasetCatalog, MetadataCatalog, Metadata from .common import DatasetFromList, MapDataset, ToIterableDataset from .dataset_mapper import DatasetMapper # ensure the builtin datasets are registered from . import datasets, samplers # isort:skip __all__ = [k for k in globals().keys() if not k.startswith("_")] ================================================ FILE: annotator/oneformer/detectron2/data/benchmark.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np from itertools import count from typing import List, Tuple import torch import tqdm from fvcore.common.timer import Timer from annotator.oneformer.detectron2.utils import comm from .build import build_batch_data_loader from .common import DatasetFromList, MapDataset from .samplers import TrainingSampler logger = logging.getLogger(__name__) class _EmptyMapDataset(torch.utils.data.Dataset): """ Map anything to emptiness. """ def __init__(self, dataset): self.ds = dataset def __len__(self): return len(self.ds) def __getitem__(self, idx): _ = self.ds[idx] return [0] def iter_benchmark( iterator, num_iter: int, warmup: int = 5, max_time_seconds: float = 60 ) -> Tuple[float, List[float]]: """ Benchmark an iterator/iterable for `num_iter` iterations with an extra `warmup` iterations of warmup. End early if `max_time_seconds` time is spent on iterations. Returns: float: average time (seconds) per iteration list[float]: time spent on each iteration. Sometimes useful for further analysis. """ num_iter, warmup = int(num_iter), int(warmup) iterator = iter(iterator) for _ in range(warmup): next(iterator) timer = Timer() all_times = [] for curr_iter in tqdm.trange(num_iter): start = timer.seconds() if start > max_time_seconds: num_iter = curr_iter break next(iterator) all_times.append(timer.seconds() - start) avg = timer.seconds() / num_iter return avg, all_times class DataLoaderBenchmark: """ Some common benchmarks that help understand perf bottleneck of a standard dataloader made of dataset, mapper and sampler. """ def __init__( self, dataset, *, mapper, sampler=None, total_batch_size, num_workers=0, max_time_seconds: int = 90, ): """ Args: max_time_seconds (int): maximum time to spent for each benchmark other args: same as in `build.py:build_detection_train_loader` """ if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False, serialize=True) if sampler is None: sampler = TrainingSampler(len(dataset)) self.dataset = dataset self.mapper = mapper self.sampler = sampler self.total_batch_size = total_batch_size self.num_workers = num_workers self.per_gpu_batch_size = self.total_batch_size // comm.get_world_size() self.max_time_seconds = max_time_seconds def _benchmark(self, iterator, num_iter, warmup, msg=None): avg, all_times = iter_benchmark(iterator, num_iter, warmup, self.max_time_seconds) if msg is not None: self._log_time(msg, avg, all_times) return avg, all_times def _log_time(self, msg, avg, all_times, distributed=False): percentiles = [np.percentile(all_times, k, interpolation="nearest") for k in [1, 5, 95, 99]] if not distributed: logger.info( f"{msg}: avg={1.0/avg:.1f} it/s, " f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, " f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s." ) return avg_per_gpu = comm.all_gather(avg) percentiles_per_gpu = comm.all_gather(percentiles) if comm.get_rank() > 0: return for idx, avg, percentiles in zip(count(), avg_per_gpu, percentiles_per_gpu): logger.info( f"GPU{idx} {msg}: avg={1.0/avg:.1f} it/s, " f"p1={percentiles[0]:.2g}s, p5={percentiles[1]:.2g}s, " f"p95={percentiles[2]:.2g}s, p99={percentiles[3]:.2g}s." ) def benchmark_dataset(self, num_iter, warmup=5): """ Benchmark the speed of taking raw samples from the dataset. """ def loader(): while True: for k in self.sampler: yield self.dataset[k] self._benchmark(loader(), num_iter, warmup, "Dataset Alone") def benchmark_mapper(self, num_iter, warmup=5): """ Benchmark the speed of taking raw samples from the dataset and map them in a single process. """ def loader(): while True: for k in self.sampler: yield self.mapper(self.dataset[k]) self._benchmark(loader(), num_iter, warmup, "Single Process Mapper (sec/sample)") def benchmark_workers(self, num_iter, warmup=10): """ Benchmark the dataloader by tuning num_workers to [0, 1, self.num_workers]. """ candidates = [0, 1] if self.num_workers not in candidates: candidates.append(self.num_workers) dataset = MapDataset(self.dataset, self.mapper) for n in candidates: loader = build_batch_data_loader( dataset, self.sampler, self.total_batch_size, num_workers=n, ) self._benchmark( iter(loader), num_iter * max(n, 1), warmup * max(n, 1), f"DataLoader ({n} workers, bs={self.per_gpu_batch_size})", ) del loader def benchmark_IPC(self, num_iter, warmup=10): """ Benchmark the dataloader where each worker outputs nothing. This eliminates the IPC overhead compared to the regular dataloader. PyTorch multiprocessing's IPC only optimizes for torch tensors. Large numpy arrays or other data structure may incur large IPC overhead. """ n = self.num_workers dataset = _EmptyMapDataset(MapDataset(self.dataset, self.mapper)) loader = build_batch_data_loader( dataset, self.sampler, self.total_batch_size, num_workers=n ) self._benchmark( iter(loader), num_iter * max(n, 1), warmup * max(n, 1), f"DataLoader ({n} workers, bs={self.per_gpu_batch_size}) w/o comm", ) def benchmark_distributed(self, num_iter, warmup=10): """ Benchmark the dataloader in each distributed worker, and log results of all workers. This helps understand the final performance as well as the variances among workers. It also prints startup time (first iter) of the dataloader. """ gpu = comm.get_world_size() dataset = MapDataset(self.dataset, self.mapper) n = self.num_workers loader = build_batch_data_loader( dataset, self.sampler, self.total_batch_size, num_workers=n ) timer = Timer() loader = iter(loader) next(loader) startup_time = timer.seconds() logger.info("Dataloader startup time: {:.2f} seconds".format(startup_time)) comm.synchronize() avg, all_times = self._benchmark(loader, num_iter * max(n, 1), warmup * max(n, 1)) del loader self._log_time( f"DataLoader ({gpu} GPUs x {n} workers, total bs={self.total_batch_size})", avg, all_times, True, ) ================================================ FILE: annotator/oneformer/detectron2/data/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import itertools import logging import numpy as np import operator import pickle from typing import Any, Callable, Dict, List, Optional, Union import torch import torch.utils.data as torchdata from tabulate import tabulate from termcolor import colored from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.structures import BoxMode from annotator.oneformer.detectron2.utils.comm import get_world_size from annotator.oneformer.detectron2.utils.env import seed_all_rng from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.logger import _log_api_usage, log_first_n from .catalog import DatasetCatalog, MetadataCatalog from .common import AspectRatioGroupedDataset, DatasetFromList, MapDataset, ToIterableDataset from .dataset_mapper import DatasetMapper from .detection_utils import check_metadata_consistency from .samplers import ( InferenceSampler, RandomSubsetTrainingSampler, RepeatFactorTrainingSampler, TrainingSampler, ) """ This file contains the default logic to build a dataloader for training or testing. """ __all__ = [ "build_batch_data_loader", "build_detection_train_loader", "build_detection_test_loader", "get_detection_dataset_dicts", "load_proposals_into_dataset", "print_instances_class_histogram", ] def filter_images_with_only_crowd_annotations(dataset_dicts): """ Filter out images with none annotations or only crowd annotations (i.e., images without non-crowd annotations). A common training-time preprocessing on COCO dataset. Args: dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. Returns: list[dict]: the same format, but filtered. """ num_before = len(dataset_dicts) def valid(anns): for ann in anns: if ann.get("iscrowd", 0) == 0: return True return False dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])] num_after = len(dataset_dicts) logger = logging.getLogger(__name__) logger.info( "Removed {} images with no usable annotations. {} images left.".format( num_before - num_after, num_after ) ) return dataset_dicts def filter_images_with_few_keypoints(dataset_dicts, min_keypoints_per_image): """ Filter out images with too few number of keypoints. Args: dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. Returns: list[dict]: the same format as dataset_dicts, but filtered. """ num_before = len(dataset_dicts) def visible_keypoints_in_image(dic): # Each keypoints field has the format [x1, y1, v1, ...], where v is visibility annotations = dic["annotations"] return sum( (np.array(ann["keypoints"][2::3]) > 0).sum() for ann in annotations if "keypoints" in ann ) dataset_dicts = [ x for x in dataset_dicts if visible_keypoints_in_image(x) >= min_keypoints_per_image ] num_after = len(dataset_dicts) logger = logging.getLogger(__name__) logger.info( "Removed {} images with fewer than {} keypoints.".format( num_before - num_after, min_keypoints_per_image ) ) return dataset_dicts def load_proposals_into_dataset(dataset_dicts, proposal_file): """ Load precomputed object proposals into the dataset. The proposal file should be a pickled dict with the following keys: - "ids": list[int] or list[str], the image ids - "boxes": list[np.ndarray], each is an Nx4 array of boxes corresponding to the image id - "objectness_logits": list[np.ndarray], each is an N sized array of objectness scores corresponding to the boxes. - "bbox_mode": the BoxMode of the boxes array. Defaults to ``BoxMode.XYXY_ABS``. Args: dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. proposal_file (str): file path of pre-computed proposals, in pkl format. Returns: list[dict]: the same format as dataset_dicts, but added proposal field. """ logger = logging.getLogger(__name__) logger.info("Loading proposals from: {}".format(proposal_file)) with PathManager.open(proposal_file, "rb") as f: proposals = pickle.load(f, encoding="latin1") # Rename the key names in D1 proposal files rename_keys = {"indexes": "ids", "scores": "objectness_logits"} for key in rename_keys: if key in proposals: proposals[rename_keys[key]] = proposals.pop(key) # Fetch the indexes of all proposals that are in the dataset # Convert image_id to str since they could be int. img_ids = set({str(record["image_id"]) for record in dataset_dicts}) id_to_index = {str(id): i for i, id in enumerate(proposals["ids"]) if str(id) in img_ids} # Assuming default bbox_mode of precomputed proposals are 'XYXY_ABS' bbox_mode = BoxMode(proposals["bbox_mode"]) if "bbox_mode" in proposals else BoxMode.XYXY_ABS for record in dataset_dicts: # Get the index of the proposal i = id_to_index[str(record["image_id"])] boxes = proposals["boxes"][i] objectness_logits = proposals["objectness_logits"][i] # Sort the proposals in descending order of the scores inds = objectness_logits.argsort()[::-1] record["proposal_boxes"] = boxes[inds] record["proposal_objectness_logits"] = objectness_logits[inds] record["proposal_bbox_mode"] = bbox_mode return dataset_dicts def print_instances_class_histogram(dataset_dicts, class_names): """ Args: dataset_dicts (list[dict]): list of dataset dicts. class_names (list[str]): list of class names (zero-indexed). """ num_classes = len(class_names) hist_bins = np.arange(num_classes + 1) histogram = np.zeros((num_classes,), dtype=np.int) for entry in dataset_dicts: annos = entry["annotations"] classes = np.asarray( [x["category_id"] for x in annos if not x.get("iscrowd", 0)], dtype=np.int ) if len(classes): assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}" assert ( classes.max() < num_classes ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes" histogram += np.histogram(classes, bins=hist_bins)[0] N_COLS = min(6, len(class_names) * 2) def short_name(x): # make long class names shorter. useful for lvis if len(x) > 13: return x[:11] + ".." return x data = list( itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)]) ) total_num_instances = sum(data[1::2]) data.extend([None] * (N_COLS - (len(data) % N_COLS))) if num_classes > 1: data.extend(["total", total_num_instances]) data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) table = tabulate( data, headers=["category", "#instances"] * (N_COLS // 2), tablefmt="pipe", numalign="left", stralign="center", ) log_first_n( logging.INFO, "Distribution of instances among all {} categories:\n".format(num_classes) + colored(table, "cyan"), key="message", ) def get_detection_dataset_dicts( names, filter_empty=True, min_keypoints=0, proposal_files=None, check_consistency=True, ): """ Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation. Args: names (str or list[str]): a dataset name or a list of dataset names filter_empty (bool): whether to filter out images without instance annotations min_keypoints (int): filter out images with fewer keypoints than `min_keypoints`. Set to 0 to do nothing. proposal_files (list[str]): if given, a list of object proposal files that match each dataset in `names`. check_consistency (bool): whether to check if datasets have consistent metadata. Returns: list[dict]: a list of dicts following the standard dataset dict format. """ if isinstance(names, str): names = [names] assert len(names), names dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names] if isinstance(dataset_dicts[0], torchdata.Dataset): if len(dataset_dicts) > 1: # ConcatDataset does not work for iterable style dataset. # We could support concat for iterable as well, but it's often # not a good idea to concat iterables anyway. return torchdata.ConcatDataset(dataset_dicts) return dataset_dicts[0] for dataset_name, dicts in zip(names, dataset_dicts): assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) if proposal_files is not None: assert len(names) == len(proposal_files) # load precomputed proposals from proposal files dataset_dicts = [ load_proposals_into_dataset(dataset_i_dicts, proposal_file) for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files) ] dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) has_instances = "annotations" in dataset_dicts[0] if filter_empty and has_instances: dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) if min_keypoints > 0 and has_instances: dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints) if check_consistency and has_instances: try: class_names = MetadataCatalog.get(names[0]).thing_classes check_metadata_consistency("thing_classes", names) print_instances_class_histogram(dataset_dicts, class_names) except AttributeError: # class names are not available for this dataset pass assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names)) return dataset_dicts def build_batch_data_loader( dataset, sampler, total_batch_size, *, aspect_ratio_grouping=False, num_workers=0, collate_fn=None, ): """ Build a batched dataloader. The main differences from `torch.utils.data.DataLoader` are: 1. support aspect ratio grouping options 2. use no "batch collation", because this is common for detection training Args: dataset (torch.utils.data.Dataset): a pytorch map-style or iterable dataset. sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices. Must be provided iff. ``dataset`` is a map-style dataset. total_batch_size, aspect_ratio_grouping, num_workers, collate_fn: see :func:`build_detection_train_loader`. Returns: iterable[list]. Length of each list is the batch size of the current GPU. Each element in the list comes from the dataset. """ world_size = get_world_size() assert ( total_batch_size > 0 and total_batch_size % world_size == 0 ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format( total_batch_size, world_size ) batch_size = total_batch_size // world_size if isinstance(dataset, torchdata.IterableDataset): assert sampler is None, "sampler must be None if dataset is IterableDataset" else: dataset = ToIterableDataset(dataset, sampler) if aspect_ratio_grouping: data_loader = torchdata.DataLoader( dataset, num_workers=num_workers, collate_fn=operator.itemgetter(0), # don't batch, but yield individual elements worker_init_fn=worker_init_reset_seed, ) # yield individual mapped dict data_loader = AspectRatioGroupedDataset(data_loader, batch_size) if collate_fn is None: return data_loader return MapDataset(data_loader, collate_fn) else: return torchdata.DataLoader( dataset, batch_size=batch_size, drop_last=True, num_workers=num_workers, collate_fn=trivial_batch_collator if collate_fn is None else collate_fn, worker_init_fn=worker_init_reset_seed, ) def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None): if dataset is None: dataset = get_detection_dataset_dicts( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0]) if mapper is None: mapper = DatasetMapper(cfg, True) if sampler is None: sampler_name = cfg.DATALOADER.SAMPLER_TRAIN logger = logging.getLogger(__name__) if isinstance(dataset, torchdata.IterableDataset): logger.info("Not using any sampler since the dataset is IterableDataset.") sampler = None else: logger.info("Using training sampler {}".format(sampler_name)) if sampler_name == "TrainingSampler": sampler = TrainingSampler(len(dataset)) elif sampler_name == "RepeatFactorTrainingSampler": repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( dataset, cfg.DATALOADER.REPEAT_THRESHOLD ) sampler = RepeatFactorTrainingSampler(repeat_factors) elif sampler_name == "RandomSubsetTrainingSampler": sampler = RandomSubsetTrainingSampler( len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO ) else: raise ValueError("Unknown training sampler: {}".format(sampler_name)) return { "dataset": dataset, "sampler": sampler, "mapper": mapper, "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, "num_workers": cfg.DATALOADER.NUM_WORKERS, } @configurable(from_config=_train_loader_from_config) def build_detection_train_loader( dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0, collate_fn=None, ): """ Build a dataloader for object detection with some default features. Args: dataset (list or torch.utils.data.Dataset): a list of dataset dicts, or a pytorch dataset (either map-style or iterable). It can be obtained by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. mapper (callable): a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``. sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces indices to be applied on ``dataset``. If ``dataset`` is map-style, the default sampler is a :class:`TrainingSampler`, which coordinates an infinite random shuffle sequence across all workers. Sampler must be None if ``dataset`` is iterable. total_batch_size (int): total batch size across all workers. aspect_ratio_grouping (bool): whether to group images with similar aspect ratio for efficiency. When enabled, it requires each element in dataset be a dict with keys "width" and "height". num_workers (int): number of parallel data loading workers collate_fn: a function that determines how to do batching, same as the argument of `torch.utils.data.DataLoader`. Defaults to do no collation and return a list of data. No collation is OK for small batch size and simple data structures. If your batch size is large and each sample contains too many small tensors, it's more efficient to collate them in data loader. Returns: torch.utils.data.DataLoader: a dataloader. Each output from it is a ``list[mapped_element]`` of length ``total_batch_size / num_workers``, where ``mapped_element`` is produced by the ``mapper``. """ if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False) if mapper is not None: dataset = MapDataset(dataset, mapper) if isinstance(dataset, torchdata.IterableDataset): assert sampler is None, "sampler must be None if dataset is IterableDataset" else: if sampler is None: sampler = TrainingSampler(len(dataset)) assert isinstance(sampler, torchdata.Sampler), f"Expect a Sampler but got {type(sampler)}" return build_batch_data_loader( dataset, sampler, total_batch_size, aspect_ratio_grouping=aspect_ratio_grouping, num_workers=num_workers, collate_fn=collate_fn, ) def _test_loader_from_config(cfg, dataset_name, mapper=None): """ Uses the given `dataset_name` argument (instead of the names in cfg), because the standard practice is to evaluate each test set individually (not combining them). """ if isinstance(dataset_name, str): dataset_name = [dataset_name] dataset = get_detection_dataset_dicts( dataset_name, filter_empty=False, proposal_files=[ cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name ] if cfg.MODEL.LOAD_PROPOSALS else None, ) if mapper is None: mapper = DatasetMapper(cfg, False) return { "dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS, "sampler": InferenceSampler(len(dataset)) if not isinstance(dataset, torchdata.IterableDataset) else None, } @configurable(from_config=_test_loader_from_config) def build_detection_test_loader( dataset: Union[List[Any], torchdata.Dataset], *, mapper: Callable[[Dict[str, Any]], Any], sampler: Optional[torchdata.Sampler] = None, batch_size: int = 1, num_workers: int = 0, collate_fn: Optional[Callable[[List[Any]], Any]] = None, ) -> torchdata.DataLoader: """ Similar to `build_detection_train_loader`, with default batch size = 1, and sampler = :class:`InferenceSampler`. This sampler coordinates all workers to produce the exact set of all samples. Args: dataset: a list of dataset dicts, or a pytorch dataset (either map-style or iterable). They can be obtained by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. mapper: a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. sampler: a sampler that produces indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, which splits the dataset across all workers. Sampler must be None if `dataset` is iterable. batch_size: the batch size of the data loader to be created. Default to 1 image per worker since this is the standard when reporting inference time in papers. num_workers: number of parallel data loading workers collate_fn: same as the argument of `torch.utils.data.DataLoader`. Defaults to do no collation and return a list of data. Returns: DataLoader: a torch DataLoader, that loads the given detection dataset, with test-time transformation and batching. Examples: :: data_loader = build_detection_test_loader( DatasetRegistry.get("my_test"), mapper=DatasetMapper(...)) # or, instantiate with a CfgNode: data_loader = build_detection_test_loader(cfg, "my_test") """ if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False) if mapper is not None: dataset = MapDataset(dataset, mapper) if isinstance(dataset, torchdata.IterableDataset): assert sampler is None, "sampler must be None if dataset is IterableDataset" else: if sampler is None: sampler = InferenceSampler(len(dataset)) return torchdata.DataLoader( dataset, batch_size=batch_size, sampler=sampler, drop_last=False, num_workers=num_workers, collate_fn=trivial_batch_collator if collate_fn is None else collate_fn, ) def trivial_batch_collator(batch): """ A batch collator that does nothing. """ return batch def worker_init_reset_seed(worker_id): initial_seed = torch.initial_seed() % 2**31 seed_all_rng(initial_seed + worker_id) ================================================ FILE: annotator/oneformer/detectron2/data/catalog.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import logging import types from collections import UserDict from typing import List from annotator.oneformer.detectron2.utils.logger import log_first_n __all__ = ["DatasetCatalog", "MetadataCatalog", "Metadata"] class _DatasetCatalog(UserDict): """ A global dictionary that stores information about the datasets and how to obtain them. It contains a mapping from strings (which are names that identify a dataset, e.g. "coco_2014_train") to a function which parses the dataset and returns the samples in the format of `list[dict]`. The returned dicts should be in Detectron2 Dataset format (See DATASETS.md for details) if used with the data loader functionalities in `data/build.py,data/detection_transform.py`. The purpose of having this catalog is to make it easy to choose different datasets, by just using the strings in the config. """ def register(self, name, func): """ Args: name (str): the name that identifies a dataset, e.g. "coco_2014_train". func (callable): a callable which takes no arguments and returns a list of dicts. It must return the same results if called multiple times. """ assert callable(func), "You must register a function with `DatasetCatalog.register`!" assert name not in self, "Dataset '{}' is already registered!".format(name) self[name] = func def get(self, name): """ Call the registered function and return its results. Args: name (str): the name that identifies a dataset, e.g. "coco_2014_train". Returns: list[dict]: dataset annotations. """ try: f = self[name] except KeyError as e: raise KeyError( "Dataset '{}' is not registered! Available datasets are: {}".format( name, ", ".join(list(self.keys())) ) ) from e return f() def list(self) -> List[str]: """ List all registered datasets. Returns: list[str] """ return list(self.keys()) def remove(self, name): """ Alias of ``pop``. """ self.pop(name) def __str__(self): return "DatasetCatalog(registered datasets: {})".format(", ".join(self.keys())) __repr__ = __str__ DatasetCatalog = _DatasetCatalog() DatasetCatalog.__doc__ = ( _DatasetCatalog.__doc__ + """ .. automethod:: detectron2.data.catalog.DatasetCatalog.register .. automethod:: detectron2.data.catalog.DatasetCatalog.get """ ) class Metadata(types.SimpleNamespace): """ A class that supports simple attribute setter/getter. It is intended for storing metadata of a dataset and make it accessible globally. Examples: :: # somewhere when you load the data: MetadataCatalog.get("mydataset").thing_classes = ["person", "dog"] # somewhere when you print statistics or visualize: classes = MetadataCatalog.get("mydataset").thing_classes """ # the name of the dataset # set default to N/A so that `self.name` in the errors will not trigger getattr again name: str = "N/A" _RENAMED = { "class_names": "thing_classes", "dataset_id_to_contiguous_id": "thing_dataset_id_to_contiguous_id", "stuff_class_names": "stuff_classes", } def __getattr__(self, key): if key in self._RENAMED: log_first_n( logging.WARNING, "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]), n=10, ) return getattr(self, self._RENAMED[key]) # "name" exists in every metadata if len(self.__dict__) > 1: raise AttributeError( "Attribute '{}' does not exist in the metadata of dataset '{}'. Available " "keys are {}.".format(key, self.name, str(self.__dict__.keys())) ) else: raise AttributeError( f"Attribute '{key}' does not exist in the metadata of dataset '{self.name}': " "metadata is empty." ) def __setattr__(self, key, val): if key in self._RENAMED: log_first_n( logging.WARNING, "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]), n=10, ) setattr(self, self._RENAMED[key], val) # Ensure that metadata of the same name stays consistent try: oldval = getattr(self, key) assert oldval == val, ( "Attribute '{}' in the metadata of '{}' cannot be set " "to a different value!\n{} != {}".format(key, self.name, oldval, val) ) except AttributeError: super().__setattr__(key, val) def as_dict(self): """ Returns all the metadata as a dict. Note that modifications to the returned dict will not reflect on the Metadata object. """ return copy.copy(self.__dict__) def set(self, **kwargs): """ Set multiple metadata with kwargs. """ for k, v in kwargs.items(): setattr(self, k, v) return self def get(self, key, default=None): """ Access an attribute and return its value if exists. Otherwise return default. """ try: return getattr(self, key) except AttributeError: return default class _MetadataCatalog(UserDict): """ MetadataCatalog is a global dictionary that provides access to :class:`Metadata` of a given dataset. The metadata associated with a certain name is a singleton: once created, the metadata will stay alive and will be returned by future calls to ``get(name)``. It's like global variables, so don't abuse it. It's meant for storing knowledge that's constant and shared across the execution of the program, e.g.: the class names in COCO. """ def get(self, name): """ Args: name (str): name of a dataset (e.g. coco_2014_train). Returns: Metadata: The :class:`Metadata` instance associated with this name, or create an empty one if none is available. """ assert len(name) r = super().get(name, None) if r is None: r = self[name] = Metadata(name=name) return r def list(self): """ List all registered metadata. Returns: list[str]: keys (names of datasets) of all registered metadata """ return list(self.keys()) def remove(self, name): """ Alias of ``pop``. """ self.pop(name) def __str__(self): return "MetadataCatalog(registered metadata: {})".format(", ".join(self.keys())) __repr__ = __str__ MetadataCatalog = _MetadataCatalog() MetadataCatalog.__doc__ = ( _MetadataCatalog.__doc__ + """ .. automethod:: detectron2.data.catalog.MetadataCatalog.get """ ) ================================================ FILE: annotator/oneformer/detectron2/data/common.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import contextlib import copy import itertools import logging import numpy as np import pickle import random from typing import Callable, Union import torch import torch.utils.data as data from torch.utils.data.sampler import Sampler from annotator.oneformer.detectron2.utils.serialize import PicklableWrapper __all__ = ["MapDataset", "DatasetFromList", "AspectRatioGroupedDataset", "ToIterableDataset"] logger = logging.getLogger(__name__) def _shard_iterator_dataloader_worker(iterable): # Shard the iterable if we're currently inside pytorch dataloader worker. worker_info = data.get_worker_info() if worker_info is None or worker_info.num_workers == 1: # do nothing yield from iterable else: yield from itertools.islice(iterable, worker_info.id, None, worker_info.num_workers) class _MapIterableDataset(data.IterableDataset): """ Map a function over elements in an IterableDataset. Similar to pytorch's MapIterDataPipe, but support filtering when map_func returns None. This class is not public-facing. Will be called by `MapDataset`. """ def __init__(self, dataset, map_func): self._dataset = dataset self._map_func = PicklableWrapper(map_func) # wrap so that a lambda will work def __len__(self): return len(self._dataset) def __iter__(self): for x in map(self._map_func, self._dataset): if x is not None: yield x class MapDataset(data.Dataset): """ Map a function over the elements in a dataset. """ def __init__(self, dataset, map_func): """ Args: dataset: a dataset where map function is applied. Can be either map-style or iterable dataset. When given an iterable dataset, the returned object will also be an iterable dataset. map_func: a callable which maps the element in dataset. map_func can return None to skip the data (e.g. in case of errors). How None is handled depends on the style of `dataset`. If `dataset` is map-style, it randomly tries other elements. If `dataset` is iterable, it skips the data and tries the next. """ self._dataset = dataset self._map_func = PicklableWrapper(map_func) # wrap so that a lambda will work self._rng = random.Random(42) self._fallback_candidates = set(range(len(dataset))) def __new__(cls, dataset, map_func): is_iterable = isinstance(dataset, data.IterableDataset) if is_iterable: return _MapIterableDataset(dataset, map_func) else: return super().__new__(cls) def __getnewargs__(self): return self._dataset, self._map_func def __len__(self): return len(self._dataset) def __getitem__(self, idx): retry_count = 0 cur_idx = int(idx) while True: data = self._map_func(self._dataset[cur_idx]) if data is not None: self._fallback_candidates.add(cur_idx) return data # _map_func fails for this idx, use a random new index from the pool retry_count += 1 self._fallback_candidates.discard(cur_idx) cur_idx = self._rng.sample(self._fallback_candidates, k=1)[0] if retry_count >= 3: logger = logging.getLogger(__name__) logger.warning( "Failed to apply `_map_func` for idx: {}, retry count: {}".format( idx, retry_count ) ) class _TorchSerializedList(object): """ A list-like object whose items are serialized and stored in a torch tensor. When launching a process that uses TorchSerializedList with "fork" start method, the subprocess can read the same buffer without triggering copy-on-access. When launching a process that uses TorchSerializedList with "spawn/forkserver" start method, the list will be pickled by a special ForkingPickler registered by PyTorch that moves data to shared memory. In both cases, this allows parent and child processes to share RAM for the list data, hence avoids the issue in https://github.com/pytorch/pytorch/issues/13246. See also https://ppwwyyxx.com/blog/2022/Demystify-RAM-Usage-in-Multiprocess-DataLoader/ on how it works. """ def __init__(self, lst: list): self._lst = lst def _serialize(data): buffer = pickle.dumps(data, protocol=-1) return np.frombuffer(buffer, dtype=np.uint8) logger.info( "Serializing {} elements to byte tensors and concatenating them all ...".format( len(self._lst) ) ) self._lst = [_serialize(x) for x in self._lst] self._addr = np.asarray([len(x) for x in self._lst], dtype=np.int64) self._addr = torch.from_numpy(np.cumsum(self._addr)) self._lst = torch.from_numpy(np.concatenate(self._lst)) logger.info("Serialized dataset takes {:.2f} MiB".format(len(self._lst) / 1024**2)) def __len__(self): return len(self._addr) def __getitem__(self, idx): start_addr = 0 if idx == 0 else self._addr[idx - 1].item() end_addr = self._addr[idx].item() bytes = memoryview(self._lst[start_addr:end_addr].numpy()) # @lint-ignore PYTHONPICKLEISBAD return pickle.loads(bytes) _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = _TorchSerializedList @contextlib.contextmanager def set_default_dataset_from_list_serialize_method(new): """ Context manager for using custom serialize function when creating DatasetFromList """ global _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD orig = _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = new yield _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD = orig class DatasetFromList(data.Dataset): """ Wrap a list to a torch Dataset. It produces elements of the list as data. """ def __init__( self, lst: list, copy: bool = True, serialize: Union[bool, Callable] = True, ): """ Args: lst (list): a list which contains elements to produce. copy (bool): whether to deepcopy the element when producing it, so that the result can be modified in place without affecting the source in the list. serialize (bool or callable): whether to serialize the stroage to other backend. If `True`, the default serialize method will be used, if given a callable, the callable will be used as serialize method. """ self._lst = lst self._copy = copy if not isinstance(serialize, (bool, Callable)): raise TypeError(f"Unsupported type for argument `serailzie`: {serialize}") self._serialize = serialize is not False if self._serialize: serialize_method = ( serialize if isinstance(serialize, Callable) else _DEFAULT_DATASET_FROM_LIST_SERIALIZE_METHOD ) logger.info(f"Serializing the dataset using: {serialize_method}") self._lst = serialize_method(self._lst) def __len__(self): return len(self._lst) def __getitem__(self, idx): if self._copy and not self._serialize: return copy.deepcopy(self._lst[idx]) else: return self._lst[idx] class ToIterableDataset(data.IterableDataset): """ Convert an old indices-based (also called map-style) dataset to an iterable-style dataset. """ def __init__(self, dataset: data.Dataset, sampler: Sampler, shard_sampler: bool = True): """ Args: dataset: an old-style dataset with ``__getitem__`` sampler: a cheap iterable that produces indices to be applied on ``dataset``. shard_sampler: whether to shard the sampler based on the current pytorch data loader worker id. When an IterableDataset is forked by pytorch's DataLoader into multiple workers, it is responsible for sharding its data based on worker id so that workers don't produce identical data. Most samplers (like our TrainingSampler) do not shard based on dataloader worker id and this argument should be set to True. But certain samplers may be already sharded, in that case this argument should be set to False. """ assert not isinstance(dataset, data.IterableDataset), dataset assert isinstance(sampler, Sampler), sampler self.dataset = dataset self.sampler = sampler self.shard_sampler = shard_sampler def __iter__(self): if not self.shard_sampler: sampler = self.sampler else: # With map-style dataset, `DataLoader(dataset, sampler)` runs the # sampler in main process only. But `DataLoader(ToIterableDataset(dataset, sampler))` # will run sampler in every of the N worker. So we should only keep 1/N of the ids on # each worker. The assumption is that sampler is cheap to iterate so it's fine to # discard ids in workers. sampler = _shard_iterator_dataloader_worker(self.sampler) for idx in sampler: yield self.dataset[idx] def __len__(self): return len(self.sampler) class AspectRatioGroupedDataset(data.IterableDataset): """ Batch data that have similar aspect ratio together. In this implementation, images whose aspect ratio < (or >) 1 will be batched together. This improves training speed because the images then need less padding to form a batch. It assumes the underlying dataset produces dicts with "width" and "height" keys. It will then produce a list of original dicts with length = batch_size, all with similar aspect ratios. """ def __init__(self, dataset, batch_size): """ Args: dataset: an iterable. Each element must be a dict with keys "width" and "height", which will be used to batch data. batch_size (int): """ self.dataset = dataset self.batch_size = batch_size self._buckets = [[] for _ in range(2)] # Hard-coded two aspect ratio groups: w > h and w < h. # Can add support for more aspect ratio groups, but doesn't seem useful def __iter__(self): for d in self.dataset: w, h = d["width"], d["height"] bucket_id = 0 if w > h else 1 bucket = self._buckets[bucket_id] bucket.append(d) if len(bucket) == self.batch_size: data = bucket[:] # Clear bucket first, because code after yield is not # guaranteed to execute del bucket[:] yield data ================================================ FILE: annotator/oneformer/detectron2/data/dataset_mapper.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import logging import numpy as np from typing import List, Optional, Union import torch from annotator.oneformer.detectron2.config import configurable from . import detection_utils as utils from . import transforms as T """ This file contains the default mapping that's applied to "dataset dicts". """ __all__ = ["DatasetMapper"] class DatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by the model. This is the default callable to be used to map your dataset dict into training data. You may need to follow it to implement your own one for customized logic, such as a different way to read or transform images. See :doc:`/tutorials/data_loading` for details. The callable currently does the following: 1. Read the image from "file_name" 2. Applies cropping/geometric transforms to the image and annotations 3. Prepare data and annotations to Tensor and :class:`Instances` """ @configurable def __init__( self, is_train: bool, *, augmentations: List[Union[T.Augmentation, T.Transform]], image_format: str, use_instance_mask: bool = False, use_keypoint: bool = False, instance_mask_format: str = "polygon", keypoint_hflip_indices: Optional[np.ndarray] = None, precomputed_proposal_topk: Optional[int] = None, recompute_boxes: bool = False, ): """ NOTE: this interface is experimental. Args: is_train: whether it's used in training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. use_instance_mask: whether to process instance segmentation annotations, if available use_keypoint: whether to process keypoint annotations if available instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation masks into this format. keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices` precomputed_proposal_topk: if given, will load pre-computed proposals from dataset_dict and keep the top k proposals for each image. recompute_boxes: whether to overwrite bounding box annotations by computing tight bounding boxes from instance mask annotations. """ if recompute_boxes: assert use_instance_mask, "recompute_boxes requires instance masks" # fmt: off self.is_train = is_train self.augmentations = T.AugmentationList(augmentations) self.image_format = image_format self.use_instance_mask = use_instance_mask self.instance_mask_format = instance_mask_format self.use_keypoint = use_keypoint self.keypoint_hflip_indices = keypoint_hflip_indices self.proposal_topk = precomputed_proposal_topk self.recompute_boxes = recompute_boxes # fmt: on logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train: bool = True): augs = utils.build_augmentation(cfg, is_train) if cfg.INPUT.CROP.ENABLED and is_train: augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) recompute_boxes = cfg.MODEL.MASK_ON else: recompute_boxes = False ret = { "is_train": is_train, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "use_instance_mask": cfg.MODEL.MASK_ON, "instance_mask_format": cfg.INPUT.MASK_FORMAT, "use_keypoint": cfg.MODEL.KEYPOINT_ON, "recompute_boxes": recompute_boxes, } if cfg.MODEL.KEYPOINT_ON: ret["keypoint_hflip_indices"] = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) if cfg.MODEL.LOAD_PROPOSALS: ret["precomputed_proposal_topk"] = ( cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN if is_train else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST ) return ret def _transform_annotations(self, dataset_dict, transforms, image_shape): # USER: Modify this if you want to keep them for some reason. for anno in dataset_dict["annotations"]: if not self.use_instance_mask: anno.pop("segmentation", None) if not self.use_keypoint: anno.pop("keypoints", None) # USER: Implement additional transformations if you have other types of data annos = [ utils.transform_instance_annotations( obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices ) for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 ] instances = utils.annotations_to_instances( annos, image_shape, mask_format=self.instance_mask_format ) # After transforms such as cropping are applied, the bounding box may no longer # tightly bound the object. As an example, imagine a triangle object # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to # the intersection of original bounding box and the cropping box. if self.recompute_boxes: instances.gt_boxes = instances.gt_masks.get_bounding_boxes() dataset_dict["instances"] = utils.filter_empty_instances(instances) def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below # USER: Write your own image loading if it's not from a file image = utils.read_image(dataset_dict["file_name"], format=self.image_format) utils.check_image_size(dataset_dict, image) # USER: Remove if you don't do semantic/panoptic segmentation. if "sem_seg_file_name" in dataset_dict: sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2) else: sem_seg_gt = None aug_input = T.AugInput(image, sem_seg=sem_seg_gt) transforms = self.augmentations(aug_input) image, sem_seg_gt = aug_input.image, aug_input.sem_seg image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if sem_seg_gt is not None: dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) # USER: Remove if you don't use pre-computed proposals. # Most users would not need this feature. if self.proposal_topk is not None: utils.transform_proposals( dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk ) if not self.is_train: # USER: Modify this if you want to keep them for some reason. dataset_dict.pop("annotations", None) dataset_dict.pop("sem_seg_file_name", None) return dataset_dict if "annotations" in dataset_dict: self._transform_annotations(dataset_dict, transforms, image_shape) return dataset_dict ================================================ FILE: annotator/oneformer/detectron2/data/datasets/README.md ================================================ ### Common Datasets The dataset implemented here do not need to load the data into the final format. It should provide the minimal data structure needed to use the dataset, so it can be very efficient. For example, for an image dataset, just provide the file names and labels, but don't read the images. Let the downstream decide how to read. ================================================ FILE: annotator/oneformer/detectron2/data/datasets/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .coco import load_coco_json, load_sem_seg, register_coco_instances, convert_to_coco_json from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated from .lvis import load_lvis_json, register_lvis_instances, get_lvis_instances_meta from .pascal_voc import load_voc_instances, register_pascal_voc from . import builtin as _builtin # ensure the builtin datasets are registered __all__ = [k for k in globals().keys() if not k.startswith("_")] ================================================ FILE: annotator/oneformer/detectron2/data/datasets/builtin.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. """ This file registers pre-defined datasets at hard-coded paths, and their metadata. We hard-code metadata for common datasets. This will enable: 1. Consistency check when loading the datasets 2. Use models on these standard datasets directly and run demos, without having to download the dataset annotations We hard-code some paths to the dataset that's assumed to exist in "./datasets/". Users SHOULD NOT use this file to create new dataset / metadata for new dataset. To add new dataset, refer to the tutorial "docs/DATASETS.md". """ import os from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from .builtin_meta import ADE20K_SEM_SEG_CATEGORIES, _get_builtin_metadata from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic from .cityscapes_panoptic import register_all_cityscapes_panoptic from .coco import load_sem_seg, register_coco_instances from .coco_panoptic import register_coco_panoptic, register_coco_panoptic_separated from .lvis import get_lvis_instances_meta, register_lvis_instances from .pascal_voc import register_pascal_voc # ==== Predefined datasets and splits for COCO ========== _PREDEFINED_SPLITS_COCO = {} _PREDEFINED_SPLITS_COCO["coco"] = { "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"), "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"), "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"), "coco_2014_valminusminival": ( "coco/val2014", "coco/annotations/instances_valminusminival2014.json", ), "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"), "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"), "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"), "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"), "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"), } _PREDEFINED_SPLITS_COCO["coco_person"] = { "keypoints_coco_2014_train": ( "coco/train2014", "coco/annotations/person_keypoints_train2014.json", ), "keypoints_coco_2014_val": ("coco/val2014", "coco/annotations/person_keypoints_val2014.json"), "keypoints_coco_2014_minival": ( "coco/val2014", "coco/annotations/person_keypoints_minival2014.json", ), "keypoints_coco_2014_valminusminival": ( "coco/val2014", "coco/annotations/person_keypoints_valminusminival2014.json", ), "keypoints_coco_2017_train": ( "coco/train2017", "coco/annotations/person_keypoints_train2017.json", ), "keypoints_coco_2017_val": ("coco/val2017", "coco/annotations/person_keypoints_val2017.json"), "keypoints_coco_2017_val_100": ( "coco/val2017", "coco/annotations/person_keypoints_val2017_100.json", ), } _PREDEFINED_SPLITS_COCO_PANOPTIC = { "coco_2017_train_panoptic": ( # This is the original panoptic annotation directory "coco/panoptic_train2017", "coco/annotations/panoptic_train2017.json", # This directory contains semantic annotations that are # converted from panoptic annotations. # It is used by PanopticFPN. # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py # to create these directories. "coco/panoptic_stuff_train2017", ), "coco_2017_val_panoptic": ( "coco/panoptic_val2017", "coco/annotations/panoptic_val2017.json", "coco/panoptic_stuff_val2017", ), "coco_2017_val_100_panoptic": ( "coco/panoptic_val2017_100", "coco/annotations/panoptic_val2017_100.json", "coco/panoptic_stuff_val2017_100", ), } def register_all_coco(root): for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items(): for key, (image_root, json_file) in splits_per_dataset.items(): # Assume pre-defined datasets live in `./datasets`. register_coco_instances( key, _get_builtin_metadata(dataset_name), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) for ( prefix, (panoptic_root, panoptic_json, semantic_root), ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): prefix_instances = prefix[: -len("_panoptic")] instances_meta = MetadataCatalog.get(prefix_instances) image_root, instances_json = instances_meta.image_root, instances_meta.json_file # The "separated" version of COCO panoptic segmentation dataset, # e.g. used by Panoptic FPN register_coco_panoptic_separated( prefix, _get_builtin_metadata("coco_panoptic_separated"), image_root, os.path.join(root, panoptic_root), os.path.join(root, panoptic_json), os.path.join(root, semantic_root), instances_json, ) # The "standard" version of COCO panoptic segmentation dataset, # e.g. used by Panoptic-DeepLab register_coco_panoptic( prefix, _get_builtin_metadata("coco_panoptic_standard"), image_root, os.path.join(root, panoptic_root), os.path.join(root, panoptic_json), instances_json, ) # ==== Predefined datasets and splits for LVIS ========== _PREDEFINED_SPLITS_LVIS = { "lvis_v1": { "lvis_v1_train": ("coco/", "lvis/lvis_v1_train.json"), "lvis_v1_val": ("coco/", "lvis/lvis_v1_val.json"), "lvis_v1_test_dev": ("coco/", "lvis/lvis_v1_image_info_test_dev.json"), "lvis_v1_test_challenge": ("coco/", "lvis/lvis_v1_image_info_test_challenge.json"), }, "lvis_v0.5": { "lvis_v0.5_train": ("coco/", "lvis/lvis_v0.5_train.json"), "lvis_v0.5_val": ("coco/", "lvis/lvis_v0.5_val.json"), "lvis_v0.5_val_rand_100": ("coco/", "lvis/lvis_v0.5_val_rand_100.json"), "lvis_v0.5_test": ("coco/", "lvis/lvis_v0.5_image_info_test.json"), }, "lvis_v0.5_cocofied": { "lvis_v0.5_train_cocofied": ("coco/", "lvis/lvis_v0.5_train_cocofied.json"), "lvis_v0.5_val_cocofied": ("coco/", "lvis/lvis_v0.5_val_cocofied.json"), }, } def register_all_lvis(root): for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items(): for key, (image_root, json_file) in splits_per_dataset.items(): register_lvis_instances( key, get_lvis_instances_meta(dataset_name), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) # ==== Predefined splits for raw cityscapes images =========== _RAW_CITYSCAPES_SPLITS = { "cityscapes_fine_{task}_train": ("cityscapes/leftImg8bit/train/", "cityscapes/gtFine/train/"), "cityscapes_fine_{task}_val": ("cityscapes/leftImg8bit/val/", "cityscapes/gtFine/val/"), "cityscapes_fine_{task}_test": ("cityscapes/leftImg8bit/test/", "cityscapes/gtFine/test/"), } def register_all_cityscapes(root): for key, (image_dir, gt_dir) in _RAW_CITYSCAPES_SPLITS.items(): meta = _get_builtin_metadata("cityscapes") image_dir = os.path.join(root, image_dir) gt_dir = os.path.join(root, gt_dir) inst_key = key.format(task="instance_seg") DatasetCatalog.register( inst_key, lambda x=image_dir, y=gt_dir: load_cityscapes_instances( x, y, from_json=True, to_polygons=True ), ) MetadataCatalog.get(inst_key).set( image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_instance", **meta ) sem_key = key.format(task="sem_seg") DatasetCatalog.register( sem_key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y) ) MetadataCatalog.get(sem_key).set( image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_sem_seg", ignore_label=255, **meta, ) # ==== Predefined splits for PASCAL VOC =========== def register_all_pascal_voc(root): SPLITS = [ ("voc_2007_trainval", "VOC2007", "trainval"), ("voc_2007_train", "VOC2007", "train"), ("voc_2007_val", "VOC2007", "val"), ("voc_2007_test", "VOC2007", "test"), ("voc_2012_trainval", "VOC2012", "trainval"), ("voc_2012_train", "VOC2012", "train"), ("voc_2012_val", "VOC2012", "val"), ] for name, dirname, split in SPLITS: year = 2007 if "2007" in name else 2012 register_pascal_voc(name, os.path.join(root, dirname), split, year) MetadataCatalog.get(name).evaluator_type = "pascal_voc" def register_all_ade20k(root): root = os.path.join(root, "ADEChallengeData2016") for name, dirname in [("train", "training"), ("val", "validation")]: image_dir = os.path.join(root, "images", dirname) gt_dir = os.path.join(root, "annotations_detectron2", dirname) name = f"ade20k_sem_seg_{name}" DatasetCatalog.register( name, lambda x=image_dir, y=gt_dir: load_sem_seg(y, x, gt_ext="png", image_ext="jpg") ) MetadataCatalog.get(name).set( stuff_classes=ADE20K_SEM_SEG_CATEGORIES[:], image_root=image_dir, sem_seg_root=gt_dir, evaluator_type="sem_seg", ignore_label=255, ) # True for open source; # Internally at fb, we register them elsewhere if __name__.endswith(".builtin"): # Assume pre-defined datasets live in `./datasets`. _root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets")) register_all_coco(_root) register_all_lvis(_root) register_all_cityscapes(_root) register_all_cityscapes_panoptic(_root) register_all_pascal_voc(_root) register_all_ade20k(_root) ================================================ FILE: annotator/oneformer/detectron2/data/datasets/builtin_meta.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. """ Note: For your custom dataset, there is no need to hard-code metadata anywhere in the code. For example, for COCO-format dataset, metadata will be obtained automatically when calling `load_coco_json`. For other dataset, metadata may also be obtained in other ways during loading. However, we hard-coded metadata for a few common dataset here. The only goal is to allow users who don't have these dataset to use pre-trained models. Users don't have to download a COCO json (which contains metadata), in order to visualize a COCO model (with correct class names and colors). """ # All coco categories, together with their nice-looking visualization colors # It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json COCO_CATEGORIES = [ {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"}, {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"}, {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"}, {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"}, {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"}, {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"}, {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"}, {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"}, {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"}, {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"}, {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"}, {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"}, {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"}, {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"}, {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"}, {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"}, {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"}, {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"}, {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"}, {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"}, {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"}, {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"}, {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"}, {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"}, {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"}, {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"}, {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"}, {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"}, {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"}, {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"}, {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"}, {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"}, {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"}, {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"}, {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"}, {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"}, {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"}, {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"}, {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"}, {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"}, {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"}, {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"}, {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"}, {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"}, {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"}, {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"}, {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"}, {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"}, {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"}, {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"}, {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"}, {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"}, {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"}, ] # fmt: off COCO_PERSON_KEYPOINT_NAMES = ( "nose", "left_eye", "right_eye", "left_ear", "right_ear", "left_shoulder", "right_shoulder", "left_elbow", "right_elbow", "left_wrist", "right_wrist", "left_hip", "right_hip", "left_knee", "right_knee", "left_ankle", "right_ankle", ) # fmt: on # Pairs of keypoints that should be exchanged under horizontal flipping COCO_PERSON_KEYPOINT_FLIP_MAP = ( ("left_eye", "right_eye"), ("left_ear", "right_ear"), ("left_shoulder", "right_shoulder"), ("left_elbow", "right_elbow"), ("left_wrist", "right_wrist"), ("left_hip", "right_hip"), ("left_knee", "right_knee"), ("left_ankle", "right_ankle"), ) # rules for pairs of keypoints to draw a line between, and the line color to use. KEYPOINT_CONNECTION_RULES = [ # face ("left_ear", "left_eye", (102, 204, 255)), ("right_ear", "right_eye", (51, 153, 255)), ("left_eye", "nose", (102, 0, 204)), ("nose", "right_eye", (51, 102, 255)), # upper-body ("left_shoulder", "right_shoulder", (255, 128, 0)), ("left_shoulder", "left_elbow", (153, 255, 204)), ("right_shoulder", "right_elbow", (128, 229, 255)), ("left_elbow", "left_wrist", (153, 255, 153)), ("right_elbow", "right_wrist", (102, 255, 224)), # lower-body ("left_hip", "right_hip", (255, 102, 0)), ("left_hip", "left_knee", (255, 255, 77)), ("right_hip", "right_knee", (153, 255, 204)), ("left_knee", "left_ankle", (191, 255, 128)), ("right_knee", "right_ankle", (255, 195, 77)), ] # All Cityscapes categories, together with their nice-looking visualization colors # It's from https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/helpers/labels.py # noqa CITYSCAPES_CATEGORIES = [ {"color": (128, 64, 128), "isthing": 0, "id": 7, "trainId": 0, "name": "road"}, {"color": (244, 35, 232), "isthing": 0, "id": 8, "trainId": 1, "name": "sidewalk"}, {"color": (70, 70, 70), "isthing": 0, "id": 11, "trainId": 2, "name": "building"}, {"color": (102, 102, 156), "isthing": 0, "id": 12, "trainId": 3, "name": "wall"}, {"color": (190, 153, 153), "isthing": 0, "id": 13, "trainId": 4, "name": "fence"}, {"color": (153, 153, 153), "isthing": 0, "id": 17, "trainId": 5, "name": "pole"}, {"color": (250, 170, 30), "isthing": 0, "id": 19, "trainId": 6, "name": "traffic light"}, {"color": (220, 220, 0), "isthing": 0, "id": 20, "trainId": 7, "name": "traffic sign"}, {"color": (107, 142, 35), "isthing": 0, "id": 21, "trainId": 8, "name": "vegetation"}, {"color": (152, 251, 152), "isthing": 0, "id": 22, "trainId": 9, "name": "terrain"}, {"color": (70, 130, 180), "isthing": 0, "id": 23, "trainId": 10, "name": "sky"}, {"color": (220, 20, 60), "isthing": 1, "id": 24, "trainId": 11, "name": "person"}, {"color": (255, 0, 0), "isthing": 1, "id": 25, "trainId": 12, "name": "rider"}, {"color": (0, 0, 142), "isthing": 1, "id": 26, "trainId": 13, "name": "car"}, {"color": (0, 0, 70), "isthing": 1, "id": 27, "trainId": 14, "name": "truck"}, {"color": (0, 60, 100), "isthing": 1, "id": 28, "trainId": 15, "name": "bus"}, {"color": (0, 80, 100), "isthing": 1, "id": 31, "trainId": 16, "name": "train"}, {"color": (0, 0, 230), "isthing": 1, "id": 32, "trainId": 17, "name": "motorcycle"}, {"color": (119, 11, 32), "isthing": 1, "id": 33, "trainId": 18, "name": "bicycle"}, ] # fmt: off ADE20K_SEM_SEG_CATEGORIES = [ "wall", "building", "sky", "floor", "tree", "ceiling", "road, route", "bed", "window ", "grass", "cabinet", "sidewalk, pavement", "person", "earth, ground", "door", "table", "mountain, mount", "plant", "curtain", "chair", "car", "water", "painting, picture", "sofa", "shelf", "house", "sea", "mirror", "rug", "field", "armchair", "seat", "fence", "desk", "rock, stone", "wardrobe, closet, press", "lamp", "tub", "rail", "cushion", "base, pedestal, stand", "box", "column, pillar", "signboard, sign", "chest of drawers, chest, bureau, dresser", "counter", "sand", "sink", "skyscraper", "fireplace", "refrigerator, icebox", "grandstand, covered stand", "path", "stairs", "runway", "case, display case, showcase, vitrine", "pool table, billiard table, snooker table", "pillow", "screen door, screen", "stairway, staircase", "river", "bridge, span", "bookcase", "blind, screen", "coffee table", "toilet, can, commode, crapper, pot, potty, stool, throne", "flower", "book", "hill", "bench", "countertop", "stove", "palm, palm tree", "kitchen island", "computer", "swivel chair", "boat", "bar", "arcade machine", "hovel, hut, hutch, shack, shanty", "bus", "towel", "light", "truck", "tower", "chandelier", "awning, sunshade, sunblind", "street lamp", "booth", "tv", "plane", "dirt track", "clothes", "pole", "land, ground, soil", "bannister, banister, balustrade, balusters, handrail", "escalator, moving staircase, moving stairway", "ottoman, pouf, pouffe, puff, hassock", "bottle", "buffet, counter, sideboard", "poster, posting, placard, notice, bill, card", "stage", "van", "ship", "fountain", "conveyer belt, conveyor belt, conveyer, conveyor, transporter", "canopy", "washer, automatic washer, washing machine", "plaything, toy", "pool", "stool", "barrel, cask", "basket, handbasket", "falls", "tent", "bag", "minibike, motorbike", "cradle", "oven", "ball", "food, solid food", "step, stair", "tank, storage tank", "trade name", "microwave", "pot", "animal", "bicycle", "lake", "dishwasher", "screen", "blanket, cover", "sculpture", "hood, exhaust hood", "sconce", "vase", "traffic light", "tray", "trash can", "fan", "pier", "crt screen", "plate", "monitor", "bulletin board", "shower", "radiator", "glass, drinking glass", "clock", "flag", # noqa ] # After processed by `prepare_ade20k_sem_seg.py`, id 255 means ignore # fmt: on def _get_coco_instances_meta(): thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1] thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] assert len(thing_ids) == 80, len(thing_ids) # Mapping from the incontiguous COCO category id to an id in [0, 79] thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] ret = { "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes, "thing_colors": thing_colors, } return ret def _get_coco_panoptic_separated_meta(): """ Returns metadata for "separated" version of the panoptic segmentation dataset. """ stuff_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 0] assert len(stuff_ids) == 53, len(stuff_ids) # For semantic segmentation, this mapping maps from contiguous stuff id # (in [0, 53], used in models) to ids in the dataset (used for processing results) # The id 0 is mapped to an extra category "thing". stuff_dataset_id_to_contiguous_id = {k: i + 1 for i, k in enumerate(stuff_ids)} # When converting COCO panoptic annotations to semantic annotations # We label the "thing" category to 0 stuff_dataset_id_to_contiguous_id[0] = 0 # 54 names for COCO stuff categories (including "things") stuff_classes = ["things"] + [ k["name"].replace("-other", "").replace("-merged", "") for k in COCO_CATEGORIES if k["isthing"] == 0 ] # NOTE: I randomly picked a color for things stuff_colors = [[82, 18, 128]] + [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 0] ret = { "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, "stuff_classes": stuff_classes, "stuff_colors": stuff_colors, } ret.update(_get_coco_instances_meta()) return ret def _get_builtin_metadata(dataset_name): if dataset_name == "coco": return _get_coco_instances_meta() if dataset_name == "coco_panoptic_separated": return _get_coco_panoptic_separated_meta() elif dataset_name == "coco_panoptic_standard": meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in COCO_CATEGORIES] thing_colors = [k["color"] for k in COCO_CATEGORIES] stuff_classes = [k["name"] for k in COCO_CATEGORIES] stuff_colors = [k["color"] for k in COCO_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(COCO_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i else: stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta elif dataset_name == "coco_person": return { "thing_classes": ["person"], "keypoint_names": COCO_PERSON_KEYPOINT_NAMES, "keypoint_flip_map": COCO_PERSON_KEYPOINT_FLIP_MAP, "keypoint_connection_rules": KEYPOINT_CONNECTION_RULES, } elif dataset_name == "cityscapes": # fmt: off CITYSCAPES_THING_CLASSES = [ "person", "rider", "car", "truck", "bus", "train", "motorcycle", "bicycle", ] CITYSCAPES_STUFF_CLASSES = [ "road", "sidewalk", "building", "wall", "fence", "pole", "traffic light", "traffic sign", "vegetation", "terrain", "sky", "person", "rider", "car", "truck", "bus", "train", "motorcycle", "bicycle", ] # fmt: on return { "thing_classes": CITYSCAPES_THING_CLASSES, "stuff_classes": CITYSCAPES_STUFF_CLASSES, } raise KeyError("No built-in metadata for dataset {}".format(dataset_name)) ================================================ FILE: annotator/oneformer/detectron2/data/datasets/cityscapes.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import functools import json import logging import multiprocessing as mp import numpy as np import os from itertools import chain import annotator.oneformer.pycocotools.mask as mask_util from PIL import Image from annotator.oneformer.detectron2.structures import BoxMode from annotator.oneformer.detectron2.utils.comm import get_world_size from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.logger import setup_logger try: import cv2 # noqa except ImportError: # OpenCV is an optional dependency at the moment pass logger = logging.getLogger(__name__) def _get_cityscapes_files(image_dir, gt_dir): files = [] # scan through the directory cities = PathManager.ls(image_dir) logger.info(f"{len(cities)} cities found in '{image_dir}'.") for city in cities: city_img_dir = os.path.join(image_dir, city) city_gt_dir = os.path.join(gt_dir, city) for basename in PathManager.ls(city_img_dir): image_file = os.path.join(city_img_dir, basename) suffix = "leftImg8bit.png" assert basename.endswith(suffix), basename basename = basename[: -len(suffix)] instance_file = os.path.join(city_gt_dir, basename + "gtFine_instanceIds.png") label_file = os.path.join(city_gt_dir, basename + "gtFine_labelIds.png") json_file = os.path.join(city_gt_dir, basename + "gtFine_polygons.json") files.append((image_file, instance_file, label_file, json_file)) assert len(files), "No images found in {}".format(image_dir) for f in files[0]: assert PathManager.isfile(f), f return files def load_cityscapes_instances(image_dir, gt_dir, from_json=True, to_polygons=True): """ Args: image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train". gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train". from_json (bool): whether to read annotations from the raw json file or the png files. to_polygons (bool): whether to represent the segmentation as polygons (COCO's format) instead of masks (cityscapes's format). Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ if from_json: assert to_polygons, ( "Cityscapes's json annotations are in polygon format. " "Converting to mask format is not supported now." ) files = _get_cityscapes_files(image_dir, gt_dir) logger.info("Preprocessing cityscapes annotations ...") # This is still not fast: all workers will execute duplicate works and will # take up to 10m on a 8GPU server. pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4)) ret = pool.map( functools.partial(_cityscapes_files_to_dict, from_json=from_json, to_polygons=to_polygons), files, ) logger.info("Loaded {} images from {}".format(len(ret), image_dir)) # Map cityscape ids to contiguous ids from cityscapesscripts.helpers.labels import labels labels = [l for l in labels if l.hasInstances and not l.ignoreInEval] dataset_id_to_contiguous_id = {l.id: idx for idx, l in enumerate(labels)} for dict_per_image in ret: for anno in dict_per_image["annotations"]: anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]] return ret def load_cityscapes_semantic(image_dir, gt_dir): """ Args: image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train". gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train". Returns: list[dict]: a list of dict, each has "file_name" and "sem_seg_file_name". """ ret = [] # gt_dir is small and contain many small files. make sense to fetch to local first gt_dir = PathManager.get_local_path(gt_dir) for image_file, _, label_file, json_file in _get_cityscapes_files(image_dir, gt_dir): label_file = label_file.replace("labelIds", "labelTrainIds") with PathManager.open(json_file, "r") as f: jsonobj = json.load(f) ret.append( { "file_name": image_file, "sem_seg_file_name": label_file, "height": jsonobj["imgHeight"], "width": jsonobj["imgWidth"], } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile( ret[0]["sem_seg_file_name"] ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa return ret def _cityscapes_files_to_dict(files, from_json, to_polygons): """ Parse cityscapes annotation files to a instance segmentation dataset dict. Args: files (tuple): consists of (image_file, instance_id_file, label_id_file, json_file) from_json (bool): whether to read annotations from the raw json file or the png files. to_polygons (bool): whether to represent the segmentation as polygons (COCO's format) instead of masks (cityscapes's format). Returns: A dict in Detectron2 Dataset format. """ from cityscapesscripts.helpers.labels import id2label, name2label image_file, instance_id_file, _, json_file = files annos = [] if from_json: from shapely.geometry import MultiPolygon, Polygon with PathManager.open(json_file, "r") as f: jsonobj = json.load(f) ret = { "file_name": image_file, "image_id": os.path.basename(image_file), "height": jsonobj["imgHeight"], "width": jsonobj["imgWidth"], } # `polygons_union` contains the union of all valid polygons. polygons_union = Polygon() # CityscapesScripts draw the polygons in sequential order # and each polygon *overwrites* existing ones. See # (https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/preparation/json2instanceImg.py) # noqa # We use reverse order, and each polygon *avoids* early ones. # This will resolve the ploygon overlaps in the same way as CityscapesScripts. for obj in jsonobj["objects"][::-1]: if "deleted" in obj: # cityscapes data format specific continue label_name = obj["label"] try: label = name2label[label_name] except KeyError: if label_name.endswith("group"): # crowd area label = name2label[label_name[: -len("group")]] else: raise if label.id < 0: # cityscapes data format continue # Cityscapes's raw annotations uses integer coordinates # Therefore +0.5 here poly_coord = np.asarray(obj["polygon"], dtype="f4") + 0.5 # CityscapesScript uses PIL.ImageDraw.polygon to rasterize # polygons for evaluation. This function operates in integer space # and draws each pixel whose center falls into the polygon. # Therefore it draws a polygon which is 0.5 "fatter" in expectation. # We therefore dilate the input polygon by 0.5 as our input. poly = Polygon(poly_coord).buffer(0.5, resolution=4) if not label.hasInstances or label.ignoreInEval: # even if we won't store the polygon it still contributes to overlaps resolution polygons_union = polygons_union.union(poly) continue # Take non-overlapping part of the polygon poly_wo_overlaps = poly.difference(polygons_union) if poly_wo_overlaps.is_empty: continue polygons_union = polygons_union.union(poly) anno = {} anno["iscrowd"] = label_name.endswith("group") anno["category_id"] = label.id if isinstance(poly_wo_overlaps, Polygon): poly_list = [poly_wo_overlaps] elif isinstance(poly_wo_overlaps, MultiPolygon): poly_list = poly_wo_overlaps.geoms else: raise NotImplementedError("Unknown geometric structure {}".format(poly_wo_overlaps)) poly_coord = [] for poly_el in poly_list: # COCO API can work only with exterior boundaries now, hence we store only them. # TODO: store both exterior and interior boundaries once other parts of the # codebase support holes in polygons. poly_coord.append(list(chain(*poly_el.exterior.coords))) anno["segmentation"] = poly_coord (xmin, ymin, xmax, ymax) = poly_wo_overlaps.bounds anno["bbox"] = (xmin, ymin, xmax, ymax) anno["bbox_mode"] = BoxMode.XYXY_ABS annos.append(anno) else: # See also the official annotation parsing scripts at # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/instances2dict.py # noqa with PathManager.open(instance_id_file, "rb") as f: inst_image = np.asarray(Image.open(f), order="F") # ids < 24 are stuff labels (filtering them first is about 5% faster) flattened_ids = np.unique(inst_image[inst_image >= 24]) ret = { "file_name": image_file, "image_id": os.path.basename(image_file), "height": inst_image.shape[0], "width": inst_image.shape[1], } for instance_id in flattened_ids: # For non-crowd annotations, instance_id // 1000 is the label_id # Crowd annotations have <1000 instance ids label_id = instance_id // 1000 if instance_id >= 1000 else instance_id label = id2label[label_id] if not label.hasInstances or label.ignoreInEval: continue anno = {} anno["iscrowd"] = instance_id < 1000 anno["category_id"] = label.id mask = np.asarray(inst_image == instance_id, dtype=np.uint8, order="F") inds = np.nonzero(mask) ymin, ymax = inds[0].min(), inds[0].max() xmin, xmax = inds[1].min(), inds[1].max() anno["bbox"] = (xmin, ymin, xmax, ymax) if xmax <= xmin or ymax <= ymin: continue anno["bbox_mode"] = BoxMode.XYXY_ABS if to_polygons: # This conversion comes from D4809743 and D5171122, # when Mask-RCNN was first developed. contours = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[ -2 ] polygons = [c.reshape(-1).tolist() for c in contours if len(c) >= 3] # opencv's can produce invalid polygons if len(polygons) == 0: continue anno["segmentation"] = polygons else: anno["segmentation"] = mask_util.encode(mask[:, :, None])[0] annos.append(anno) ret["annotations"] = annos return ret if __name__ == "__main__": """ Test the cityscapes dataset loader. Usage: python -m detectron2.data.datasets.cityscapes \ cityscapes/leftImg8bit/train cityscapes/gtFine/train """ import argparse parser = argparse.ArgumentParser() parser.add_argument("image_dir") parser.add_argument("gt_dir") parser.add_argument("--type", choices=["instance", "semantic"], default="instance") args = parser.parse_args() from annotator.oneformer.detectron2.data.catalog import Metadata from annotator.oneformer.detectron2.utils.visualizer import Visualizer from cityscapesscripts.helpers.labels import labels logger = setup_logger(name=__name__) dirname = "cityscapes-data-vis" os.makedirs(dirname, exist_ok=True) if args.type == "instance": dicts = load_cityscapes_instances( args.image_dir, args.gt_dir, from_json=True, to_polygons=True ) logger.info("Done loading {} samples.".format(len(dicts))) thing_classes = [k.name for k in labels if k.hasInstances and not k.ignoreInEval] meta = Metadata().set(thing_classes=thing_classes) else: dicts = load_cityscapes_semantic(args.image_dir, args.gt_dir) logger.info("Done loading {} samples.".format(len(dicts))) stuff_classes = [k.name for k in labels if k.trainId != 255] stuff_colors = [k.color for k in labels if k.trainId != 255] meta = Metadata().set(stuff_classes=stuff_classes, stuff_colors=stuff_colors) for d in dicts: img = np.array(Image.open(PathManager.open(d["file_name"], "rb"))) visualizer = Visualizer(img, metadata=meta) vis = visualizer.draw_dataset_dict(d) # cv2.imshow("a", vis.get_image()[:, :, ::-1]) # cv2.waitKey() fpath = os.path.join(dirname, os.path.basename(d["file_name"])) vis.save(fpath) ================================================ FILE: annotator/oneformer/detectron2/data/datasets/cityscapes_panoptic.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import json import logging import os from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES from annotator.oneformer.detectron2.utils.file_io import PathManager """ This file contains functions to register the Cityscapes panoptic dataset to the DatasetCatalog. """ logger = logging.getLogger(__name__) def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info): files = [] # scan through the directory cities = PathManager.ls(image_dir) logger.info(f"{len(cities)} cities found in '{image_dir}'.") image_dict = {} for city in cities: city_img_dir = os.path.join(image_dir, city) for basename in PathManager.ls(city_img_dir): image_file = os.path.join(city_img_dir, basename) suffix = "_leftImg8bit.png" assert basename.endswith(suffix), basename basename = os.path.basename(basename)[: -len(suffix)] image_dict[basename] = image_file for ann in json_info["annotations"]: image_file = image_dict.get(ann["image_id"], None) assert image_file is not None, "No image {} found for annotation {}".format( ann["image_id"], ann["file_name"] ) label_file = os.path.join(gt_dir, ann["file_name"]) segments_info = ann["segments_info"] files.append((image_file, label_file, segments_info)) assert len(files), "No images found in {}".format(image_dir) assert PathManager.isfile(files[0][0]), files[0][0] assert PathManager.isfile(files[0][1]), files[0][1] return files def load_cityscapes_panoptic(image_dir, gt_dir, gt_json, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train". gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/cityscapes_panoptic_train". gt_json (str): path to the json file. e.g., "~/cityscapes/gtFine/cityscapes_panoptic_train.json". meta (dict): dictionary containing "thing_dataset_id_to_contiguous_id" and "stuff_dataset_id_to_contiguous_id" to map category ids to contiguous ids for training. Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] return segment_info assert os.path.exists( gt_json ), "Please run `python cityscapesscripts/preparation/createPanopticImgs.py` to generate label files." # noqa with open(gt_json) as f: json_info = json.load(f) files = get_cityscapes_panoptic_files(image_dir, gt_dir, json_info) ret = [] for image_file, label_file, segments_info in files: sem_label_file = ( image_file.replace("leftImg8bit", "gtFine").split(".")[0] + "_labelTrainIds.png" ) segments_info = [_convert_category_id(x, meta) for x in segments_info] ret.append( { "file_name": image_file, "image_id": "_".join( os.path.splitext(os.path.basename(image_file))[0].split("_")[:3] ), "sem_seg_file_name": sem_label_file, "pan_seg_file_name": label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile( ret[0]["sem_seg_file_name"] ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa assert PathManager.isfile( ret[0]["pan_seg_file_name"] ), "Please generate panoptic annotation with python cityscapesscripts/preparation/createPanopticImgs.py" # noqa return ret _RAW_CITYSCAPES_PANOPTIC_SPLITS = { "cityscapes_fine_panoptic_train": ( "cityscapes/leftImg8bit/train", "cityscapes/gtFine/cityscapes_panoptic_train", "cityscapes/gtFine/cityscapes_panoptic_train.json", ), "cityscapes_fine_panoptic_val": ( "cityscapes/leftImg8bit/val", "cityscapes/gtFine/cityscapes_panoptic_val", "cityscapes/gtFine/cityscapes_panoptic_val.json", ), # "cityscapes_fine_panoptic_test": not supported yet } def register_all_cityscapes_panoptic(root): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES] thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES] stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES] stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # There are three types of ids in cityscapes panoptic segmentation: # (1) category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the classifier # (2) instance id: this id is used to differentiate different instances from # the same category. For "stuff" classes, the instance id is always 0; for # "thing" classes, the instance id starts from 1 and 0 is reserved for # ignored instances (e.g. crowd annotation). # (3) panoptic id: this is the compact id that encode both category and # instance id by: category_id * 1000 + instance_id. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for k in CITYSCAPES_CATEGORIES: if k["isthing"] == 1: thing_dataset_id_to_contiguous_id[k["id"]] = k["trainId"] else: stuff_dataset_id_to_contiguous_id[k["id"]] = k["trainId"] meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id for key, (image_dir, gt_dir, gt_json) in _RAW_CITYSCAPES_PANOPTIC_SPLITS.items(): image_dir = os.path.join(root, image_dir) gt_dir = os.path.join(root, gt_dir) gt_json = os.path.join(root, gt_json) DatasetCatalog.register( key, lambda x=image_dir, y=gt_dir, z=gt_json: load_cityscapes_panoptic(x, y, z, meta) ) MetadataCatalog.get(key).set( panoptic_root=gt_dir, image_root=image_dir, panoptic_json=gt_json, gt_dir=gt_dir.replace("cityscapes_panoptic_", ""), evaluator_type="cityscapes_panoptic_seg", ignore_label=255, label_divisor=1000, **meta, ) ================================================ FILE: annotator/oneformer/detectron2/data/datasets/coco.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import contextlib import datetime import io import json import logging import numpy as np import os import shutil import annotator.oneformer.pycocotools.mask as mask_util from fvcore.common.timer import Timer from iopath.common.file_io import file_lock from PIL import Image from annotator.oneformer.detectron2.structures import Boxes, BoxMode, PolygonMasks, RotatedBoxes from annotator.oneformer.detectron2.utils.file_io import PathManager from .. import DatasetCatalog, MetadataCatalog """ This file contains functions to parse COCO-format annotations into dicts in "Detectron2 format". """ logger = logging.getLogger(__name__) __all__ = ["load_coco_json", "load_sem_seg", "convert_to_coco_json", "register_coco_instances"] def load_coco_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None): """ Load a json file with COCO's instances annotation format. Currently supports instance detection, instance segmentation, and person keypoints annotations. Args: json_file (str): full path to the json file in COCO instances annotation format. image_root (str or path-like): the directory where the images in this json file exists. dataset_name (str or None): the name of the dataset (e.g., coco_2017_train). When provided, this function will also do the following: * Put "thing_classes" into the metadata associated with this dataset. * Map the category ids into a contiguous range (needed by standard dataset format), and add "thing_dataset_id_to_contiguous_id" to the metadata associated with this dataset. This option should usually be provided, unless users need to load the original json content and apply more processing manually. extra_annotation_keys (list[str]): list of per-annotation keys that should also be loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints", "category_id", "segmentation"). The values for these keys will be returned as-is. For example, the densepose annotations are loaded in this way. Returns: list[dict]: a list of dicts in Detectron2 standard dataset dicts format (See `Using Custom Datasets `_ ) when `dataset_name` is not None. If `dataset_name` is None, the returned `category_ids` may be incontiguous and may not conform to the Detectron2 standard format. Notes: 1. This function does not read the image files. The results do not have the "image" field. """ from annotator.oneformer.pycocotools.coco import COCO timer = Timer() json_file = PathManager.get_local_path(json_file) with contextlib.redirect_stdout(io.StringIO()): coco_api = COCO(json_file) if timer.seconds() > 1: logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) id_map = None if dataset_name is not None: meta = MetadataCatalog.get(dataset_name) cat_ids = sorted(coco_api.getCatIds()) cats = coco_api.loadCats(cat_ids) # The categories in a custom json file may not be sorted. thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] meta.thing_classes = thing_classes # In COCO, certain category ids are artificially removed, # and by convention they are always ignored. # We deal with COCO's id issue and translate # the category ids to contiguous ids in [0, 80). # It works by looking at the "categories" field in the json, therefore # if users' own json also have incontiguous ids, we'll # apply this mapping as well but print a warning. if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): if "coco" not in dataset_name: logger.warning( """ Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. """ ) id_map = {v: i for i, v in enumerate(cat_ids)} meta.thing_dataset_id_to_contiguous_id = id_map # sort indices for reproducible results img_ids = sorted(coco_api.imgs.keys()) # imgs is a list of dicts, each looks something like: # {'license': 4, # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', # 'file_name': 'COCO_val2014_000000001268.jpg', # 'height': 427, # 'width': 640, # 'date_captured': '2013-11-17 05:57:24', # 'id': 1268} imgs = coco_api.loadImgs(img_ids) # anns is a list[list[dict]], where each dict is an annotation # record for an object. The inner list enumerates the objects in an image # and the outer list enumerates over images. Example of anns[0]: # [{'segmentation': [[192.81, # 247.09, # ... # 219.03, # 249.06]], # 'area': 1035.749, # 'iscrowd': 0, # 'image_id': 1268, # 'bbox': [192.81, 224.8, 74.73, 33.43], # 'category_id': 16, # 'id': 42986}, # ...] anns = [coco_api.imgToAnns[img_id] for img_id in img_ids] total_num_valid_anns = sum([len(x) for x in anns]) total_num_anns = len(coco_api.anns) if total_num_valid_anns < total_num_anns: logger.warning( f"{json_file} contains {total_num_anns} annotations, but only " f"{total_num_valid_anns} of them match to images in the file." ) if "minival" not in json_file: # The popular valminusminival & minival annotations for COCO2014 contain this bug. # However the ratio of buggy annotations there is tiny and does not affect accuracy. # Therefore we explicitly white-list them. ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format( json_file ) imgs_anns = list(zip(imgs, anns)) logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file)) dataset_dicts = [] ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (extra_annotation_keys or []) num_instances_without_valid_segmentation = 0 for (img_dict, anno_dict_list) in imgs_anns: record = {} record["file_name"] = os.path.join(image_root, img_dict["file_name"]) record["height"] = img_dict["height"] record["width"] = img_dict["width"] image_id = record["image_id"] = img_dict["id"] objs = [] for anno in anno_dict_list: # Check that the image_id in this annotation is the same as # the image_id we're looking at. # This fails only when the data parsing logic or the annotation file is buggy. # The original COCO valminusminival2014 & minival2014 annotation files # actually contains bugs that, together with certain ways of using COCO API, # can trigger this assertion. assert anno["image_id"] == image_id assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.' obj = {key: anno[key] for key in ann_keys if key in anno} if "bbox" in obj and len(obj["bbox"]) == 0: raise ValueError( f"One annotation of image {image_id} contains empty 'bbox' value! " "This json does not have valid COCO format." ) segm = anno.get("segmentation", None) if segm: # either list[list[float]] or dict(RLE) if isinstance(segm, dict): if isinstance(segm["counts"], list): # convert to compressed RLE segm = mask_util.frPyObjects(segm, *segm["size"]) else: # filter out invalid polygons (< 3 points) segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] if len(segm) == 0: num_instances_without_valid_segmentation += 1 continue # ignore this instance obj["segmentation"] = segm keypts = anno.get("keypoints", None) if keypts: # list[int] for idx, v in enumerate(keypts): if idx % 3 != 2: # COCO's segmentation coordinates are floating points in [0, H or W], # but keypoint coordinates are integers in [0, H-1 or W-1] # Therefore we assume the coordinates are "pixel indices" and # add 0.5 to convert to floating point coordinates. keypts[idx] = v + 0.5 obj["keypoints"] = keypts obj["bbox_mode"] = BoxMode.XYWH_ABS if id_map: annotation_category_id = obj["category_id"] try: obj["category_id"] = id_map[annotation_category_id] except KeyError as e: raise KeyError( f"Encountered category_id={annotation_category_id} " "but this id does not exist in 'categories' of the json file." ) from e objs.append(obj) record["annotations"] = objs dataset_dicts.append(record) if num_instances_without_valid_segmentation > 0: logger.warning( "Filtered out {} instances without valid segmentation. ".format( num_instances_without_valid_segmentation ) + "There might be issues in your dataset generation process. Please " "check https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html carefully" ) return dataset_dicts def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"): """ Load semantic segmentation datasets. All files under "gt_root" with "gt_ext" extension are treated as ground truth annotations and all files under "image_root" with "image_ext" extension as input images. Ground truth and input images are matched using file paths relative to "gt_root" and "image_root" respectively without taking into account file extensions. This works for COCO as well as some other datasets. Args: gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation annotations are stored as images with integer values in pixels that represent corresponding semantic labels. image_root (str): the directory where the input images are. gt_ext (str): file extension for ground truth annotations. image_ext (str): file extension for input images. Returns: list[dict]: a list of dicts in detectron2 standard format without instance-level annotation. Notes: 1. This function does not read the image and ground truth files. The results do not have the "image" and "sem_seg" fields. """ # We match input images with ground truth based on their relative filepaths (without file # extensions) starting from 'image_root' and 'gt_root' respectively. def file2id(folder_path, file_path): # extract relative path starting from `folder_path` image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path)) # remove file extension image_id = os.path.splitext(image_id)[0] return image_id input_files = sorted( (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)), key=lambda file_path: file2id(image_root, file_path), ) gt_files = sorted( (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)), key=lambda file_path: file2id(gt_root, file_path), ) assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root) # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images if len(input_files) != len(gt_files): logger.warn( "Directory {} and {} has {} and {} files, respectively.".format( image_root, gt_root, len(input_files), len(gt_files) ) ) input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files] gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files] intersect = list(set(input_basenames) & set(gt_basenames)) # sort, otherwise each worker may obtain a list[dict] in different order intersect = sorted(intersect) logger.warn("Will use their intersection of {} files.".format(len(intersect))) input_files = [os.path.join(image_root, f + image_ext) for f in intersect] gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect] logger.info( "Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root) ) dataset_dicts = [] for (img_path, gt_path) in zip(input_files, gt_files): record = {} record["file_name"] = img_path record["sem_seg_file_name"] = gt_path dataset_dicts.append(record) return dataset_dicts def convert_to_coco_dict(dataset_name): """ Convert an instance detection/segmentation or keypoint detection dataset in detectron2's standard format into COCO json format. Generic dataset description can be found here: https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset COCO data format description can be found here: http://cocodataset.org/#format-data Args: dataset_name (str): name of the source dataset Must be registered in DatastCatalog and in detectron2's standard format. Must have corresponding metadata "thing_classes" Returns: coco_dict: serializable dict in COCO json format """ dataset_dicts = DatasetCatalog.get(dataset_name) metadata = MetadataCatalog.get(dataset_name) # unmap the category mapping ids for COCO if hasattr(metadata, "thing_dataset_id_to_contiguous_id"): reverse_id_mapping = {v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()} reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id] # noqa else: reverse_id_mapper = lambda contiguous_id: contiguous_id # noqa categories = [ {"id": reverse_id_mapper(id), "name": name} for id, name in enumerate(metadata.thing_classes) ] logger.info("Converting dataset dicts into COCO format") coco_images = [] coco_annotations = [] for image_id, image_dict in enumerate(dataset_dicts): coco_image = { "id": image_dict.get("image_id", image_id), "width": int(image_dict["width"]), "height": int(image_dict["height"]), "file_name": str(image_dict["file_name"]), } coco_images.append(coco_image) anns_per_image = image_dict.get("annotations", []) for annotation in anns_per_image: # create a new dict with only COCO fields coco_annotation = {} # COCO requirement: XYWH box format for axis-align and XYWHA for rotated bbox = annotation["bbox"] if isinstance(bbox, np.ndarray): if bbox.ndim != 1: raise ValueError(f"bbox has to be 1-dimensional. Got shape={bbox.shape}.") bbox = bbox.tolist() if len(bbox) not in [4, 5]: raise ValueError(f"bbox has to has length 4 or 5. Got {bbox}.") from_bbox_mode = annotation["bbox_mode"] to_bbox_mode = BoxMode.XYWH_ABS if len(bbox) == 4 else BoxMode.XYWHA_ABS bbox = BoxMode.convert(bbox, from_bbox_mode, to_bbox_mode) # COCO requirement: instance area if "segmentation" in annotation: # Computing areas for instances by counting the pixels segmentation = annotation["segmentation"] # TODO: check segmentation type: RLE, BinaryMask or Polygon if isinstance(segmentation, list): polygons = PolygonMasks([segmentation]) area = polygons.area()[0].item() elif isinstance(segmentation, dict): # RLE area = mask_util.area(segmentation).item() else: raise TypeError(f"Unknown segmentation type {type(segmentation)}!") else: # Computing areas using bounding boxes if to_bbox_mode == BoxMode.XYWH_ABS: bbox_xy = BoxMode.convert(bbox, to_bbox_mode, BoxMode.XYXY_ABS) area = Boxes([bbox_xy]).area()[0].item() else: area = RotatedBoxes([bbox]).area()[0].item() if "keypoints" in annotation: keypoints = annotation["keypoints"] # list[int] for idx, v in enumerate(keypoints): if idx % 3 != 2: # COCO's segmentation coordinates are floating points in [0, H or W], # but keypoint coordinates are integers in [0, H-1 or W-1] # For COCO format consistency we substract 0.5 # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163 keypoints[idx] = v - 0.5 if "num_keypoints" in annotation: num_keypoints = annotation["num_keypoints"] else: num_keypoints = sum(kp > 0 for kp in keypoints[2::3]) # COCO requirement: # linking annotations to images # "id" field must start with 1 coco_annotation["id"] = len(coco_annotations) + 1 coco_annotation["image_id"] = coco_image["id"] coco_annotation["bbox"] = [round(float(x), 3) for x in bbox] coco_annotation["area"] = float(area) coco_annotation["iscrowd"] = int(annotation.get("iscrowd", 0)) coco_annotation["category_id"] = int(reverse_id_mapper(annotation["category_id"])) # Add optional fields if "keypoints" in annotation: coco_annotation["keypoints"] = keypoints coco_annotation["num_keypoints"] = num_keypoints if "segmentation" in annotation: seg = coco_annotation["segmentation"] = annotation["segmentation"] if isinstance(seg, dict): # RLE counts = seg["counts"] if not isinstance(counts, str): # make it json-serializable seg["counts"] = counts.decode("ascii") coco_annotations.append(coco_annotation) logger.info( "Conversion finished, " f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}" ) info = { "date_created": str(datetime.datetime.now()), "description": "Automatically generated COCO json file for Detectron2.", } coco_dict = {"info": info, "images": coco_images, "categories": categories, "licenses": None} if len(coco_annotations) > 0: coco_dict["annotations"] = coco_annotations return coco_dict def convert_to_coco_json(dataset_name, output_file, allow_cached=True): """ Converts dataset into COCO format and saves it to a json file. dataset_name must be registered in DatasetCatalog and in detectron2's standard format. Args: dataset_name: reference from the config file to the catalogs must be registered in DatasetCatalog and in detectron2's standard format output_file: path of json file that will be saved to allow_cached: if json file is already present then skip conversion """ # TODO: The dataset or the conversion script *may* change, # a checksum would be useful for validating the cached data PathManager.mkdirs(os.path.dirname(output_file)) with file_lock(output_file): if PathManager.exists(output_file) and allow_cached: logger.warning( f"Using previously cached COCO format annotations at '{output_file}'. " "You need to clear the cache file if your dataset has been modified." ) else: logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...)") coco_dict = convert_to_coco_dict(dataset_name) logger.info(f"Caching COCO format annotations at '{output_file}' ...") tmp_file = output_file + ".tmp" with PathManager.open(tmp_file, "w") as f: json.dump(coco_dict, f) shutil.move(tmp_file, output_file) def register_coco_instances(name, metadata, json_file, image_root): """ Register a dataset in COCO's json annotation format for instance detection, instance segmentation and keypoint detection. (i.e., Type 1 and 2 in http://cocodataset.org/#format-data. `instances*.json` and `person_keypoints*.json` in the dataset). This is an example of how to register a new dataset. You can do something similar to this function, to register new datasets. Args: name (str): the name that identifies a dataset, e.g. "coco_2014_train". metadata (dict): extra metadata associated with this dataset. You can leave it as an empty dict. json_file (str): path to the json instance annotation file. image_root (str or path-like): directory which contains all the images. """ assert isinstance(name, str), name assert isinstance(json_file, (str, os.PathLike)), json_file assert isinstance(image_root, (str, os.PathLike)), image_root # 1. register a function which returns dicts DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name)) # 2. Optionally, add metadata about this dataset, # since they might be useful in evaluation, visualization or logging MetadataCatalog.get(name).set( json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata ) if __name__ == "__main__": """ Test the COCO json dataset loader. Usage: python -m detectron2.data.datasets.coco \ path/to/json path/to/image_root dataset_name "dataset_name" can be "coco_2014_minival_100", or other pre-registered ones """ from annotator.oneformer.detectron2.utils.logger import setup_logger from annotator.oneformer.detectron2.utils.visualizer import Visualizer import annotator.oneformer.detectron2.data.datasets # noqa # add pre-defined metadata import sys logger = setup_logger(name=__name__) assert sys.argv[3] in DatasetCatalog.list() meta = MetadataCatalog.get(sys.argv[3]) dicts = load_coco_json(sys.argv[1], sys.argv[2], sys.argv[3]) logger.info("Done loading {} samples.".format(len(dicts))) dirname = "coco-data-vis" os.makedirs(dirname, exist_ok=True) for d in dicts: img = np.array(Image.open(d["file_name"])) visualizer = Visualizer(img, metadata=meta) vis = visualizer.draw_dataset_dict(d) fpath = os.path.join(dirname, os.path.basename(d["file_name"])) vis.save(fpath) ================================================ FILE: annotator/oneformer/detectron2/data/datasets/coco_panoptic.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import json import os from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.utils.file_io import PathManager from .coco import load_coco_json, load_sem_seg __all__ = ["register_coco_panoptic", "register_coco_panoptic_separated"] def load_coco_panoptic_json(json_file, image_dir, gt_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) ret = [] for ann in json_info["annotations"]: image_id = int(ann["image_id"]) # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] return ret def register_coco_panoptic( name, metadata, image_root, panoptic_root, panoptic_json, instances_json=None ): """ Register a "standard" version of COCO panoptic segmentation dataset named `name`. The dictionaries in this registered dataset follows detectron2's standard format. Hence it's called "standard". Args: name (str): the name that identifies a dataset, e.g. "coco_2017_train_panoptic" metadata (dict): extra metadata associated with this dataset. image_root (str): directory which contains all the images panoptic_root (str): directory which contains panoptic annotation images in COCO format panoptic_json (str): path to the json panoptic annotation file in COCO format sem_seg_root (none): not used, to be consistent with `register_coco_panoptic_separated`. instances_json (str): path to the json instance annotation file """ panoptic_name = name DatasetCatalog.register( panoptic_name, lambda: load_coco_panoptic_json(panoptic_json, image_root, panoptic_root, metadata), ) MetadataCatalog.get(panoptic_name).set( panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="coco_panoptic_seg", ignore_label=255, label_divisor=1000, **metadata, ) def register_coco_panoptic_separated( name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json ): """ Register a "separated" version of COCO panoptic segmentation dataset named `name`. The annotations in this registered dataset will contain both instance annotations and semantic annotations, each with its own contiguous ids. Hence it's called "separated". It follows the setting used by the PanopticFPN paper: 1. The instance annotations directly come from polygons in the COCO instances annotation task, rather than from the masks in the COCO panoptic annotations. The two format have small differences: Polygons in the instance annotations may have overlaps. The mask annotations are produced by labeling the overlapped polygons with depth ordering. 2. The semantic annotations are converted from panoptic annotations, where all "things" are assigned a semantic id of 0. All semantic categories will therefore have ids in contiguous range [1, #stuff_categories]. This function will also register a pure semantic segmentation dataset named ``name + '_stuffonly'``. Args: name (str): the name that identifies a dataset, e.g. "coco_2017_train_panoptic" metadata (dict): extra metadata associated with this dataset. image_root (str): directory which contains all the images panoptic_root (str): directory which contains panoptic annotation images panoptic_json (str): path to the json panoptic annotation file sem_seg_root (str): directory which contains all the ground truth segmentation annotations. instances_json (str): path to the json instance annotation file """ panoptic_name = name + "_separated" DatasetCatalog.register( panoptic_name, lambda: merge_to_panoptic( load_coco_json(instances_json, image_root, panoptic_name), load_sem_seg(sem_seg_root, image_root), ), ) MetadataCatalog.get(panoptic_name).set( panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, sem_seg_root=sem_seg_root, json_file=instances_json, # TODO rename evaluator_type="coco_panoptic_seg", ignore_label=255, **metadata, ) semantic_name = name + "_stuffonly" DatasetCatalog.register(semantic_name, lambda: load_sem_seg(sem_seg_root, image_root)) MetadataCatalog.get(semantic_name).set( sem_seg_root=sem_seg_root, image_root=image_root, evaluator_type="sem_seg", ignore_label=255, **metadata, ) def merge_to_panoptic(detection_dicts, sem_seg_dicts): """ Create dataset dicts for panoptic segmentation, by merging two dicts using "file_name" field to match their entries. Args: detection_dicts (list[dict]): lists of dicts for object detection or instance segmentation. sem_seg_dicts (list[dict]): lists of dicts for semantic segmentation. Returns: list[dict] (one per input image): Each dict contains all (key, value) pairs from dicts in both detection_dicts and sem_seg_dicts that correspond to the same image. The function assumes that the same key in different dicts has the same value. """ results = [] sem_seg_file_to_entry = {x["file_name"]: x for x in sem_seg_dicts} assert len(sem_seg_file_to_entry) > 0 for det_dict in detection_dicts: dic = copy.copy(det_dict) dic.update(sem_seg_file_to_entry[dic["file_name"]]) results.append(dic) return results if __name__ == "__main__": """ Test the COCO panoptic dataset loader. Usage: python -m detectron2.data.datasets.coco_panoptic \ path/to/image_root path/to/panoptic_root path/to/panoptic_json dataset_name 10 "dataset_name" can be "coco_2017_train_panoptic", or other pre-registered ones """ from annotator.oneformer.detectron2.utils.logger import setup_logger from annotator.oneformer.detectron2.utils.visualizer import Visualizer import annotator.oneformer.detectron2.data.datasets # noqa # add pre-defined metadata import sys from PIL import Image import numpy as np logger = setup_logger(name=__name__) assert sys.argv[4] in DatasetCatalog.list() meta = MetadataCatalog.get(sys.argv[4]) dicts = load_coco_panoptic_json(sys.argv[3], sys.argv[1], sys.argv[2], meta.as_dict()) logger.info("Done loading {} samples.".format(len(dicts))) dirname = "coco-data-vis" os.makedirs(dirname, exist_ok=True) num_imgs_to_vis = int(sys.argv[5]) for i, d in enumerate(dicts): img = np.array(Image.open(d["file_name"])) visualizer = Visualizer(img, metadata=meta) vis = visualizer.draw_dataset_dict(d) fpath = os.path.join(dirname, os.path.basename(d["file_name"])) vis.save(fpath) if i + 1 >= num_imgs_to_vis: break ================================================ FILE: annotator/oneformer/detectron2/data/datasets/lvis.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import os from fvcore.common.timer import Timer from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.structures import BoxMode from annotator.oneformer.detectron2.utils.file_io import PathManager from .builtin_meta import _get_coco_instances_meta from .lvis_v0_5_categories import LVIS_CATEGORIES as LVIS_V0_5_CATEGORIES from .lvis_v1_categories import LVIS_CATEGORIES as LVIS_V1_CATEGORIES from .lvis_v1_category_image_count import LVIS_CATEGORY_IMAGE_COUNT as LVIS_V1_CATEGORY_IMAGE_COUNT """ This file contains functions to parse LVIS-format annotations into dicts in the "Detectron2 format". """ logger = logging.getLogger(__name__) __all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"] def register_lvis_instances(name, metadata, json_file, image_root): """ Register a dataset in LVIS's json annotation format for instance detection and segmentation. Args: name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train". metadata (dict): extra metadata associated with this dataset. It can be an empty dict. json_file (str): path to the json instance annotation file. image_root (str or path-like): directory which contains all the images. """ DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, name)) MetadataCatalog.get(name).set( json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata ) def load_lvis_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None): """ Load a json file in LVIS's annotation format. Args: json_file (str): full path to the LVIS json annotation file. image_root (str): the directory where the images in this json file exists. dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train"). If provided, this function will put "thing_classes" into the metadata associated with this dataset. extra_annotation_keys (list[str]): list of per-annotation keys that should also be loaded into the dataset dict (besides "bbox", "bbox_mode", "category_id", "segmentation"). The values for these keys will be returned as-is. Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) Notes: 1. This function does not read the image files. The results do not have the "image" field. """ from lvis import LVIS json_file = PathManager.get_local_path(json_file) timer = Timer() lvis_api = LVIS(json_file) if timer.seconds() > 1: logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) if dataset_name is not None: meta = get_lvis_instances_meta(dataset_name) MetadataCatalog.get(dataset_name).set(**meta) # sort indices for reproducible results img_ids = sorted(lvis_api.imgs.keys()) # imgs is a list of dicts, each looks something like: # {'license': 4, # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', # 'file_name': 'COCO_val2014_000000001268.jpg', # 'height': 427, # 'width': 640, # 'date_captured': '2013-11-17 05:57:24', # 'id': 1268} imgs = lvis_api.load_imgs(img_ids) # anns is a list[list[dict]], where each dict is an annotation # record for an object. The inner list enumerates the objects in an image # and the outer list enumerates over images. Example of anns[0]: # [{'segmentation': [[192.81, # 247.09, # ... # 219.03, # 249.06]], # 'area': 1035.749, # 'image_id': 1268, # 'bbox': [192.81, 224.8, 74.73, 33.43], # 'category_id': 16, # 'id': 42986}, # ...] anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] # Sanity check that each annotation has a unique id ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format( json_file ) imgs_anns = list(zip(imgs, anns)) logger.info("Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file)) if extra_annotation_keys: logger.info( "The following extra annotation keys will be loaded: {} ".format(extra_annotation_keys) ) else: extra_annotation_keys = [] def get_file_name(img_root, img_dict): # Determine the path including the split folder ("train2017", "val2017", "test2017") from # the coco_url field. Example: # 'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg' split_folder, file_name = img_dict["coco_url"].split("/")[-2:] return os.path.join(img_root + split_folder, file_name) dataset_dicts = [] for (img_dict, anno_dict_list) in imgs_anns: record = {} record["file_name"] = get_file_name(image_root, img_dict) record["height"] = img_dict["height"] record["width"] = img_dict["width"] record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", []) record["neg_category_ids"] = img_dict.get("neg_category_ids", []) image_id = record["image_id"] = img_dict["id"] objs = [] for anno in anno_dict_list: # Check that the image_id in this annotation is the same as # the image_id we're looking at. # This fails only when the data parsing logic or the annotation file is buggy. assert anno["image_id"] == image_id obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} # LVIS data loader can be used to load COCO dataset categories. In this case `meta` # variable will have a field with COCO-specific category mapping. if dataset_name is not None and "thing_dataset_id_to_contiguous_id" in meta: obj["category_id"] = meta["thing_dataset_id_to_contiguous_id"][anno["category_id"]] else: obj["category_id"] = anno["category_id"] - 1 # Convert 1-indexed to 0-indexed segm = anno["segmentation"] # list[list[float]] # filter out invalid polygons (< 3 points) valid_segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] assert len(segm) == len( valid_segm ), "Annotation contains an invalid polygon with < 3 points" assert len(segm) > 0 obj["segmentation"] = segm for extra_ann_key in extra_annotation_keys: obj[extra_ann_key] = anno[extra_ann_key] objs.append(obj) record["annotations"] = objs dataset_dicts.append(record) return dataset_dicts def get_lvis_instances_meta(dataset_name): """ Load LVIS metadata. Args: dataset_name (str): LVIS dataset name without the split name (e.g., "lvis_v0.5"). Returns: dict: LVIS metadata with keys: thing_classes """ if "cocofied" in dataset_name: return _get_coco_instances_meta() if "v0.5" in dataset_name: return _get_lvis_instances_meta_v0_5() elif "v1" in dataset_name: return _get_lvis_instances_meta_v1() raise ValueError("No built-in metadata for dataset {}".format(dataset_name)) def _get_lvis_instances_meta_v0_5(): assert len(LVIS_V0_5_CATEGORIES) == 1230 cat_ids = [k["id"] for k in LVIS_V0_5_CATEGORIES] assert min(cat_ids) == 1 and max(cat_ids) == len( cat_ids ), "Category ids are not in [1, #categories], as expected" # Ensure that the category list is sorted by id lvis_categories = sorted(LVIS_V0_5_CATEGORIES, key=lambda x: x["id"]) thing_classes = [k["synonyms"][0] for k in lvis_categories] meta = {"thing_classes": thing_classes} return meta def _get_lvis_instances_meta_v1(): assert len(LVIS_V1_CATEGORIES) == 1203 cat_ids = [k["id"] for k in LVIS_V1_CATEGORIES] assert min(cat_ids) == 1 and max(cat_ids) == len( cat_ids ), "Category ids are not in [1, #categories], as expected" # Ensure that the category list is sorted by id lvis_categories = sorted(LVIS_V1_CATEGORIES, key=lambda x: x["id"]) thing_classes = [k["synonyms"][0] for k in lvis_categories] meta = {"thing_classes": thing_classes, "class_image_count": LVIS_V1_CATEGORY_IMAGE_COUNT} return meta if __name__ == "__main__": """ Test the LVIS json dataset loader. Usage: python -m detectron2.data.datasets.lvis \ path/to/json path/to/image_root dataset_name vis_limit """ import sys import numpy as np from annotator.oneformer.detectron2.utils.logger import setup_logger from PIL import Image import annotator.oneformer.detectron2.data.datasets # noqa # add pre-defined metadata from annotator.oneformer.detectron2.utils.visualizer import Visualizer logger = setup_logger(name=__name__) meta = MetadataCatalog.get(sys.argv[3]) dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3]) logger.info("Done loading {} samples.".format(len(dicts))) dirname = "lvis-data-vis" os.makedirs(dirname, exist_ok=True) for d in dicts[: int(sys.argv[4])]: img = np.array(Image.open(d["file_name"])) visualizer = Visualizer(img, metadata=meta) vis = visualizer.draw_dataset_dict(d) fpath = os.path.join(dirname, os.path.basename(d["file_name"])) vis.save(fpath) ================================================ FILE: annotator/oneformer/detectron2/data/datasets/lvis_v0_5_categories.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Autogen with # with open("lvis_v0.5_val.json", "r") as f: # a = json.load(f) # c = a["categories"] # for x in c: # del x["image_count"] # del x["instance_count"] # LVIS_CATEGORIES = repr(c) + " # noqa" # fmt: off LVIS_CATEGORIES = [{'frequency': 'r', 'id': 1, 'synset': 'acorn.n.01', 'synonyms': ['acorn'], 'def': 'nut from an oak tree', 'name': 'acorn'}, {'frequency': 'c', 'id': 2, 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'id': 3, 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'id': 4, 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'c', 'id': 5, 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'id': 6, 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'r', 'id': 7, 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'id': 8, 'synset': 'almond.n.02', 'synonyms': ['almond'], 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'id': 9, 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'r', 'id': 10, 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'id': 11, 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'id': 12, 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 'transmitting_aerial'], 'def': 'an electrical device that sends or receives radio or television signals', 'name': 'antenna'}, {'frequency': 'f', 'id': 13, 'synset': 'apple.n.01', 'synonyms': ['apple'], 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'id': 14, 'synset': 'apple_juice.n.01', 'synonyms': ['apple_juice'], 'def': 'the juice of apples', 'name': 'apple_juice'}, {'frequency': 'r', 'id': 15, 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'id': 16, 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'id': 17, 'synset': 'apron.n.01', 'synonyms': ['apron'], 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'id': 18, 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'c', 'id': 19, 'synset': 'armband.n.02', 'synonyms': ['armband'], 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'id': 20, 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'id': 21, 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'def': 'a large wardrobe or cabinet', 'name': 'armoire'}, {'frequency': 'r', 'id': 22, 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'id': 23, 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'id': 24, 'synset': 'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'id': 25, 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'id': 26, 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'id': 27, 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'c', 'id': 28, 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'id': 29, 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'id': 30, 'synset': 'awning.n.01', 'synonyms': ['awning'], 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'id': 31, 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'f', 'id': 32, 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'def': 'a small vehicle with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'id': 33, 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'id': 34, 'synset': 'backpack.n.01', 'synonyms': ['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'id': 35, 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'id': 36, 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'id': 37, 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'id': 38, 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'id': 39, 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'def': 'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'id': 40, 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'def': 'something used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'id': 41, 'synset': 'ball.n.06', 'synonyms': ['ball'], 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'id': 42, 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'id': 43, 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'id': 44, 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'id': 45, 'synset': 'banana.n.02', 'synonyms': ['banana'], 'def': 'elongated crescent-shaped yellow fruit with soft sweet flesh', 'name': 'banana'}, {'frequency': 'r', 'id': 46, 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'id': 47, 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'c', 'id': 48, 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'id': 49, 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'id': 50, 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'id': 51, 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'id': 52, 'synset': 'barge.n.01', 'synonyms': ['barge'], 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'id': 53, 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'id': 54, 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'id': 55, 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'id': 56, 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'def': 'a place that the runner must touch before scoring', 'name': 'baseball_base'}, {'frequency': 'f', 'id': 57, 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'id': 58, 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'id': 59, 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'id': 60, 'synset': 'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, {'frequency': 'f', 'id': 61, 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'id': 62, 'synset': 'basket.n.03', 'synonyms': ['basketball_hoop'], 'def': 'metal hoop supporting a net through which players try to throw the basketball', 'name': 'basketball_hoop'}, {'frequency': 'c', 'id': 63, 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'id': 64, 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'r', 'id': 65, 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'id': 66, 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'id': 67, 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'id': 68, 'synset': 'bathrobe.n.01', 'synonyms': ['bathrobe'], 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'id': 69, 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'id': 70, 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'id': 71, 'synset': 'battery.n.02', 'synonyms': ['battery'], 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'id': 72, 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'id': 73, 'synset': 'bead.n.01', 'synonyms': ['bead'], 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'r', 'id': 74, 'synset': 'beaker.n.01', 'synonyms': ['beaker'], 'def': 'a flatbottomed jar made of glass or plastic; used for chemistry', 'name': 'beaker'}, {'frequency': 'c', 'id': 75, 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'id': 76, 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'id': 77, 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'id': 78, 'synset': 'bear.n.01', 'synonyms': ['bear'], 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'id': 79, 'synset': 'bed.n.01', 'synonyms': ['bed'], 'def': 'a piece of furniture that provides a place to sleep', 'name': 'bed'}, {'frequency': 'c', 'id': 80, 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 'f', 'id': 81, 'synset': 'beef.n.01', 'synonyms': ['cow'], 'def': 'cattle that are reared for their meat', 'name': 'cow'}, {'frequency': 'c', 'id': 82, 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'id': 83, 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'id': 84, 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'id': 85, 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'id': 86, 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'id': 87, 'synset': 'bell.n.01', 'synonyms': ['bell'], 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'id': 88, 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'id': 89, 'synset': 'belt.n.02', 'synonyms': ['belt'], 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'id': 90, 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'id': 91, 'synset': 'bench.n.01', 'synonyms': ['bench'], 'def': 'a long seat for more than one person', 'name': 'bench'}, {'frequency': 'c', 'id': 92, 'synset': 'beret.n.01', 'synonyms': ['beret'], 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'id': 93, 'synset': 'bib.n.02', 'synonyms': ['bib'], 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 'r', 'id': 94, 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'id': 95, 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'id': 96, 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'c', 'id': 97, 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'id': 98, 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'id': 99, 'synset': 'bird.n.01', 'synonyms': ['bird'], 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'r', 'id': 100, 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'r', 'id': 101, 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'id': 102, 'synset': 'birdcage.n.01', 'synonyms': ['birdcage'], 'def': 'a cage in which a bird can be kept', 'name': 'birdcage'}, {'frequency': 'c', 'id': 103, 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'id': 104, 'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'id': 105, 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'id': 106, 'synset': 'biscuit.n.01', 'synonyms': ['biscuit_(bread)'], 'def': 'small round bread leavened with baking-powder or soda', 'name': 'biscuit_(bread)'}, {'frequency': 'r', 'id': 107, 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'id': 108, 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'id': 109, 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'id': 110, 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'id': 111, 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'id': 112, 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'id': 113, 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'c', 'id': 114, 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'def': 'a light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'c', 'id': 115, 'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'id': 116, 'synset': 'boar.n.02', 'synonyms': ['boar'], 'def': 'an uncastrated male hog', 'name': 'boar'}, {'frequency': 'r', 'id': 117, 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'id': 118, 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'c', 'id': 119, 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'r', 'id': 120, 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'id': 121, 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'id': 122, 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 'bolo_tie'}, {'frequency': 'c', 'id': 123, 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'id': 124, 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'id': 125, 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'id': 126, 'synset': 'book.n.01', 'synonyms': ['book'], 'def': 'a written work or composition that has been published', 'name': 'book'}, {'frequency': 'r', 'id': 127, 'synset': 'book_bag.n.01', 'synonyms': ['book_bag'], 'def': 'a bag in which students carry their books', 'name': 'book_bag'}, {'frequency': 'c', 'id': 128, 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'id': 129, 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'id': 130, 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'id': 131, 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'id': 132, 'synset': 'boot.n.01', 'synonyms': ['boot'], 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'id': 133, 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'id': 134, 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, {'frequency': 'c', 'id': 135, 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'id': 136, 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'id': 137, 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'id': 138, 'synset': 'bow_tie.n.01', 'synonyms': ['bow-tie', 'bowtie'], 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'id': 139, 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'id': 140, 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'id': 141, 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'id': 142, 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'def': 'a large ball with finger holes used in the sport of bowling', 'name': 'bowling_ball'}, {'frequency': 'r', 'id': 143, 'synset': 'bowling_pin.n.01', 'synonyms': ['bowling_pin'], 'def': 'a club-shaped wooden object used in bowling', 'name': 'bowling_pin'}, {'frequency': 'r', 'id': 144, 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'id': 145, 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'id': 146, 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'id': 147, 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'id': 148, 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'id': 149, 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'def': 'a container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'r', 'id': 150, 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'c', 'id': 151, 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'id': 152, 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'c', 'id': 153, 'synset': 'bristle_brush.n.01', 'synonyms': ['bristle_brush'], 'def': 'a brush that is made with the short stiff hairs of an animal or plant', 'name': 'bristle_brush'}, {'frequency': 'f', 'id': 154, 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, {'frequency': 'r', 'id': 155, 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'id': 156, 'synset': 'broom.n.01', 'synonyms': ['broom'], 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'id': 157, 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'id': 158, 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'id': 159, 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'id': 160, 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'def': 'a roughly cylindrical vessel that is open at the top', 'name': 'bucket'}, {'frequency': 'r', 'id': 161, 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'id': 162, 'synset': 'bull.n.11', 'synonyms': ['bull'], 'def': 'mature male cow', 'name': 'bull'}, {'frequency': 'r', 'id': 163, 'synset': 'bulldog.n.01', 'synonyms': ['bulldog'], 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 'bulldog'}, {'frequency': 'r', 'id': 164, 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'id': 165, 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'id': 166, 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'id': 167, 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'id': 168, 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'r', 'id': 169, 'synset': 'bully_beef.n.01', 'synonyms': ['corned_beef', 'corn_beef'], 'def': 'beef cured or pickled in brine', 'name': 'corned_beef'}, {'frequency': 'f', 'id': 170, 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'id': 171, 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'id': 172, 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'def': 'a float attached by rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'id': 173, 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'id': 174, 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'id': 175, 'synset': 'business_card.n.01', 'synonyms': ['business_card'], 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'c', 'id': 176, 'synset': 'butcher_knife.n.01', 'synonyms': ['butcher_knife'], 'def': 'a large sharp knife for cutting or trimming meat', 'name': 'butcher_knife'}, {'frequency': 'c', 'id': 177, 'synset': 'butter.n.01', 'synonyms': ['butter'], 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'id': 178, 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'id': 179, 'synset': 'button.n.01', 'synonyms': ['button'], 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'id': 180, 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'id': 181, 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'r', 'id': 182, 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'def': 'a car on a freight train for use of the train crew; usually the last car on the train', 'name': 'cabin_car'}, {'frequency': 'f', 'id': 183, 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 'id': 184, 'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'id': 185, 'synset': 'cake.n.03', 'synonyms': ['cake'], 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'id': 186, 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'id': 187, 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'id': 188, 'synset': 'calf.n.01', 'synonyms': ['calf'], 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'id': 189, 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'id': 190, 'synset': 'camel.n.01', 'synonyms': ['camel'], 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'id': 191, 'synset': 'camera.n.01', 'synonyms': ['camera'], 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'id': 192, 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'id': 193, 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 'f', 'id': 194, 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'def': 'airtight sealed metal container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'id': 195, 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'r', 'id': 196, 'synset': 'candelabrum.n.01', 'synonyms': ['candelabrum', 'candelabra'], 'def': 'branched candlestick; ornamental; has several lights', 'name': 'candelabrum'}, {'frequency': 'f', 'id': 197, 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'id': 198, 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'id': 199, 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'id': 200, 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'id': 201, 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'id': 202, 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'r', 'id': 203, 'synset': 'cannon.n.02', 'synonyms': ['cannon'], 'def': 'heavy gun fired from a tank', 'name': 'cannon'}, {'frequency': 'c', 'id': 204, 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'r', 'id': 205, 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'id': 206, 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'def': 'a flask for carrying water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'c', 'id': 207, 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'id': 208, 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'r', 'id': 209, 'synset': 'cape.n.02', 'synonyms': ['cape'], 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'id': 210, 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'id': 211, 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'id': 212, 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'def': 'a wheeled vehicle adapted to the rails of railroad', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'id': 213, 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'id': 214, 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'id': 215, 'synset': 'card.n.02', 'synonyms': ['identity_card'], 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'id': 216, 'synset': 'card.n.03', 'synonyms': ['card'], 'def': 'a rectangular piece of paper used to send messages (e.g. greetings or pictures)', 'name': 'card'}, {'frequency': 'r', 'id': 217, 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'id': 218, 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'id': 219, 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'id': 220, 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'id': 221, 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'c', 'id': 222, 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'id': 223, 'synset': 'cart.n.01', 'synonyms': ['cart'], 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'id': 224, 'synset': 'carton.n.02', 'synonyms': ['carton'], 'def': 'a box made of cardboard; opens by flaps on top', 'name': 'carton'}, {'frequency': 'c', 'id': 225, 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'id': 226, 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'id': 227, 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'id': 228, 'synset': 'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'id': 229, 'synset': 'cat.n.01', 'synonyms': ['cat'], 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'c', 'id': 230, 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'r', 'id': 231, 'synset': 'caviar.n.01', 'synonyms': ['caviar', 'caviare'], 'def': "salted roe of sturgeon or other large fish; usually served as an hors d'oeuvre", 'name': 'caviar'}, {'frequency': 'c', 'id': 232, 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'id': 233, 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'c', 'id': 234, 'synset': 'celery.n.01', 'synonyms': ['celery'], 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 'id': 235, 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'id': 236, 'synset': 'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 'chain_mail'}, {'frequency': 'f', 'id': 237, 'synset': 'chair.n.01', 'synonyms': ['chair'], 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'id': 238, 'synset': 'chaise_longue.n.01', 'synonyms': ['chaise_longue', 'chaise', 'daybed'], 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'id': 239, 'synset': 'champagne.n.01', 'synonyms': ['champagne'], 'def': 'a white sparkling wine produced in Champagne or resembling that produced there', 'name': 'champagne'}, {'frequency': 'f', 'id': 240, 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'id': 241, 'synset': 'chap.n.04', 'synonyms': ['chap'], 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'id': 242, 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'id': 243, 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'id': 244, 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'id': 245, 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'r', 'id': 246, 'synset': 'chest_of_drawers.n.01', 'synonyms': ['chest_of_drawers_(furniture)', 'bureau_(furniture)', 'chest_(furniture)'], 'def': 'furniture with drawers for keeping clothes', 'name': 'chest_of_drawers_(furniture)'}, {'frequency': 'c', 'id': 247, 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'id': 248, 'synset': 'chicken_wire.n.01', 'synonyms': ['chicken_wire'], 'def': 'a galvanized wire network with a hexagonal mesh; used to build fences', 'name': 'chicken_wire'}, {'frequency': 'r', 'id': 249, 'synset': 'chickpea.n.01', 'synonyms': ['chickpea', 'garbanzo'], 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'r', 'id': 250, 'synset': 'chihuahua.n.03', 'synonyms': ['Chihuahua'], 'def': 'an old breed of tiny short-haired dog with protruding eyes from Mexico', 'name': 'Chihuahua'}, {'frequency': 'r', 'id': 251, 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'id': 252, 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'id': 253, 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'def': 'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'id': 254, 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, {'frequency': 'r', 'id': 255, 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'id': 256, 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'id': 257, 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'id': 258, 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'id': 259, 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 'id': 260, 'synset': 'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'def': 'necklace that fits tightly around the neck', 'name': 'choker'}, {'frequency': 'f', 'id': 261, 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'c', 'id': 262, 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'id': 263, 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'id': 264, 'synset': 'chute.n.02', 'synonyms': ['slide'], 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'id': 265, 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'id': 266, 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'c', 'id': 267, 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'id': 268, 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'id': 269, 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'id': 270, 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'r', 'id': 271, 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'def': 'a fastener (as a buckle or hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'id': 272, 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'id': 273, 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'def': 'a variety of mandarin orange', 'name': 'clementine'}, {'frequency': 'c', 'id': 274, 'synset': 'clip.n.03', 'synonyms': ['clip'], 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'id': 275, 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'def': 'a small writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'f', 'id': 276, 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'id': 277, 'synset': 'clock_tower.n.01', 'synonyms': ['clock_tower'], 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'id': 278, 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'id': 279, 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'id': 280, 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'id': 281, 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'id': 282, 'synset': 'coat.n.01', 'synonyms': ['coat'], 'def': 'an outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'id': 283, 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'r', 'id': 284, 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'id': 285, 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'c', 'id': 286, 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'r', 'id': 287, 'synset': 'coffee_filter.n.01', 'synonyms': ['coffee_filter'], 'def': 'filter (usually of paper) that passes the coffee and retains the coffee grounds', 'name': 'coffee_filter'}, {'frequency': 'f', 'id': 288, 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'id': 289, 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'id': 290, 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'id': 291, 'synset': 'coil.n.05', 'synonyms': ['coil'], 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'id': 292, 'synset': 'coin.n.01', 'synonyms': ['coin'], 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'r', 'id': 293, 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'def': 'bowl-shaped strainer; used to wash or drain foods', 'name': 'colander'}, {'frequency': 'c', 'id': 294, 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'id': 295, 'synset': 'coloring_material.n.01', 'synonyms': ['coloring_material', 'colouring_material'], 'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'id': 296, 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'id': 297, 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'id': 298, 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'f', 'id': 299, 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'r', 'id': 300, 'synset': 'concrete_mixer.n.01', 'synonyms': ['concrete_mixer', 'cement_mixer'], 'def': 'a machine with a large revolving drum in which cement/concrete is mixed', 'name': 'concrete_mixer'}, {'frequency': 'f', 'id': 301, 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'id': 302, 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'id': 303, 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'id': 304, 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'def': 'a sofa that can be converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'c', 'id': 305, 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'id': 306, 'synset': 'cookie_jar.n.01', 'synonyms': ['cookie_jar', 'cooky_jar'], 'def': 'a jar in which cookies are kept (and sometimes money is hidden)', 'name': 'cookie_jar'}, {'frequency': 'r', 'id': 307, 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'id': 308, 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'c', 'id': 309, 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'id': 310, 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'r', 'id': 311, 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'c', 'id': 312, 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'def': 'ears of corn that can be prepared and served for human food', 'name': 'edible_corn'}, {'frequency': 'r', 'id': 313, 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 'id': 314, 'synset': 'cornet.n.01', 'synonyms': ['cornet', 'horn', 'trumpet'], 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'id': 315, 'synset': 'cornice.n.01', 'synonyms': ['cornice', 'valance', 'valance_board', 'pelmet'], 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'id': 316, 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'r', 'id': 317, 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'r', 'id': 318, 'synset': 'cos.n.02', 'synonyms': ['romaine_lettuce'], 'def': 'lettuce with long dark-green leaves in a loosely packed elongated head', 'name': 'romaine_lettuce'}, {'frequency': 'c', 'id': 319, 'synset': 'costume.n.04', 'synonyms': ['costume'], 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'id': 320, 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'id': 321, 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'r', 'id': 322, 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'id': 323, 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'r', 'id': 324, 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'c', 'id': 325, 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'id': 326, 'synset': 'crape.n.01', 'synonyms': ['crape', 'crepe', 'French_pancake'], 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'id': 327, 'synset': 'crate.n.01', 'synonyms': ['crate'], 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'r', 'id': 328, 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'id': 329, 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'r', 'id': 330, 'synset': 'credit_card.n.01', 'synonyms': ['credit_card', 'charge_card', 'debit_card'], 'def': 'a card, usually plastic, used to pay for goods and services', 'name': 'credit_card'}, {'frequency': 'c', 'id': 331, 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'id': 332, 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'def': 'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'id': 333, 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'def': 'an earthen jar (made of baked clay)', 'name': 'crock_pot'}, {'frequency': 'f', 'id': 334, 'synset': 'crossbar.n.01', 'synonyms': ['crossbar'], 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'id': 335, 'synset': 'crouton.n.01', 'synonyms': ['crouton'], 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'r', 'id': 336, 'synset': 'crow.n.01', 'synonyms': ['crow'], 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'c', 'id': 337, 'synset': 'crown.n.04', 'synonyms': ['crown'], 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'id': 338, 'synset': 'crucifix.n.01', 'synonyms': ['crucifix'], 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'id': 339, 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'id': 340, 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'c', 'id': 341, 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'r', 'id': 342, 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'id': 343, 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'r', 'id': 344, 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'id': 345, 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'id': 346, 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'id': 347, 'synset': 'cup.n.01', 'synonyms': ['cup'], 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'id': 348, 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'def': 'a metal vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'c', 'id': 349, 'synset': 'cupcake.n.01', 'synonyms': ['cupcake'], 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'id': 350, 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'id': 351, 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'id': 352, 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'def': 'hanging cloth used as a blind (especially for a window)', 'name': 'curtain'}, {'frequency': 'f', 'id': 353, 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'id': 354, 'synset': 'custard.n.01', 'synonyms': ['custard'], 'def': 'sweetened mixture of milk and eggs baked or boiled or frozen', 'name': 'custard'}, {'frequency': 'c', 'id': 355, 'synset': 'cutter.n.06', 'synonyms': ['cutting_tool'], 'def': 'a cutting implement; a tool for cutting', 'name': 'cutting_tool'}, {'frequency': 'r', 'id': 356, 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'id': 357, 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'id': 358, 'synset': 'dachshund.n.01', 'synonyms': ['dachshund', 'dachsie', 'badger_dog'], 'def': 'small long-bodied short-legged breed of dog having a short sleek coat and long drooping ears', 'name': 'dachshund'}, {'frequency': 'r', 'id': 359, 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'id': 360, 'synset': 'dartboard.n.01', 'synonyms': ['dartboard'], 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'id': 361, 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'id': 362, 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'id': 363, 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'id': 364, 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'id': 365, 'synset': 'desk.n.01', 'synonyms': ['desk'], 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'id': 366, 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'id': 367, 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'id': 368, 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'def': 'a daily written record of (usually personal) experiences and observations', 'name': 'diary'}, {'frequency': 'r', 'id': 369, 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'id': 370, 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'def': 'a small boat of shallow draft with seats and oars with which it is propelled', 'name': 'dinghy'}, {'frequency': 'f', 'id': 371, 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'id': 372, 'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'c', 'id': 373, 'synset': 'dish.n.01', 'synonyms': ['dish'], 'def': 'a piece of dishware normally used as a container for holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'id': 374, 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, {'frequency': 'c', 'id': 375, 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'def': 'a cloth for washing dishes', 'name': 'dishrag'}, {'frequency': 'c', 'id': 376, 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'id': 377, 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'id': 378, 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid'], 'def': 'a low-sudsing detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'r', 'id': 379, 'synset': 'diskette.n.01', 'synonyms': ['diskette', 'floppy', 'floppy_disk'], 'def': 'a small plastic magnetic disk enclosed in a stiff envelope used to store data', 'name': 'diskette'}, {'frequency': 'c', 'id': 380, 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'c', 'id': 381, 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'def': 'a disposable cup made of paper; for holding drinks', 'name': 'Dixie_cup'}, {'frequency': 'f', 'id': 382, 'synset': 'dog.n.01', 'synonyms': ['dog'], 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'id': 383, 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'c', 'id': 384, 'synset': 'doll.n.01', 'synonyms': ['doll'], 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'id': 385, 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'id': 386, 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'id': 387, 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'r', 'id': 388, 'synset': 'domino.n.03', 'synonyms': ['eye_mask'], 'def': 'a mask covering the upper part of the face but with holes for the eyes', 'name': 'eye_mask'}, {'frequency': 'r', 'id': 389, 'synset': 'doorbell.n.01', 'synonyms': ['doorbell', 'buzzer'], 'def': 'a button at an outer door that gives a ringing or buzzing signal when pushed', 'name': 'doorbell'}, {'frequency': 'f', 'id': 390, 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'id': 391, 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'def': 'a mat placed outside an exterior door for wiping the shoes before entering', 'name': 'doormat'}, {'frequency': 'f', 'id': 392, 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'id': 393, 'synset': 'dove.n.01', 'synonyms': ['dove'], 'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'id': 394, 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'id': 395, 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'def': 'a boxlike container in a piece of furniture; made so as to slide in and out', 'name': 'drawer'}, {'frequency': 'c', 'id': 396, 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'id': 397, 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'id': 398, 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'c', 'id': 399, 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'c', 'id': 400, 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'id': 401, 'synset': 'drill.n.01', 'synonyms': ['drill'], 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'id': 402, 'synset': 'drinking_fountain.n.01', 'synonyms': ['drinking_fountain'], 'def': 'a public fountain to provide a jet of drinking water', 'name': 'drinking_fountain'}, {'frequency': 'r', 'id': 403, 'synset': 'drone.n.04', 'synonyms': ['drone'], 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'id': 404, 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'id': 405, 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'id': 406, 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'id': 407, 'synset': 'duck.n.01', 'synonyms': ['duck'], 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'r', 'id': 408, 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'id': 409, 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'id': 410, 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'def': 'a large cylindrical bag of heavy cloth', 'name': 'duffel_bag'}, {'frequency': 'r', 'id': 411, 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'def': 'an exercising weight with two ball-like ends connected by a short handle', 'name': 'dumbbell'}, {'frequency': 'c', 'id': 412, 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'id': 413, 'synset': 'dustpan.n.02', 'synonyms': ['dustpan'], 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'r', 'id': 414, 'synset': 'dutch_oven.n.02', 'synonyms': ['Dutch_oven'], 'def': 'iron or earthenware cooking pot; used for stews', 'name': 'Dutch_oven'}, {'frequency': 'c', 'id': 415, 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'def': 'large birds of prey noted for their broad wings and strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'id': 416, 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'id': 417, 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'def': 'a soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'id': 418, 'synset': 'earring.n.01', 'synonyms': ['earring'], 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'id': 419, 'synset': 'easel.n.01', 'synonyms': ['easel'], 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'id': 420, 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'id': 421, 'synset': 'eel.n.01', 'synonyms': ['eel'], 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'id': 422, 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'id': 423, 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'id': 424, 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'id': 425, 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'id': 426, 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'id': 427, 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'def': 'a chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'id': 428, 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'id': 429, 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'r', 'id': 430, 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'id': 431, 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'id': 432, 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'def': 'an implement used to erase something', 'name': 'eraser'}, {'frequency': 'r', 'id': 433, 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 'id': 434, 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'id': 435, 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'id': 436, 'synset': 'fan.n.01', 'synonyms': ['fan'], 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'id': 437, 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'id': 438, 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 'id': 439, 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'id': 440, 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'r', 'id': 441, 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'id': 442, 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'id': 443, 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'id': 444, 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'id': 445, 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'id': 446, 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'id': 447, 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'c', 'id': 448, 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, {'frequency': 'c', 'id': 449, 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'id': 450, 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'id': 451, 'synset': 'fireplace.n.01', 'synonyms': ['fireplace'], 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'id': 452, 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 'hydrant'], 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'c', 'id': 453, 'synset': 'fish.n.01', 'synonyms': ['fish'], 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'r', 'id': 454, 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'id': 455, 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'def': 'a transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'r', 'id': 456, 'synset': 'fishing_boat.n.01', 'synonyms': ['fishing_boat', 'fishing_vessel'], 'def': 'a vessel for fishing', 'name': 'fishing_boat'}, {'frequency': 'c', 'id': 457, 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'id': 458, 'synset': 'flag.n.01', 'synonyms': ['flag'], 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'id': 459, 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'id': 460, 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'id': 461, 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'r', 'id': 462, 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'id': 463, 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'id': 464, 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'id': 465, 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'id': 466, 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'id': 467, 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'id': 468, 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'r', 'id': 469, 'synset': 'foal.n.01', 'synonyms': ['foal'], 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'id': 470, 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'id': 471, 'synset': 'food_processor.n.01', 'synonyms': ['food_processor'], 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'id': 472, 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'id': 473, 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'id': 474, 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'id': 475, 'synset': 'fork.n.01', 'synonyms': ['fork'], 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'r', 'id': 476, 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'r', 'id': 477, 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'r', 'id': 478, 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'id': 479, 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'def': 'anything that freshens', 'name': 'freshener'}, {'frequency': 'f', 'id': 480, 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'id': 481, 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, {'frequency': 'c', 'id': 482, 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'r', 'id': 483, 'synset': 'fruit_salad.n.01', 'synonyms': ['fruit_salad'], 'def': 'salad composed of fruits', 'name': 'fruit_salad'}, {'frequency': 'c', 'id': 484, 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'id': 485, 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'id': 486, 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'c', 'id': 487, 'synset': 'futon.n.01', 'synonyms': ['futon'], 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'id': 488, 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'id': 489, 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'id': 490, 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'id': 491, 'synset': 'garden_hose.n.01', 'synonyms': ['garden_hose'], 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'id': 492, 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 'gargle'}, {'frequency': 'r', 'id': 493, 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 'gargoyle'}, {'frequency': 'c', 'id': 494, 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'id': 495, 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'r', 'id': 496, 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'def': 'small swift graceful antelope of Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'id': 497, 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'id': 498, 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'c', 'id': 499, 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'id': 500, 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'id': 501, 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'id': 502, 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'id': 503, 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'id': 504, 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 'drinking_glass'], 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'id': 505, 'synset': 'globe.n.03', 'synonyms': ['globe'], 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'id': 506, 'synset': 'glove.n.02', 'synonyms': ['glove'], 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'id': 507, 'synset': 'goat.n.01', 'synonyms': ['goat'], 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'id': 508, 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'id': 509, 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'r', 'id': 510, 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'id': 511, 'synset': 'golfcart.n.01', 'synonyms': ['golfcart'], 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'id': 512, 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'def': 'long narrow flat-bottomed boat propelled by sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'id': 513, 'synset': 'goose.n.01', 'synonyms': ['goose'], 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'id': 514, 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'id': 515, 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'r', 'id': 516, 'synset': 'gown.n.04', 'synonyms': ['surgical_gown', 'scrubs_(surgical_clothing)'], 'def': 'protective garment worn by surgeons during operations', 'name': 'surgical_gown'}, {'frequency': 'f', 'id': 517, 'synset': 'grape.n.01', 'synonyms': ['grape'], 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'r', 'id': 518, 'synset': 'grasshopper.n.01', 'synonyms': ['grasshopper'], 'def': 'plant-eating insect with hind legs adapted for leaping', 'name': 'grasshopper'}, {'frequency': 'c', 'id': 519, 'synset': 'grater.n.01', 'synonyms': ['grater'], 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'id': 520, 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'id': 521, 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'c', 'id': 522, 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'c', 'id': 523, 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'id': 524, 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'r', 'id': 525, 'synset': 'grillroom.n.01', 'synonyms': ['grillroom', 'grill_(restaurant)'], 'def': 'a restaurant where food is cooked on a grill', 'name': 'grillroom'}, {'frequency': 'r', 'id': 526, 'synset': 'grinder.n.04', 'synonyms': ['grinder_(tool)'], 'def': 'a machine tool that polishes metal', 'name': 'grinder_(tool)'}, {'frequency': 'r', 'id': 527, 'synset': 'grits.n.01', 'synonyms': ['grits', 'hominy_grits'], 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'id': 528, 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'id': 529, 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'r', 'id': 530, 'synset': 'guacamole.n.01', 'synonyms': ['guacamole'], 'def': 'a dip made of mashed avocado mixed with chopped onions and other seasonings', 'name': 'guacamole'}, {'frequency': 'f', 'id': 531, 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'id': 532, 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'id': 533, 'synset': 'gun.n.01', 'synonyms': ['gun'], 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'r', 'id': 534, 'synset': 'hair_spray.n.01', 'synonyms': ['hair_spray'], 'def': 'substance sprayed on the hair to hold it in place', 'name': 'hair_spray'}, {'frequency': 'c', 'id': 535, 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'id': 536, 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'id': 537, 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'f', 'id': 538, 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'def': 'meat cut from the thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 'id': 539, 'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'id': 540, 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'r', 'id': 541, 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'id': 542, 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'r', 'id': 543, 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'c', 'id': 544, 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'id': 545, 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'id': 546, 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'id': 547, 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'id': 548, 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'id': 549, 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'def': 'a square piece of cloth used for wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'id': 550, 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'def': 'the appendage to an object that is designed to be held in order to use or move it', 'name': 'handle'}, {'frequency': 'r', 'id': 551, 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'id': 552, 'synset': 'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'id': 553, 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'id': 554, 'synset': 'hat.n.01', 'synonyms': ['hat'], 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'id': 555, 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'r', 'id': 556, 'synset': 'hatch.n.03', 'synonyms': ['hatch'], 'def': 'a movable barrier covering a hatchway', 'name': 'hatch'}, {'frequency': 'c', 'id': 557, 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'def': 'a garment that covers the head and face', 'name': 'veil'}, {'frequency': 'f', 'id': 558, 'synset': 'headband.n.01', 'synonyms': ['headband'], 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'id': 559, 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'def': 'a vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'id': 560, 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'def': 'a powerful light with reflector; attached to the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'id': 561, 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'id': 562, 'synset': 'headset.n.01', 'synonyms': ['headset'], 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'id': 563, 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'r', 'id': 564, 'synset': 'hearing_aid.n.02', 'synonyms': ['hearing_aid'], 'def': 'an acoustic device used to direct sound to the ear of a hearing-impaired person', 'name': 'hearing_aid'}, {'frequency': 'c', 'id': 565, 'synset': 'heart.n.02', 'synonyms': ['heart'], 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'id': 566, 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'id': 567, 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'id': 568, 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'id': 569, 'synset': 'heron.n.02', 'synonyms': ['heron'], 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'id': 570, 'synset': 'highchair.n.01', 'synonyms': ['highchair', 'feeding_chair'], 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'id': 571, 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'def': 'a joint that holds two parts together so that one can swing relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'id': 572, 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'id': 573, 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'id': 574, 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'id': 575, 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'id': 576, 'synset': 'honey.n.01', 'synonyms': ['honey'], 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'id': 577, 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'id': 578, 'synset': 'hook.n.05', 'synonyms': ['hook'], 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'f', 'id': 579, 'synset': 'horse.n.01', 'synonyms': ['horse'], 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'id': 580, 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'id': 581, 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'id': 582, 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'def': 'a portable electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'id': 583, 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'id': 584, 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'id': 585, 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'r', 'id': 586, 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'id': 587, 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'c', 'id': 588, 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'id': 589, 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'id': 590, 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'id': 591, 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'def': 'an appliance included in some electric refrigerators for making ice cubes', 'name': 'ice_maker'}, {'frequency': 'r', 'id': 592, 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'id': 593, 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'def': 'skate consisting of a boot with a steel blade fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'r', 'id': 594, 'synset': 'ice_tea.n.01', 'synonyms': ['ice_tea', 'iced_tea'], 'def': 'strong tea served over ice', 'name': 'ice_tea'}, {'frequency': 'c', 'id': 595, 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'id': 596, 'synset': 'incense.n.01', 'synonyms': ['incense'], 'def': 'a substance that produces a fragrant odor when burned', 'name': 'incense'}, {'frequency': 'r', 'id': 597, 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'c', 'id': 598, 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'id': 599, 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 'smoothing_iron_(for_clothing)'], 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'r', 'id': 600, 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'id': 601, 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'r', 'id': 602, 'synset': 'jam.n.01', 'synonyms': ['jam'], 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'id': 603, 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'id': 604, 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'id': 605, 'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'id': 606, 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'id': 607, 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'c', 'id': 608, 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'id': 609, 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, {'frequency': 'r', 'id': 610, 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'id': 611, 'synset': 'kayak.n.01', 'synonyms': ['kayak'], 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'id': 612, 'synset': 'keg.n.02', 'synonyms': ['keg'], 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'id': 613, 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'id': 614, 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'id': 615, 'synset': 'key.n.01', 'synonyms': ['key'], 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'id': 616, 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'def': 'a plastic card used to gain access typically to a door', 'name': 'keycard'}, {'frequency': 'r', 'id': 617, 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'def': 'a knee-length pleated tartan skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'id': 618, 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'id': 619, 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'c', 'id': 620, 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'id': 621, 'synset': 'kite.n.03', 'synonyms': ['kite'], 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'id': 622, 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'id': 623, 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'id': 624, 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'id': 625, 'synset': 'knife.n.01', 'synonyms': ['knife'], 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'id': 626, 'synset': 'knight.n.02', 'synonyms': ['knight_(chess_piece)', 'horse_(chess_piece)'], 'def': 'a chess game piece shaped to resemble the head of a horse', 'name': 'knight_(chess_piece)'}, {'frequency': 'r', 'id': 627, 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'def': 'needle consisting of a slender rod with pointed ends; usually used in pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'id': 628, 'synset': 'knob.n.02', 'synonyms': ['knob'], 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'id': 629, 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'id': 630, 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'id': 631, 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'def': 'a light coat worn to protect clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'id': 632, 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'id': 633, 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'r', 'id': 634, 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'c', 'id': 635, 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'id': 636, 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'id': 637, 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'id': 638, 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'def': 'a metal post supporting an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, {'frequency': 'f', 'id': 639, 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'id': 640, 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'id': 641, 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'id': 642, 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'id': 643, 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'c', 'id': 644, 'synset': 'latch.n.02', 'synonyms': ['latch'], 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'id': 645, 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'id': 646, 'synset': 'leather.n.01', 'synonyms': ['leather'], 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'id': 647, 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'id': 648, 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'f', 'id': 649, 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'def': 'yellow oval fruit with juicy acidic flesh', 'name': 'lemon'}, {'frequency': 'r', 'id': 650, 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'id': 651, 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'def': 'leafy plant commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'id': 652, 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'id': 653, 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'id': 654, 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'id': 655, 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'def': 'glass bulb or tube shaped electric device that emits light (DO NOT MARK LAMPS AS A WHOLE)', 'name': 'lightbulb'}, {'frequency': 'r', 'id': 656, 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'def': 'a metallic conductor that is attached to a high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'c', 'id': 657, 'synset': 'lime.n.06', 'synonyms': ['lime'], 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'id': 658, 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'r', 'id': 659, 'synset': 'linen.n.02', 'synonyms': ['linen_paper'], 'def': 'a high-quality paper made of linen fibers or with a linen finish', 'name': 'linen_paper'}, {'frequency': 'c', 'id': 660, 'synset': 'lion.n.01', 'synonyms': ['lion'], 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'id': 661, 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'c', 'id': 662, 'synset': 'lipstick.n.01', 'synonyms': ['lipstick', 'lip_rouge'], 'def': 'makeup that is used to color the lips', 'name': 'lipstick'}, {'frequency': 'r', 'id': 663, 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'def': 'an alcoholic beverage that is distilled rather than fermented', 'name': 'liquor'}, {'frequency': 'r', 'id': 664, 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'r', 'id': 665, 'synset': 'loafer.n.02', 'synonyms': ['Loafer_(type_of_shoe)'], 'def': 'a low leather step-in shoe', 'name': 'Loafer_(type_of_shoe)'}, {'frequency': 'f', 'id': 666, 'synset': 'log.n.01', 'synonyms': ['log'], 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'id': 667, 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'c', 'id': 668, 'synset': 'lotion.n.01', 'synonyms': ['lotion'], 'def': 'any of various cosmetic preparations that are applied to the skin', 'name': 'lotion'}, {'frequency': 'f', 'id': 669, 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'def': 'electronic device that produces sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'id': 670, 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 'r', 'id': 671, 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 'id': 672, 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'id': 673, 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'r', 'id': 674, 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'c', 'id': 675, 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'id': 676, 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'id': 677, 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'c', 'id': 678, 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'id': 679, 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'def': 'a container (usually in a barn or stable) from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'id': 680, 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'c', 'id': 681, 'synset': 'map.n.01', 'synonyms': ['map'], 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'c', 'id': 682, 'synset': 'marker.n.03', 'synonyms': ['marker'], 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'id': 683, 'synset': 'martini.n.01', 'synonyms': ['martini'], 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'id': 684, 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'id': 685, 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'id': 686, 'synset': 'masher.n.02', 'synonyms': ['masher'], 'def': 'a kitchen utensil used for mashing (e.g. potatoes)', 'name': 'masher'}, {'frequency': 'f', 'id': 687, 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'id': 688, 'synset': 'mast.n.01', 'synonyms': ['mast'], 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'id': 689, 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'id': 690, 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'id': 691, 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'id': 692, 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'id': 693, 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'id': 694, 'synset': 'meatball.n.01', 'synonyms': ['meatball'], 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'id': 695, 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'r', 'id': 696, 'synset': 'melon.n.01', 'synonyms': ['melon'], 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'id': 697, 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'id': 698, 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'id': 699, 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'def': 'kitchen appliance that cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'id': 700, 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'c', 'id': 701, 'synset': 'milk.n.01', 'synonyms': ['milk'], 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'f', 'id': 702, 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'id': 703, 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'id': 704, 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'id': 705, 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'def': 'glove that encases the thumb separately and the other four fingers together', 'name': 'mitten'}, {'frequency': 'c', 'id': 706, 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'def': 'a kitchen utensil that is used for mixing foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'id': 707, 'synset': 'money.n.03', 'synonyms': ['money'], 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'id': 708, 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'id': 709, 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'id': 710, 'synset': 'motor.n.01', 'synonyms': ['motor'], 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'id': 711, 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'id': 712, 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'r', 'id': 713, 'synset': 'motorboat.n.01', 'synonyms': ['motorboat', 'powerboat'], 'def': 'a boat propelled by an internal-combustion engine', 'name': 'motorboat'}, {'frequency': 'f', 'id': 714, 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'id': 715, 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'r', 'id': 716, 'synset': 'mouse.n.01', 'synonyms': ['mouse_(animal_rodent)'], 'def': 'a small rodent with pointed snouts and small ears on elongated bodies with slender usually hairless tails', 'name': 'mouse_(animal_rodent)'}, {'frequency': 'f', 'id': 717, 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'def': 'a computer input device that controls an on-screen pointer', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'id': 718, 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 'mousepad'}, {'frequency': 'c', 'id': 719, 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'id': 720, 'synset': 'mug.n.04', 'synonyms': ['mug'], 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'id': 721, 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'id': 722, 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'r', 'id': 723, 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'id': 724, 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'r', 'id': 725, 'synset': 'nameplate.n.01', 'synonyms': ['nameplate'], 'def': 'a plate bearing a name', 'name': 'nameplate'}, {'frequency': 'f', 'id': 726, 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 'name': 'napkin'}, {'frequency': 'r', 'id': 727, 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'id': 728, 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'id': 729, 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'r', 'id': 730, 'synset': 'needle.n.03', 'synonyms': ['needle'], 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'id': 731, 'synset': 'nest.n.01', 'synonyms': ['nest'], 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'r', 'id': 732, 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'id': 733, 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'id': 734, 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'r', 'id': 735, 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'id': 736, 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 'id': 737, 'synset': 'notepad.n.01', 'synonyms': ['notepad'], 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'c', 'id': 738, 'synset': 'nut.n.03', 'synonyms': ['nut'], 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'id': 739, 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'c', 'id': 740, 'synset': 'oar.n.01', 'synonyms': ['oar'], 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'id': 741, 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'id': 742, 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'id': 743, 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'id': 744, 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'def': 'oil from olives', 'name': 'olive_oil'}, {'frequency': 'r', 'id': 745, 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'def': 'beaten eggs cooked until just set; may be folded around e.g. ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'id': 746, 'synset': 'onion.n.01', 'synonyms': ['onion'], 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'id': 747, 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'id': 748, 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'r', 'id': 749, 'synset': 'oregano.n.01', 'synonyms': ['oregano', 'marjoram'], 'def': 'aromatic Eurasian perennial herb used in cooking and baking', 'name': 'oregano'}, {'frequency': 'c', 'id': 750, 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'c', 'id': 751, 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'def': 'thick cushion used as a seat', 'name': 'ottoman'}, {'frequency': 'c', 'id': 752, 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'id': 753, 'synset': 'owl.n.01', 'synonyms': ['owl'], 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'id': 754, 'synset': 'packet.n.03', 'synonyms': ['packet'], 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'id': 755, 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'id': 756, 'synset': 'pad.n.04', 'synonyms': ['pad'], 'def': 'a flat mass of soft material used for protection, stuffing, or comfort', 'name': 'pad'}, {'frequency': 'c', 'id': 757, 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'def': 'a short light oar used without an oarlock to propel a canoe or small boat', 'name': 'paddle'}, {'frequency': 'c', 'id': 758, 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'r', 'id': 759, 'synset': 'paintbox.n.01', 'synonyms': ['paintbox'], 'def': "a box containing a collection of cubes or tubes of artists' paint", 'name': 'paintbox'}, {'frequency': 'c', 'id': 760, 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'id': 761, 'synset': 'painting.n.01', 'synonyms': ['painting'], 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'c', 'id': 762, 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'id': 763, 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'id': 764, 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'def': 'cooking utensil consisting of a wide metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'id': 765, 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'id': 766, 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'id': 767, 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'id': 768, 'synset': 'papaya.n.02', 'synonyms': ['papaya'], 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'r', 'id': 769, 'synset': 'paper_clip.n.01', 'synonyms': ['paperclip'], 'def': 'a wire or plastic clip for holding sheets of paper together', 'name': 'paperclip'}, {'frequency': 'f', 'id': 770, 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'id': 771, 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'id': 772, 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'id': 773, 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'id': 774, 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'r', 'id': 775, 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'id': 776, 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'r', 'id': 777, 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'r', 'id': 778, 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 'f', 'id': 779, 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'id': 780, 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'id': 781, 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'id': 782, 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'r', 'id': 783, 'synset': 'passport.n.02', 'synonyms': ['passport'], 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home country', 'name': 'passport'}, {'frequency': 'f', 'id': 784, 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'id': 785, 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'id': 786, 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'id': 787, 'synset': 'peach.n.03', 'synonyms': ['peach'], 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'id': 788, 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'c', 'id': 789, 'synset': 'pear.n.01', 'synonyms': ['pear'], 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'r', 'id': 790, 'synset': 'peeler.n.03', 'synonyms': ['peeler_(tool_for_fruit_and_vegetables)'], 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'id': 791, 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'id': 792, 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'id': 793, 'synset': 'pen.n.01', 'synonyms': ['pen'], 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'c', 'id': 794, 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'id': 795, 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'id': 796, 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'id': 797, 'synset': 'pendulum.n.01', 'synonyms': ['pendulum'], 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'id': 798, 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'id': 799, 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'id': 800, 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, {'frequency': 'c', 'id': 801, 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'id': 802, 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'id': 803, 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'def': 'a toiletry that emits and diffuses a fragrant odor', 'name': 'perfume'}, {'frequency': 'r', 'id': 804, 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'id': 805, 'synset': 'person.n.01', 'synonyms': ['baby', 'child', 'boy', 'girl', 'man', 'woman', 'person', 'human'], 'def': 'a human being', 'name': 'baby'}, {'frequency': 'r', 'id': 806, 'synset': 'pet.n.01', 'synonyms': ['pet'], 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'r', 'id': 807, 'synset': 'petfood.n.01', 'synonyms': ['petfood', 'pet-food'], 'def': 'food prepared for animal pets', 'name': 'petfood'}, {'frequency': 'r', 'id': 808, 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'id': 809, 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'id': 810, 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'c', 'id': 811, 'synset': 'piano.n.01', 'synonyms': ['piano'], 'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'id': 812, 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'id': 813, 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'id': 814, 'synset': 'pie.n.01', 'synonyms': ['pie'], 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'id': 815, 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'id': 816, 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'def': "a child's coin bank (often shaped like a pig)", 'name': 'piggy_bank'}, {'frequency': 'f', 'id': 817, 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'id': 818, 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'id': 819, 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'id': 820, 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'id': 821, 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'id': 822, 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'def': 'a toy consisting of vanes of colored paper or plastic that is pinned to a stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'id': 823, 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'id': 824, 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'id': 825, 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'r', 'id': 826, 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'id': 827, 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'id': 828, 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'id': 829, 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'id': 830, 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'id': 831, 'synset': 'plate.n.04', 'synonyms': ['plate'], 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'id': 832, 'synset': 'platter.n.01', 'synonyms': ['platter'], 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'id': 833, 'synset': 'playing_card.n.01', 'synonyms': ['playing_card'], 'def': 'one of a pack of cards that are used to play card games', 'name': 'playing_card'}, {'frequency': 'r', 'id': 834, 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'id': 835, 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'id': 836, 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'id': 837, 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'id': 838, 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'id': 839, 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'id': 840, 'synset': 'pole.n.01', 'synonyms': ['pole', 'post'], 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'r', 'id': 841, 'synset': 'police_van.n.01', 'synonyms': ['police_van', 'police_wagon', 'paddy_wagon', 'patrol_wagon'], 'def': 'van used by police to transport prisoners', 'name': 'police_van'}, {'frequency': 'f', 'id': 842, 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'id': 843, 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'id': 844, 'synset': 'pony.n.05', 'synonyms': ['pony'], 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'id': 845, 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'id': 846, 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'r', 'id': 847, 'synset': 'portrait.n.02', 'synonyms': ['portrait', 'portrayal'], 'def': 'any likeness of a person, in any medium', 'name': 'portrait'}, {'frequency': 'c', 'id': 848, 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'id': 849, 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'id': 850, 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'def': 'a sign posted in a public place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'id': 851, 'synset': 'pot.n.01', 'synonyms': ['pot'], 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'id': 852, 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'id': 853, 'synset': 'potato.n.01', 'synonyms': ['potato'], 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'id': 854, 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'id': 855, 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'id': 856, 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'r', 'id': 857, 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'id': 858, 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'f', 'id': 859, 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'id': 860, 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'def': 'a weapon that is forcibly thrown or projected at a targets', 'name': 'projectile_(weapon)'}, {'frequency': 'c', 'id': 861, 'synset': 'projector.n.02', 'synonyms': ['projector'], 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'id': 862, 'synset': 'propeller.n.01', 'synonyms': ['propeller', 'propellor'], 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'id': 863, 'synset': 'prune.n.01', 'synonyms': ['prune'], 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'id': 864, 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'id': 865, 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'id': 866, 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'id': 867, 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'id': 868, 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'id': 869, 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'def': 'a tool for making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'id': 870, 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'r', 'id': 871, 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'id': 872, 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'id': 873, 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'def': 'a tart filled with rich unsweetened custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'id': 874, 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'id': 875, 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'id': 876, 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'id': 877, 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'id': 878, 'synset': 'radar.n.01', 'synonyms': ['radar'], 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'c', 'id': 879, 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, {'frequency': 'c', 'id': 880, 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'id': 881, 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'id': 882, 'synset': 'raft.n.01', 'synonyms': ['raft'], 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'id': 883, 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'def': 'a cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'id': 884, 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'id': 885, 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'id': 886, 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'id': 887, 'synset': 'rat.n.01', 'synonyms': ['rat'], 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'id': 888, 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'def': 'a blade that has very sharp edge', 'name': 'razorblade'}, {'frequency': 'c', 'id': 889, 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'id': 890, 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'def': 'car mirror that reflects the view out of the rear window', 'name': 'rearview_mirror'}, {'frequency': 'c', 'id': 891, 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'id': 892, 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'r', 'id': 893, 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified acoustically or electronically', 'name': 'record_player'}, {'frequency': 'r', 'id': 894, 'synset': 'red_cabbage.n.02', 'synonyms': ['red_cabbage'], 'def': 'compact head of purplish-red leaves', 'name': 'red_cabbage'}, {'frequency': 'f', 'id': 895, 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'id': 896, 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'id': 897, 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'id': 898, 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'r', 'id': 899, 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'id': 900, 'synset': 'ring.n.08', 'synonyms': ['ring'], 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'id': 901, 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'id': 902, 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'id': 903, 'synset': 'robe.n.01', 'synonyms': ['robe'], 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'id': 904, 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'id': 905, 'synset': 'roller_skate.n.01', 'synonyms': ['roller_skate'], 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'id': 906, 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'id': 907, 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'def': 'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'id': 908, 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'id': 909, 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'id': 910, 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'id': 911, 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'id': 912, 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'id': 913, 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'id': 914, 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'id': 915, 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'def': 'a large bag (or pair of bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'id': 916, 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'c', 'id': 917, 'synset': 'sail.n.01', 'synonyms': ['sail'], 'def': 'a large piece of fabric by means of which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'c', 'id': 918, 'synset': 'salad.n.01', 'synonyms': ['salad'], 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'id': 919, 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'r', 'id': 920, 'synset': 'salami.n.01', 'synonyms': ['salami'], 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'r', 'id': 921, 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'id': 922, 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'r', 'id': 923, 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'id': 924, 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'id': 925, 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, {'frequency': 'f', 'id': 926, 'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'id': 927, 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'id': 928, 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'id': 929, 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'id': 930, 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'id': 931, 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'id': 932, 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'id': 933, 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'id': 934, 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'id': 935, 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'id': 936, 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, {'frequency': 'f', 'id': 937, 'synset': 'scissors.n.01', 'synonyms': ['scissors'], 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'c', 'id': 938, 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'c', 'id': 939, 'synset': 'scrambled_eggs.n.01', 'synonyms': ['scrambled_eggs'], 'def': 'eggs beaten and cooked to a soft firm consistency while stirring', 'name': 'scrambled_eggs'}, {'frequency': 'r', 'id': 940, 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'r', 'id': 941, 'synset': 'scratcher.n.03', 'synonyms': ['scratcher'], 'def': 'a device used for scratching', 'name': 'scratcher'}, {'frequency': 'c', 'id': 942, 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'c', 'id': 943, 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'id': 944, 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'r', 'id': 945, 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'r', 'id': 946, 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'id': 947, 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'def': 'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'id': 948, 'synset': 'seashell.n.01', 'synonyms': ['seashell'], 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'r', 'id': 949, 'synset': 'seedling.n.01', 'synonyms': ['seedling'], 'def': 'young plant or tree grown from a seed', 'name': 'seedling'}, {'frequency': 'c', 'id': 950, 'synset': 'serving_dish.n.01', 'synonyms': ['serving_dish'], 'def': 'a dish used for serving food', 'name': 'serving_dish'}, {'frequency': 'r', 'id': 951, 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'r', 'id': 952, 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'id': 953, 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'r', 'id': 954, 'synset': 'shark.n.01', 'synonyms': ['shark'], 'def': 'typically large carnivorous fishes with sharpe teeth', 'name': 'shark'}, {'frequency': 'r', 'id': 955, 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'id': 956, 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'id': 957, 'synset': 'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'id': 958, 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'def': 'toiletry consisting that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'id': 959, 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'def': 'cloak consisting of an oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'id': 960, 'synset': 'shears.n.01', 'synonyms': ['shears'], 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'id': 961, 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'id': 962, 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'id': 963, 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'r', 'id': 964, 'synset': 'shield.n.02', 'synonyms': ['shield'], 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'id': 965, 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'id': 966, 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'c', 'id': 967, 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'id': 968, 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'id': 969, 'synset': 'short_pants.n.01', 'synonyms': ['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'id': 970, 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'def': 'a small glass adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'c', 'id': 971, 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'id': 972, 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'id': 973, 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'f', 'id': 974, 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'id': 975, 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'r', 'id': 976, 'synset': 'sieve.n.01', 'synonyms': ['sieve', 'screen_(sieve)'], 'def': 'a strainer for separating lumps from powdered material or grading particles', 'name': 'sieve'}, {'frequency': 'f', 'id': 977, 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'id': 978, 'synset': 'silo.n.01', 'synonyms': ['silo'], 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'id': 979, 'synset': 'sink.n.01', 'synonyms': ['sink'], 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'id': 980, 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'id': 981, 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'def': 'a long pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'id': 982, 'synset': 'ski.n.01', 'synonyms': ['ski'], 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'id': 983, 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'id': 984, 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'id': 985, 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'id': 986, 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'c', 'id': 987, 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'id': 988, 'synset': 'sleeping_bag.n.01', 'synonyms': ['sleeping_bag'], 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'id': 989, 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'id': 990, 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'id': 991, 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'id': 992, 'synset': 'snake.n.01', 'synonyms': ['snake', 'serpent'], 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'id': 993, 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'id': 994, 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'id': 995, 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'def': 'tracked vehicle for travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'id': 996, 'synset': 'soap.n.01', 'synonyms': ['soap'], 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'id': 997, 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'id': 998, 'synset': 'sock.n.01', 'synonyms': ['sock'], 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'r', 'id': 999, 'synset': 'soda_fountain.n.02', 'synonyms': ['soda_fountain'], 'def': 'an apparatus for dispensing soda water', 'name': 'soda_fountain'}, {'frequency': 'r', 'id': 1000, 'synset': 'soda_water.n.01', 'synonyms': ['carbonated_water', 'club_soda', 'seltzer', 'sparkling_water'], 'def': 'effervescent beverage artificially charged with carbon dioxide', 'name': 'carbonated_water'}, {'frequency': 'f', 'id': 1001, 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'id': 1002, 'synset': 'softball.n.01', 'synonyms': ['softball'], 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'id': 1003, 'synset': 'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'id': 1004, 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'c', 'id': 1005, 'synset': 'soup.n.01', 'synonyms': ['soup'], 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'id': 1006, 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'id': 1007, 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'id': 1008, 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'id': 1009, 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 'soybean_milk', 'soymilk'], 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'id': 1010, 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'id': 1011, 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'id': 1012, 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'id': 1013, 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'def': 'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'id': 1014, 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 'eyeglasses', 'glasses'], 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'id': 1015, 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'r', 'id': 1016, 'synset': 'spider.n.01', 'synonyms': ['spider'], 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'c', 'id': 1017, 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'id': 1018, 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'id': 1019, 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'id': 1020, 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'id': 1021, 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'c', 'id': 1022, 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'r', 'id': 1023, 'synset': 'starfish.n.01', 'synonyms': ['starfish', 'sea_star'], 'def': 'echinoderms characterized by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'id': 1024, 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'id': 1025, 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'id': 1026, 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'r', 'id': 1027, 'synset': 'steamer.n.02', 'synonyms': ['steamer_(kitchen_appliance)'], 'def': 'a cooking utensil that can be used to cook food by steaming it', 'name': 'steamer_(kitchen_appliance)'}, {'frequency': 'f', 'id': 1028, 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'id': 1029, 'synset': 'stencil.n.01', 'synonyms': ['stencil'], 'def': 'a sheet of material (metal, plastic, etc.) that has been perforated with a pattern; ink or paint can pass through the perforations to create the printed pattern on the surface below', 'name': 'stencil'}, {'frequency': 'r', 'id': 1030, 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'id': 1031, 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'id': 1032, 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'id': 1033, 'synset': 'stew.n.02', 'synonyms': ['stew'], 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'id': 1034, 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'id': 1035, 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'c', 'id': 1036, 'synset': 'stocking.n.01', 'synonyms': ['stockings_(leg_wear)'], 'def': 'close-fitting hosiery to cover the foot and leg; come in matched pairs', 'name': 'stockings_(leg_wear)'}, {'frequency': 'f', 'id': 1037, 'synset': 'stool.n.01', 'synonyms': ['stool'], 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'id': 1038, 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'id': 1039, 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'id': 1040, 'synset': 'stove.n.01', 'synonyms': ['stove', 'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'id': 1041, 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'id': 1042, 'synset': 'strap.n.01', 'synonyms': ['strap'], 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'id': 1043, 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'id': 1044, 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'id': 1045, 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'id': 1046, 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'def': 'a lamp supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'id': 1047, 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'id': 1048, 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'def': 'a pointed tool for writing or drawing or engraving', 'name': 'stylus'}, {'frequency': 'r', 'id': 1049, 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'id': 1050, 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'id': 1051, 'synset': 'sugarcane.n.01', 'synonyms': ['sugarcane_(plant)'], 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'c', 'id': 1052, 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'id': 1053, 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'id': 1054, 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'id': 1055, 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'r', 'id': 1056, 'synset': 'sunscreen.n.01', 'synonyms': ['sunscreen', 'sunblock'], 'def': 'a cream spread on the skin; contains a chemical to filter out ultraviolet light and so protect from sunburn', 'name': 'sunscreen'}, {'frequency': 'f', 'id': 1057, 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'id': 1058, 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'id': 1059, 'synset': 'swab.n.02', 'synonyms': ['mop'], 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'id': 1060, 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'id': 1061, 'synset': 'sweatband.n.02', 'synonyms': ['sweatband'], 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'id': 1062, 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'id': 1063, 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'id': 1064, 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'id': 1065, 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 'c', 'id': 1066, 'synset': 'sword.n.01', 'synonyms': ['sword'], 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'id': 1067, 'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'id': 1068, 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'id': 1069, 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'id': 1070, 'synset': 'table.n.02', 'synonyms': ['table'], 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'id': 1071, 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'def': 'a lamp that sits on a table', 'name': 'table_lamp'}, {'frequency': 'f', 'id': 1072, 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'id': 1073, 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'id': 1074, 'synset': 'taco.n.02', 'synonyms': ['taco'], 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'id': 1075, 'synset': 'tag.n.02', 'synonyms': ['tag'], 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'id': 1076, 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'id': 1077, 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'id': 1078, 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'c', 'id': 1079, 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'id': 1080, 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'c', 'id': 1081, 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 'c', 'id': 1082, 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'id': 1083, 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'id': 1084, 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'id': 1085, 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'def': 'a cloth having a crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'id': 1086, 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'r', 'id': 1087, 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'id': 1088, 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'id': 1089, 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'c', 'id': 1090, 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'id': 1091, 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'id': 1092, 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'def': 'electronic device for communicating by voice over long distances', 'name': 'telephone'}, {'frequency': 'c', 'id': 1093, 'synset': 'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'id': 1094, 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'id': 1095, 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'id': 1096, 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'id': 1097, 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'id': 1098, 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'id': 1099, 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'id': 1100, 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'id': 1101, 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'id': 1102, 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'c', 'id': 1103, 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'def': 'a regulator for automatically regulating temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'id': 1104, 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'def': 'a small metal cap to protect the finger while sewing; can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'id': 1105, 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'id': 1106, 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'id': 1107, 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'id': 1108, 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'id': 1109, 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'id': 1110, 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'id': 1111, 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'r', 'id': 1112, 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'id': 1113, 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'def': 'a soft thin (usually translucent) paper', 'name': 'tissue_paper'}, {'frequency': 'c', 'id': 1114, 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'id': 1115, 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'c', 'id': 1116, 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'id': 1117, 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'id': 1118, 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'id': 1119, 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'c', 'id': 1120, 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'id': 1121, 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'id': 1122, 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'def': 'small brush; has long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'id': 1123, 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, {'frequency': 'c', 'id': 1124, 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 'name': 'toothpick'}, {'frequency': 'c', 'id': 1125, 'synset': 'top.n.09', 'synonyms': ['cover'], 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'id': 1126, 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'id': 1127, 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'id': 1128, 'synset': 'towel.n.01', 'synonyms': ['towel'], 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'id': 1129, 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'id': 1130, 'synset': 'toy.n.03', 'synonyms': ['toy'], 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'id': 1131, 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'id': 1132, 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'r', 'id': 1133, 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'c', 'id': 1134, 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, {'frequency': 'f', 'id': 1135, 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'id': 1136, 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'id': 1137, 'synset': 'tray.n.01', 'synonyms': ['tray'], 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'id': 1138, 'synset': 'tree_house.n.01', 'synonyms': ['tree_house'], 'def': '(NOT A TREE) a PLAYHOUSE built in the branches of a tree', 'name': 'tree_house'}, {'frequency': 'r', 'id': 1139, 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'id': 1140, 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'r', 'id': 1141, 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'def': 'a vehicle with three wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'c', 'id': 1142, 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'id': 1143, 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'id': 1144, 'synset': 'truck.n.01', 'synonyms': ['truck'], 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'id': 1145, 'synset': 'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'id': 1146, 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'id': 1147, 'synset': 'tub.n.02', 'synonyms': ['vat'], 'def': 'a large open vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'id': 1148, 'synset': 'turban.n.01', 'synonyms': ['turban'], 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'r', 'id': 1149, 'synset': 'turkey.n.01', 'synonyms': ['turkey_(bird)'], 'def': 'large gallinaceous bird with fan-shaped tail; widely domesticated for food', 'name': 'turkey_(bird)'}, {'frequency': 'c', 'id': 1150, 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'id': 1151, 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'id': 1152, 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'r', 'id': 1153, 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'r', 'id': 1154, 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'id': 1155, 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, {'frequency': 'c', 'id': 1156, 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'id': 1157, 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'c', 'id': 1158, 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'r', 'id': 1159, 'synset': 'urn.n.01', 'synonyms': ['urn'], 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'id': 1160, 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'c', 'id': 1161, 'synset': 'valve.n.03', 'synonyms': ['valve'], 'def': 'control consisting of a mechanical device for controlling the flow of a fluid', 'name': 'valve'}, {'frequency': 'f', 'id': 1162, 'synset': 'vase.n.01', 'synonyms': ['vase'], 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'id': 1163, 'synset': 'vending_machine.n.01', 'synonyms': ['vending_machine'], 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'id': 1164, 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'c', 'id': 1165, 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'id': 1166, 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, {'frequency': 'r', 'id': 1167, 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'id': 1168, 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'r', 'id': 1169, 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'id': 1170, 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'id': 1171, 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'id': 1172, 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'id': 1173, 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'id': 1174, 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'id': 1175, 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'id': 1176, 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'id': 1177, 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, {'frequency': 'c', 'id': 1178, 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'id': 1179, 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'id': 1180, 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'def': 'a tall piece of furniture that provides storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'id': 1181, 'synset': 'wasabi.n.02', 'synonyms': ['wasabi'], 'def': 'the thick green root of the wasabi plant that the Japanese use in cooking and that tastes like strong horseradish', 'name': 'wasabi'}, {'frequency': 'c', 'id': 1182, 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'id': 1183, 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'id': 1184, 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'id': 1185, 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'id': 1186, 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'id': 1187, 'synset': 'water_filter.n.01', 'synonyms': ['water_filter'], 'def': 'a filter to remove impurities from the water supply', 'name': 'water_filter'}, {'frequency': 'r', 'id': 1188, 'synset': 'water_heater.n.01', 'synonyms': ['water_heater', 'hot-water_heater'], 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'r', 'id': 1189, 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'id': 1190, 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'id': 1191, 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'id': 1192, 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'id': 1193, 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'id': 1194, 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'c', 'id': 1195, 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'id': 1196, 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'id': 1197, 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'id': 1198, 'synset': 'wedding_cake.n.01', 'synonyms': ['wedding_cake', 'bridecake'], 'def': 'a rich cake with two or more tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'id': 1199, 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'def': 'a ring given to the bride and/or groom at the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'id': 1200, 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'id': 1201, 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'id': 1202, 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'id': 1203, 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'r', 'id': 1204, 'synset': 'whiskey.n.01', 'synonyms': ['whiskey'], 'def': 'a liquor made from fermented mash of grain', 'name': 'whiskey'}, {'frequency': 'r', 'id': 1205, 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'r', 'id': 1206, 'synset': 'wick.n.02', 'synonyms': ['wick'], 'def': 'a loosely woven cord in a candle or oil lamp that is lit on fire', 'name': 'wick'}, {'frequency': 'c', 'id': 1207, 'synset': 'wig.n.01', 'synonyms': ['wig'], 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'id': 1208, 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'id': 1209, 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'def': 'a mill that is powered by the wind', 'name': 'windmill'}, {'frequency': 'c', 'id': 1210, 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'id': 1211, 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'id': 1212, 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'id': 1213, 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'r', 'id': 1214, 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'id': 1215, 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'r', 'id': 1216, 'synset': 'wing_chair.n.01', 'synonyms': ['wing_chair'], 'def': 'easy chair having wings on each side of a high back', 'name': 'wing_chair'}, {'frequency': 'c', 'id': 1217, 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'id': 1218, 'synset': 'wok.n.01', 'synonyms': ['wok'], 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'id': 1219, 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'def': 'a wild carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'id': 1220, 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'id': 1221, 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 'name': 'wreath'}, {'frequency': 'c', 'id': 1222, 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'c', 'id': 1223, 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'id': 1224, 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'r', 'id': 1225, 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'r', 'id': 1226, 'synset': 'yak.n.02', 'synonyms': ['yak'], 'def': 'large long-haired wild ox of Tibet often domesticated', 'name': 'yak'}, {'frequency': 'c', 'id': 1227, 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'r', 'id': 1228, 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'id': 1229, 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'id': 1230, 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}] # noqa # fmt: on ================================================ FILE: annotator/oneformer/detectron2/data/datasets/lvis_v1_categories.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Autogen with # with open("lvis_v1_val.json", "r") as f: # a = json.load(f) # c = a["categories"] # for x in c: # del x["image_count"] # del x["instance_count"] # LVIS_CATEGORIES = repr(c) + " # noqa" # with open("/tmp/lvis_categories.py", "wt") as f: # f.write(f"LVIS_CATEGORIES = {LVIS_CATEGORIES}") # Then paste the contents of that file below # fmt: off LVIS_CATEGORIES = [{'frequency': 'c', 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'id': 1, 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'id': 2, 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'id': 3, 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'f', 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'id': 4, 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'id': 5, 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'c', 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'id': 6, 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'synset': 'almond.n.02', 'synonyms': ['almond'], 'id': 7, 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'id': 8, 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'c', 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'id': 9, 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'id': 10, 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 'transmitting_aerial'], 'id': 11, 'def': 'an electrical device that sends or receives radio or television signals', 'name': 'antenna'}, {'frequency': 'f', 'synset': 'apple.n.01', 'synonyms': ['apple'], 'id': 12, 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'id': 13, 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'id': 14, 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'synset': 'apron.n.01', 'synonyms': ['apron'], 'id': 15, 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'id': 16, 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'r', 'synset': 'arctic.n.02', 'synonyms': ['arctic_(type_of_shoe)', 'galosh', 'golosh', 'rubber_(type_of_shoe)', 'gumshoe'], 'id': 17, 'def': 'a waterproof overshoe that protects shoes from water or snow', 'name': 'arctic_(type_of_shoe)'}, {'frequency': 'c', 'synset': 'armband.n.02', 'synonyms': ['armband'], 'id': 18, 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'id': 19, 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'id': 20, 'def': 'a large wardrobe or cabinet', 'name': 'armoire'}, {'frequency': 'r', 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'id': 21, 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'id': 22, 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'synset': 'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'id': 23, 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'id': 24, 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'id': 25, 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'id': 26, 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'f', 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'id': 27, 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'id': 28, 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'synset': 'awning.n.01', 'synonyms': ['awning'], 'id': 29, 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'id': 30, 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'r', 'synset': 'baboon.n.01', 'synonyms': ['baboon'], 'id': 31, 'def': 'large terrestrial monkeys having doglike muzzles', 'name': 'baboon'}, {'frequency': 'f', 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'id': 32, 'def': 'a small vehicle with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'id': 33, 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'synset': 'backpack.n.01', 'synonyms': ['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'id': 34, 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'id': 35, 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'id': 36, 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'id': 37, 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'id': 38, 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'id': 39, 'def': 'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'id': 40, 'def': 'something used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'synset': 'ball.n.06', 'synonyms': ['ball'], 'id': 41, 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'id': 42, 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'id': 43, 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'id': 44, 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'synset': 'banana.n.02', 'synonyms': ['banana'], 'id': 45, 'def': 'elongated crescent-shaped yellow fruit with soft sweet flesh', 'name': 'banana'}, {'frequency': 'c', 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'id': 46, 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'id': 47, 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'f', 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'id': 48, 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'id': 49, 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'id': 50, 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'id': 51, 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'synset': 'barge.n.01', 'synonyms': ['barge'], 'id': 52, 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'id': 53, 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'id': 54, 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'id': 55, 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'id': 56, 'def': 'a place that the runner must touch before scoring', 'name': 'baseball_base'}, {'frequency': 'f', 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'id': 57, 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'id': 58, 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'id': 59, 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'synset': 'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'id': 60, 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, {'frequency': 'f', 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'id': 61, 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'id': 62, 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'id': 63, 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'c', 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'id': 64, 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'id': 65, 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'id': 66, 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'synset': 'bathrobe.n.01', 'synonyms': ['bathrobe'], 'id': 67, 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'id': 68, 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'id': 69, 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'synset': 'battery.n.02', 'synonyms': ['battery'], 'id': 70, 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 'id': 71, 'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'synset': 'bead.n.01', 'synonyms': ['bead'], 'id': 72, 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'c', 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'id': 73, 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'id': 74, 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'id': 75, 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'synset': 'bear.n.01', 'synonyms': ['bear'], 'id': 76, 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'synset': 'bed.n.01', 'synonyms': ['bed'], 'id': 77, 'def': 'a piece of furniture that provides a place to sleep', 'name': 'bed'}, {'frequency': 'r', 'synset': 'bedpan.n.01', 'synonyms': ['bedpan'], 'id': 78, 'def': 'a shallow vessel used by a bedridden patient for defecation and urination', 'name': 'bedpan'}, {'frequency': 'f', 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'id': 79, 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 'f', 'synset': 'beef.n.01', 'synonyms': ['cow'], 'id': 80, 'def': 'cattle/cow', 'name': 'cow'}, {'frequency': 'f', 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'id': 81, 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'id': 82, 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'id': 83, 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'id': 84, 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'id': 85, 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'synset': 'bell.n.01', 'synonyms': ['bell'], 'id': 86, 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'id': 87, 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'synset': 'belt.n.02', 'synonyms': ['belt'], 'id': 88, 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'id': 89, 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'synset': 'bench.n.01', 'synonyms': ['bench'], 'id': 90, 'def': 'a long seat for more than one person', 'name': 'bench'}, {'frequency': 'c', 'synset': 'beret.n.01', 'synonyms': ['beret'], 'id': 91, 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'synset': 'bib.n.02', 'synonyms': ['bib'], 'id': 92, 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 'r', 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'id': 93, 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'id': 94, 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'id': 95, 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'f', 'synset': 'billboard.n.01', 'synonyms': ['billboard'], 'id': 96, 'def': 'large outdoor signboard', 'name': 'billboard'}, {'frequency': 'c', 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'id': 97, 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'id': 98, 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'synset': 'bird.n.01', 'synonyms': ['bird'], 'id': 99, 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'c', 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'id': 100, 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'c', 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'id': 101, 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'synset': 'birdcage.n.01', 'synonyms': ['birdcage'], 'id': 102, 'def': 'a cage in which a bird can be kept', 'name': 'birdcage'}, {'frequency': 'c', 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'id': 103, 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'id': 104, 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'id': 105, 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'id': 106, 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'id': 107, 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'synset': 'blackberry.n.01', 'synonyms': ['blackberry'], 'id': 108, 'def': 'large sweet black or very dark purple edible aggregate fruit', 'name': 'blackberry'}, {'frequency': 'f', 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'id': 109, 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'id': 110, 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'id': 111, 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'id': 112, 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'id': 113, 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'f', 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'id': 114, 'def': 'a light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'f', 'synset': 'blouse.n.01', 'synonyms': ['blouse'], 'id': 115, 'def': 'a top worn by women', 'name': 'blouse'}, {'frequency': 'f', 'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'id': 116, 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'id': 117, 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'id': 118, 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'r', 'synset': 'bob.n.05', 'synonyms': ['bob', 'bobber', 'bobfloat'], 'id': 119, 'def': 'a small float usually made of cork; attached to a fishing line', 'name': 'bob'}, {'frequency': 'c', 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'id': 120, 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'c', 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'id': 121, 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'id': 122, 'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'id': 123, 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 'bolo_tie'}, {'frequency': 'c', 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'id': 124, 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'id': 125, 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'id': 126, 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'synset': 'book.n.01', 'synonyms': ['book'], 'id': 127, 'def': 'a written work or composition that has been published', 'name': 'book'}, {'frequency': 'c', 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'id': 128, 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'id': 129, 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'id': 130, 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'id': 131, 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'synset': 'boot.n.01', 'synonyms': ['boot'], 'id': 132, 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'id': 133, 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'id': 134, 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, {'frequency': 'c', 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'id': 135, 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'id': 136, 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'id': 137, 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'synset': 'bow_tie.n.01', 'synonyms': ['bow-tie', 'bowtie'], 'id': 138, 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'id': 139, 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'id': 140, 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'id': 141, 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'id': 142, 'def': 'a large ball with finger holes used in the sport of bowling', 'name': 'bowling_ball'}, {'frequency': 'f', 'synset': 'box.n.01', 'synonyms': ['box'], 'id': 143, 'def': 'a (usually rectangular) container; may have a lid', 'name': 'box'}, {'frequency': 'r', 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'id': 144, 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'id': 145, 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'id': 146, 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'id': 147, 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'id': 148, 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'id': 149, 'def': 'a container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'f', 'synset': 'bread.n.01', 'synonyms': ['bread'], 'id': 150, 'def': 'food made from dough of flour or meal and usually raised with yeast or baking powder and then baked', 'name': 'bread'}, {'frequency': 'r', 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'id': 151, 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'f', 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'id': 152, 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'id': 153, 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'f', 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'id': 154, 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, {'frequency': 'r', 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'id': 155, 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'synset': 'broom.n.01', 'synonyms': ['broom'], 'id': 156, 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'id': 157, 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'id': 158, 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'id': 159, 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'id': 160, 'def': 'a roughly cylindrical vessel that is open at the top', 'name': 'bucket'}, {'frequency': 'r', 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'id': 161, 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'synset': 'bull.n.11', 'synonyms': ['horned_cow'], 'id': 162, 'def': 'a cow with horns', 'name': 'bull'}, {'frequency': 'c', 'synset': 'bulldog.n.01', 'synonyms': ['bulldog'], 'id': 163, 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 'bulldog'}, {'frequency': 'r', 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'id': 164, 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'id': 165, 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'id': 166, 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'id': 167, 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'id': 168, 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'f', 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'id': 169, 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'id': 170, 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'id': 171, 'def': 'a float attached by rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'id': 172, 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'id': 173, 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'synset': 'business_card.n.01', 'synonyms': ['business_card'], 'id': 174, 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'f', 'synset': 'butter.n.01', 'synonyms': ['butter'], 'id': 175, 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'id': 176, 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'synset': 'button.n.01', 'synonyms': ['button'], 'id': 177, 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'id': 178, 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'id': 179, 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'c', 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'id': 180, 'def': 'a car on a freight train for use of the train crew; usually the last car on the train', 'name': 'cabin_car'}, {'frequency': 'f', 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'id': 181, 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'id': 182, 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'synset': 'cake.n.03', 'synonyms': ['cake'], 'id': 183, 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'id': 184, 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'id': 185, 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'synset': 'calf.n.01', 'synonyms': ['calf'], 'id': 186, 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'id': 187, 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'synset': 'camel.n.01', 'synonyms': ['camel'], 'id': 188, 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'synset': 'camera.n.01', 'synonyms': ['camera'], 'id': 189, 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'id': 190, 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'id': 191, 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 'f', 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'id': 192, 'def': 'airtight sealed metal container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'id': 193, 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'f', 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'id': 194, 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'id': 195, 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'id': 196, 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'id': 197, 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'id': 198, 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'id': 199, 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'c', 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'id': 200, 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'c', 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'id': 201, 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'id': 202, 'def': 'a flask for carrying water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'f', 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'id': 203, 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'id': 204, 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'c', 'synset': 'cape.n.02', 'synonyms': ['cape'], 'id': 205, 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'id': 206, 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'id': 207, 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'id': 208, 'def': 'a wheeled vehicle adapted to the rails of railroad (mark each individual railcar separately)', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'id': 209, 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'id': 210, 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'synset': 'card.n.02', 'synonyms': ['identity_card'], 'id': 211, 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'synset': 'card.n.03', 'synonyms': ['card'], 'id': 212, 'def': 'a rectangular piece of paper used to send messages (e.g. greetings or pictures)', 'name': 'card'}, {'frequency': 'c', 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'id': 213, 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'id': 214, 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'id': 215, 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'id': 216, 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'id': 217, 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'f', 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'id': 218, 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'synset': 'cart.n.01', 'synonyms': ['cart'], 'id': 219, 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'synset': 'carton.n.02', 'synonyms': ['carton'], 'id': 220, 'def': 'a container made of cardboard for holding food or drink', 'name': 'carton'}, {'frequency': 'c', 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'id': 221, 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'id': 222, 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'id': 223, 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'synset': 'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'id': 224, 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'synset': 'cat.n.01', 'synonyms': ['cat'], 'id': 225, 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'f', 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'id': 226, 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'c', 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'id': 227, 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'id': 228, 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'f', 'synset': 'celery.n.01', 'synonyms': ['celery'], 'id': 229, 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'id': 230, 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'synset': 'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'id': 231, 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 'chain_mail'}, {'frequency': 'f', 'synset': 'chair.n.01', 'synonyms': ['chair'], 'id': 232, 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'synset': 'chaise_longue.n.01', 'synonyms': ['chaise_longue', 'chaise', 'daybed'], 'id': 233, 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'synset': 'chalice.n.01', 'synonyms': ['chalice'], 'id': 234, 'def': 'a bowl-shaped drinking vessel; especially the Eucharistic cup', 'name': 'chalice'}, {'frequency': 'f', 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'id': 235, 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'synset': 'chap.n.04', 'synonyms': ['chap'], 'id': 236, 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'id': 237, 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'id': 238, 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'id': 239, 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'id': 240, 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'c', 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'id': 241, 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'synset': 'chickpea.n.01', 'synonyms': ['chickpea', 'garbanzo'], 'id': 242, 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'c', 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'id': 243, 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'id': 244, 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'id': 245, 'def': 'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'id': 246, 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, {'frequency': 'r', 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'id': 247, 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'id': 248, 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'id': 249, 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'id': 250, 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'id': 251, 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 'synset': 'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'id': 252, 'def': 'shirt collar, animal collar, or tight-fitting necklace', 'name': 'choker'}, {'frequency': 'f', 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'id': 253, 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'f', 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'id': 254, 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'id': 255, 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'synset': 'chute.n.02', 'synonyms': ['slide'], 'id': 256, 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'id': 257, 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'id': 258, 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'f', 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'id': 259, 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'id': 260, 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'id': 261, 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'id': 262, 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'c', 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'id': 263, 'def': 'a fastener (as a buckle or hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'id': 264, 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'synset': 'cleat.n.02', 'synonyms': ['cleat_(for_securing_rope)'], 'id': 265, 'def': 'a fastener (usually with two projecting horns) around which a rope can be secured', 'name': 'cleat_(for_securing_rope)'}, {'frequency': 'r', 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'id': 266, 'def': 'a variety of mandarin orange', 'name': 'clementine'}, {'frequency': 'c', 'synset': 'clip.n.03', 'synonyms': ['clip'], 'id': 267, 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'id': 268, 'def': 'a small writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'r', 'synset': 'clipper.n.03', 'synonyms': ['clippers_(for_plants)'], 'id': 269, 'def': 'shears for cutting grass or shrubbery (often used in the plural)', 'name': 'clippers_(for_plants)'}, {'frequency': 'r', 'synset': 'cloak.n.02', 'synonyms': ['cloak'], 'id': 270, 'def': 'a loose outer garment', 'name': 'cloak'}, {'frequency': 'f', 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'id': 271, 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'synset': 'clock_tower.n.01', 'synonyms': ['clock_tower'], 'id': 272, 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'id': 273, 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'id': 274, 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'id': 275, 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'id': 276, 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'synset': 'coat.n.01', 'synonyms': ['coat'], 'id': 277, 'def': 'an outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'id': 278, 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'c', 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'id': 279, 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'id': 280, 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'r', 'synset': 'cockroach.n.01', 'synonyms': ['cockroach'], 'id': 281, 'def': 'any of numerous chiefly nocturnal insects; some are domestic pests', 'name': 'cockroach'}, {'frequency': 'r', 'synset': 'cocoa.n.01', 'synonyms': ['cocoa_(beverage)', 'hot_chocolate_(beverage)', 'drinking_chocolate'], 'id': 282, 'def': 'a beverage made from cocoa powder and milk and sugar; usually drunk hot', 'name': 'cocoa_(beverage)'}, {'frequency': 'c', 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'id': 283, 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'f', 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'id': 284, 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'id': 285, 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'id': 286, 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'synset': 'coil.n.05', 'synonyms': ['coil'], 'id': 287, 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'synset': 'coin.n.01', 'synonyms': ['coin'], 'id': 288, 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'c', 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'id': 289, 'def': 'bowl-shaped strainer; used to wash or drain foods', 'name': 'colander'}, {'frequency': 'c', 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'id': 290, 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'synset': 'coloring_material.n.01', 'synonyms': ['coloring_material', 'colouring_material'], 'id': 291, 'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'id': 292, 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'id': 293, 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'id': 294, 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'r', 'synset': 'compass.n.01', 'synonyms': ['compass'], 'id': 295, 'def': 'navigational instrument for finding directions', 'name': 'compass'}, {'frequency': 'f', 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'id': 296, 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'f', 'synset': 'condiment.n.01', 'synonyms': ['condiment'], 'id': 297, 'def': 'a preparation (a sauce or relish or spice) to enhance flavor or enjoyment', 'name': 'condiment'}, {'frequency': 'f', 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'id': 298, 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'id': 299, 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'id': 300, 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'id': 301, 'def': 'a sofa that can be converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'r', 'synset': 'cooker.n.01', 'synonyms': ['cooker'], 'id': 302, 'def': 'a utensil for cooking', 'name': 'cooker'}, {'frequency': 'f', 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'id': 303, 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'id': 304, 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'id': 305, 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'f', 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'id': 306, 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'id': 307, 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'c', 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'id': 308, 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'f', 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'id': 309, 'def': 'ears or kernels of corn that can be prepared and served for human food (only mark individual ears or kernels)', 'name': 'edible_corn'}, {'frequency': 'r', 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'id': 310, 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 'synset': 'cornet.n.01', 'synonyms': ['cornet', 'horn', 'trumpet'], 'id': 311, 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'synset': 'cornice.n.01', 'synonyms': ['cornice', 'valance', 'valance_board', 'pelmet'], 'id': 312, 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'id': 313, 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'c', 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'id': 314, 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'c', 'synset': 'costume.n.04', 'synonyms': ['costume'], 'id': 315, 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'id': 316, 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'id': 317, 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'c', 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'id': 318, 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'id': 319, 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'c', 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'id': 320, 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'r', 'synset': 'crab.n.05', 'synonyms': ['crabmeat'], 'id': 321, 'def': 'the edible flesh of any of various crabs', 'name': 'crabmeat'}, {'frequency': 'c', 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'id': 322, 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'synset': 'crape.n.01', 'synonyms': ['crape', 'crepe', 'French_pancake'], 'id': 323, 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'synset': 'crate.n.01', 'synonyms': ['crate'], 'id': 324, 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'c', 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'id': 325, 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'id': 326, 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'c', 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'id': 327, 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'id': 328, 'def': 'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'id': 329, 'def': 'an earthen jar (made of baked clay) or a modern electric crockpot', 'name': 'crock_pot'}, {'frequency': 'f', 'synset': 'crossbar.n.01', 'synonyms': ['crossbar'], 'id': 330, 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'synset': 'crouton.n.01', 'synonyms': ['crouton'], 'id': 331, 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'c', 'synset': 'crow.n.01', 'synonyms': ['crow'], 'id': 332, 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'r', 'synset': 'crowbar.n.01', 'synonyms': ['crowbar', 'wrecking_bar', 'pry_bar'], 'id': 333, 'def': 'a heavy iron lever with one end forged into a wedge', 'name': 'crowbar'}, {'frequency': 'c', 'synset': 'crown.n.04', 'synonyms': ['crown'], 'id': 334, 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'synset': 'crucifix.n.01', 'synonyms': ['crucifix'], 'id': 335, 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'id': 336, 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'id': 337, 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'f', 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'id': 338, 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'c', 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'id': 339, 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'id': 340, 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'c', 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'id': 341, 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'id': 342, 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'id': 343, 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'synset': 'cup.n.01', 'synonyms': ['cup'], 'id': 344, 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'id': 345, 'def': 'a metal award or cup-shaped vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'f', 'synset': 'cupboard.n.01', 'synonyms': ['cupboard', 'closet'], 'id': 346, 'def': 'a small room (or recess) or cabinet used for storage space', 'name': 'cupboard'}, {'frequency': 'f', 'synset': 'cupcake.n.01', 'synonyms': ['cupcake'], 'id': 347, 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'id': 348, 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'id': 349, 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'id': 350, 'def': 'hanging cloth used as a blind (especially for a window)', 'name': 'curtain'}, {'frequency': 'f', 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'id': 351, 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'id': 352, 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'id': 353, 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'id': 354, 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'synset': 'dalmatian.n.02', 'synonyms': ['dalmatian'], 'id': 355, 'def': 'a large breed having a smooth white coat with black or brown spots', 'name': 'dalmatian'}, {'frequency': 'c', 'synset': 'dartboard.n.01', 'synonyms': ['dartboard'], 'id': 356, 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'id': 357, 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'id': 358, 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'id': 359, 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'id': 360, 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'synset': 'desk.n.01', 'synonyms': ['desk'], 'id': 361, 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'id': 362, 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'id': 363, 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'id': 364, 'def': 'yearly planner book', 'name': 'diary'}, {'frequency': 'r', 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'id': 365, 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'id': 366, 'def': 'a small boat of shallow draft with seats and oars with which it is propelled', 'name': 'dinghy'}, {'frequency': 'f', 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'id': 367, 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'id': 368, 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'f', 'synset': 'dish.n.01', 'synonyms': ['dish'], 'id': 369, 'def': 'a piece of dishware normally used as a container for holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'id': 370, 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, {'frequency': 'c', 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'id': 371, 'def': 'a cloth for washing dishes or cleaning in general', 'name': 'dishrag'}, {'frequency': 'f', 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'id': 372, 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'id': 373, 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid', 'dishsoap'], 'id': 374, 'def': 'dishsoap or dish detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'f', 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'id': 375, 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'r', 'synset': 'diving_board.n.01', 'synonyms': ['diving_board'], 'id': 376, 'def': 'a springboard from which swimmers can dive', 'name': 'diving_board'}, {'frequency': 'f', 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'id': 377, 'def': 'a disposable cup made of paper; for holding drinks', 'name': 'Dixie_cup'}, {'frequency': 'f', 'synset': 'dog.n.01', 'synonyms': ['dog'], 'id': 378, 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'id': 379, 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'f', 'synset': 'doll.n.01', 'synonyms': ['doll'], 'id': 380, 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'id': 381, 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'synset': 'dollhouse.n.01', 'synonyms': ['dollhouse', "doll's_house"], 'id': 382, 'def': "a house so small that it is likened to a child's plaything", 'name': 'dollhouse'}, {'frequency': 'c', 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'id': 383, 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'id': 384, 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'f', 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'id': 385, 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'id': 386, 'def': 'a mat placed outside an exterior door for wiping the shoes before entering', 'name': 'doormat'}, {'frequency': 'f', 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'id': 387, 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'synset': 'dove.n.01', 'synonyms': ['dove'], 'id': 388, 'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'id': 389, 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'id': 390, 'def': 'a boxlike container in a piece of furniture; made so as to slide in and out', 'name': 'drawer'}, {'frequency': 'c', 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'id': 391, 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'id': 392, 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'id': 393, 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'f', 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'id': 394, 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'f', 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'id': 395, 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'synset': 'drill.n.01', 'synonyms': ['drill'], 'id': 396, 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'synset': 'drone.n.04', 'synonyms': ['drone'], 'id': 397, 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 'id': 398, 'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'id': 399, 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'id': 400, 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'synset': 'duck.n.01', 'synonyms': ['duck'], 'id': 401, 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'c', 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'id': 402, 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'id': 403, 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'id': 404, 'def': 'a large cylindrical bag of heavy cloth (does not include suitcases)', 'name': 'duffel_bag'}, {'frequency': 'r', 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'id': 405, 'def': 'an exercising weight with two ball-like ends connected by a short handle', 'name': 'dumbbell'}, {'frequency': 'c', 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'id': 406, 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'synset': 'dustpan.n.02', 'synonyms': ['dustpan'], 'id': 407, 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'c', 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'id': 408, 'def': 'large birds of prey noted for their broad wings and strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'id': 409, 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'id': 410, 'def': 'a soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'synset': 'earring.n.01', 'synonyms': ['earring'], 'id': 411, 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'synset': 'easel.n.01', 'synonyms': ['easel'], 'id': 412, 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'id': 413, 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'synset': 'eel.n.01', 'synonyms': ['eel'], 'id': 414, 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'id': 415, 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'id': 416, 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'id': 417, 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'id': 418, 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'id': 419, 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'id': 420, 'def': 'a chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'id': 421, 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'id': 422, 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'c', 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'id': 423, 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'id': 424, 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'id': 425, 'def': 'an implement used to erase something', 'name': 'eraser'}, {'frequency': 'r', 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'id': 426, 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'id': 427, 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'id': 428, 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'synset': 'fan.n.01', 'synonyms': ['fan'], 'id': 429, 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'id': 430, 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'id': 431, 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'id': 432, 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'id': 433, 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'c', 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'id': 434, 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'id': 435, 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'id': 436, 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'id': 437, 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'id': 438, 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'id': 439, 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'id': 440, 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'f', 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'id': 441, 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, {'frequency': 'f', 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'id': 442, 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'id': 443, 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'synset': 'fireplace.n.01', 'synonyms': ['fireplace'], 'id': 444, 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 'hydrant'], 'id': 445, 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'r', 'synset': 'first-aid_kit.n.01', 'synonyms': ['first-aid_kit'], 'id': 446, 'def': 'kit consisting of a set of bandages and medicines for giving first aid', 'name': 'first-aid_kit'}, {'frequency': 'f', 'synset': 'fish.n.01', 'synonyms': ['fish'], 'id': 447, 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'c', 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'id': 448, 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'id': 449, 'def': 'a transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'c', 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'id': 450, 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'synset': 'flag.n.01', 'synonyms': ['flag'], 'id': 451, 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 'id': 452, 'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'id': 453, 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'id': 454, 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'c', 'synset': 'flap.n.01', 'synonyms': ['flap'], 'id': 455, 'def': 'any broad thin covering attached at one edge, such as a mud flap next to a wheel or a flap on an airplane wing', 'name': 'flap'}, {'frequency': 'r', 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'id': 456, 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'id': 457, 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'id': 458, 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'id': 459, 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'id': 460, 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'id': 461, 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'id': 462, 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'c', 'synset': 'foal.n.01', 'synonyms': ['foal'], 'id': 463, 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'id': 464, 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'synset': 'food_processor.n.01', 'synonyms': ['food_processor'], 'id': 465, 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 'id': 466, 'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'id': 467, 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'id': 468, 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'synset': 'fork.n.01', 'synonyms': ['fork'], 'id': 469, 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'c', 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'id': 470, 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'c', 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'id': 471, 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'c', 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'id': 472, 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'id': 473, 'def': 'anything that freshens air by removing or covering odor', 'name': 'freshener'}, {'frequency': 'f', 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'id': 474, 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'id': 475, 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, {'frequency': 'c', 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'id': 476, 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'f', 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'id': 477, 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'id': 478, 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'id': 479, 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'r', 'synset': 'futon.n.01', 'synonyms': ['futon'], 'id': 480, 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'id': 481, 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'id': 482, 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'id': 483, 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'synset': 'garden_hose.n.01', 'synonyms': ['garden_hose'], 'id': 484, 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'id': 485, 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 'gargle'}, {'frequency': 'r', 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'id': 486, 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 'gargoyle'}, {'frequency': 'c', 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'id': 487, 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'id': 488, 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'c', 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'id': 489, 'def': 'small swift graceful antelope of Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'id': 490, 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'id': 491, 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'r', 'synset': 'generator.n.02', 'synonyms': ['generator'], 'id': 492, 'def': 'engine that converts mechanical energy into electrical energy by electromagnetic induction', 'name': 'generator'}, {'frequency': 'c', 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'id': 493, 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'id': 494, 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'id': 495, 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'id': 496, 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'id': 497, 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 'drinking_glass'], 'id': 498, 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'synset': 'globe.n.03', 'synonyms': ['globe'], 'id': 499, 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'synset': 'glove.n.02', 'synonyms': ['glove'], 'id': 500, 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'synset': 'goat.n.01', 'synonyms': ['goat'], 'id': 501, 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'id': 502, 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'id': 503, 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'c', 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'id': 504, 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'synset': 'golfcart.n.01', 'synonyms': ['golfcart'], 'id': 505, 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'id': 506, 'def': 'long narrow flat-bottomed boat propelled by sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'synset': 'goose.n.01', 'synonyms': ['goose'], 'id': 507, 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'id': 508, 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 'id': 509, 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'f', 'synset': 'grape.n.01', 'synonyms': ['grape'], 'id': 510, 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'c', 'synset': 'grater.n.01', 'synonyms': ['grater'], 'id': 511, 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'id': 512, 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'id': 513, 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'f', 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'id': 514, 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'f', 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'id': 515, 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'id': 516, 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'f', 'synset': 'grill.n.02', 'synonyms': ['grill', 'grille', 'grillwork', 'radiator_grille'], 'id': 517, 'def': 'a framework of metal bars used as a partition or a grate', 'name': 'grill'}, {'frequency': 'r', 'synset': 'grits.n.01', 'synonyms': ['grits', 'hominy_grits'], 'id': 518, 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'id': 519, 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'id': 520, 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'f', 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'id': 521, 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 'id': 522, 'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'synset': 'gun.n.01', 'synonyms': ['gun'], 'id': 523, 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'f', 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'id': 524, 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'id': 525, 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'id': 526, 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'r', 'synset': 'halter.n.03', 'synonyms': ['halter_top'], 'id': 527, 'def': "a woman's top that fastens behind the back and neck leaving the back and arms uncovered", 'name': 'halter_top'}, {'frequency': 'f', 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'id': 528, 'def': 'meat cut from the thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'id': 529, 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'id': 530, 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'c', 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'id': 531, 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'id': 532, 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'c', 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'id': 533, 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'f', 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'id': 534, 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'id': 535, 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'id': 536, 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'id': 537, 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'id': 538, 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'id': 539, 'def': 'a square piece of cloth used for wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'id': 540, 'def': 'the appendage to an object that is designed to be held in order to use or move it', 'name': 'handle'}, {'frequency': 'r', 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'id': 541, 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'synset': 'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'id': 542, 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'id': 543, 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'synset': 'hat.n.01', 'synonyms': ['hat'], 'id': 544, 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'id': 545, 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'c', 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'id': 546, 'def': 'a garment that covers the head OR face', 'name': 'veil'}, {'frequency': 'f', 'synset': 'headband.n.01', 'synonyms': ['headband'], 'id': 547, 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'id': 548, 'def': 'a vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'id': 549, 'def': 'a powerful light with reflector; attached to the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'id': 550, 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'synset': 'headset.n.01', 'synonyms': ['headset'], 'id': 551, 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'id': 552, 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'c', 'synset': 'heart.n.02', 'synonyms': ['heart'], 'id': 553, 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'id': 554, 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'id': 555, 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'id': 556, 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'synset': 'heron.n.02', 'synonyms': ['heron'], 'id': 557, 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'synset': 'highchair.n.01', 'synonyms': ['highchair', 'feeding_chair'], 'id': 558, 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'id': 559, 'def': 'a joint that holds two parts together so that one can swing relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'id': 560, 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'id': 561, 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'id': 562, 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'id': 563, 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'synset': 'honey.n.01', 'synonyms': ['honey'], 'id': 564, 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'id': 565, 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'synset': 'hook.n.05', 'synonyms': ['hook'], 'id': 566, 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'r', 'synset': 'hookah.n.01', 'synonyms': ['hookah', 'narghile', 'nargileh', 'sheesha', 'shisha', 'water_pipe'], 'id': 567, 'def': 'a tobacco pipe with a long flexible tube connected to a container where the smoke is cooled by passing through water', 'name': 'hookah'}, {'frequency': 'r', 'synset': 'hornet.n.01', 'synonyms': ['hornet'], 'id': 568, 'def': 'large stinging wasp', 'name': 'hornet'}, {'frequency': 'f', 'synset': 'horse.n.01', 'synonyms': ['horse'], 'id': 569, 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'id': 570, 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'id': 571, 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'id': 572, 'def': 'a portable electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'id': 573, 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'id': 574, 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'id': 575, 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'c', 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'id': 576, 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'id': 577, 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'f', 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'id': 578, 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'id': 579, 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 'id': 580, 'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'id': 581, 'def': 'an appliance included in some electric refrigerators for making ice cubes', 'name': 'ice_maker'}, {'frequency': 'r', 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'id': 582, 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'id': 583, 'def': 'skate consisting of a boot with a steel blade fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'c', 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'id': 584, 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'id': 585, 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'f', 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'id': 586, 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 'smoothing_iron_(for_clothing)'], 'id': 587, 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'c', 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'id': 588, 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'id': 589, 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'c', 'synset': 'jam.n.01', 'synonyms': ['jam'], 'id': 590, 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'synset': 'jar.n.01', 'synonyms': ['jar'], 'id': 591, 'def': 'a vessel (usually cylindrical) with a wide mouth and without handles', 'name': 'jar'}, {'frequency': 'f', 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'id': 592, 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'id': 593, 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'id': 594, 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'id': 595, 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'id': 596, 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'r', 'synset': 'jewel.n.01', 'synonyms': ['jewel', 'gem', 'precious_stone'], 'id': 597, 'def': 'a precious or semiprecious stone incorporated into a piece of jewelry', 'name': 'jewel'}, {'frequency': 'c', 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'id': 598, 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'id': 599, 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, {'frequency': 'c', 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'id': 600, 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'synset': 'kayak.n.01', 'synonyms': ['kayak'], 'id': 601, 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'synset': 'keg.n.02', 'synonyms': ['keg'], 'id': 602, 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'id': 603, 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'id': 604, 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'synset': 'key.n.01', 'synonyms': ['key'], 'id': 605, 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'id': 606, 'def': 'a plastic card used to gain access typically to a door', 'name': 'keycard'}, {'frequency': 'c', 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'id': 607, 'def': 'a knee-length pleated tartan skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'id': 608, 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'id': 609, 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'r', 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'id': 610, 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'synset': 'kite.n.03', 'synonyms': ['kite'], 'id': 611, 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'id': 612, 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'id': 613, 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'id': 614, 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'synset': 'knife.n.01', 'synonyms': ['knife'], 'id': 615, 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'id': 616, 'def': 'needle consisting of a slender rod with pointed ends; usually used in pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'synset': 'knob.n.02', 'synonyms': ['knob'], 'id': 617, 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'id': 618, 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 'id': 619, 'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'id': 620, 'def': 'a light coat worn to protect clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'id': 621, 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'id': 622, 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'c', 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'id': 623, 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'f', 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'id': 624, 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'id': 625, 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'id': 626, 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'id': 627, 'def': 'a metal post supporting an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, {'frequency': 'f', 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'id': 628, 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'id': 629, 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'id': 630, 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'id': 631, 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'id': 632, 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'f', 'synset': 'latch.n.02', 'synonyms': ['latch'], 'id': 633, 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'id': 634, 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'synset': 'leather.n.01', 'synonyms': ['leather'], 'id': 635, 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'id': 636, 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'id': 637, 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'r', 'synset': 'legume.n.02', 'synonyms': ['legume'], 'id': 638, 'def': 'the fruit or seed of bean or pea plants', 'name': 'legume'}, {'frequency': 'f', 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'id': 639, 'def': 'yellow oval fruit with juicy acidic flesh', 'name': 'lemon'}, {'frequency': 'r', 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'id': 640, 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'id': 641, 'def': 'leafy plant commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'id': 642, 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'id': 643, 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'id': 644, 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'id': 645, 'def': 'lightblub/source of light', 'name': 'lightbulb'}, {'frequency': 'r', 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'id': 646, 'def': 'a metallic conductor that is attached to a high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'f', 'synset': 'lime.n.06', 'synonyms': ['lime'], 'id': 647, 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'id': 648, 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'c', 'synset': 'lion.n.01', 'synonyms': ['lion'], 'id': 649, 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'id': 650, 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'r', 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'id': 651, 'def': 'liquor or beer', 'name': 'liquor'}, {'frequency': 'c', 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'id': 652, 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'f', 'synset': 'log.n.01', 'synonyms': ['log'], 'id': 653, 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'id': 654, 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'f', 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'id': 655, 'def': 'electronic device that produces sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'id': 656, 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 'r', 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'id': 657, 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'id': 658, 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'id': 659, 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'c', 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'id': 660, 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'f', 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'id': 661, 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'synset': 'mallard.n.01', 'synonyms': ['mallard'], 'id': 662, 'def': 'wild dabbling duck from which domestic ducks are descended', 'name': 'mallard'}, {'frequency': 'r', 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'id': 663, 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'id': 664, 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'r', 'synset': 'manatee.n.01', 'synonyms': ['manatee'], 'id': 665, 'def': 'sirenian mammal of tropical coastal waters of America', 'name': 'manatee'}, {'frequency': 'c', 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'id': 666, 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'id': 667, 'def': 'a container (usually in a barn or stable) from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'id': 668, 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'f', 'synset': 'map.n.01', 'synonyms': ['map'], 'id': 669, 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'f', 'synset': 'marker.n.03', 'synonyms': ['marker'], 'id': 670, 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'synset': 'martini.n.01', 'synonyms': ['martini'], 'id': 671, 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'id': 672, 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'id': 673, 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'synset': 'masher.n.02', 'synonyms': ['masher'], 'id': 674, 'def': 'a kitchen utensil used for mashing (e.g. potatoes)', 'name': 'masher'}, {'frequency': 'f', 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'id': 675, 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'synset': 'mast.n.01', 'synonyms': ['mast'], 'id': 676, 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'id': 677, 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'id': 678, 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'id': 679, 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'id': 680, 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'id': 681, 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'synset': 'meatball.n.01', 'synonyms': ['meatball'], 'id': 682, 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'id': 683, 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'c', 'synset': 'melon.n.01', 'synonyms': ['melon'], 'id': 684, 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'id': 685, 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'id': 686, 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'id': 687, 'def': 'kitchen appliance that cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'id': 688, 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'f', 'synset': 'milk.n.01', 'synonyms': ['milk'], 'id': 689, 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'r', 'synset': 'milk_can.n.01', 'synonyms': ['milk_can'], 'id': 690, 'def': 'can for transporting milk', 'name': 'milk_can'}, {'frequency': 'r', 'synset': 'milkshake.n.01', 'synonyms': ['milkshake'], 'id': 691, 'def': 'frothy drink of milk and flavoring and sometimes fruit or ice cream', 'name': 'milkshake'}, {'frequency': 'f', 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'id': 692, 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'id': 693, 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'id': 694, 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'id': 695, 'def': 'glove that encases the thumb separately and the other four fingers together', 'name': 'mitten'}, {'frequency': 'c', 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'id': 696, 'def': 'a kitchen utensil that is used for mixing foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'synset': 'money.n.03', 'synonyms': ['money'], 'id': 697, 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'id': 698, 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'id': 699, 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'synset': 'motor.n.01', 'synonyms': ['motor'], 'id': 700, 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'id': 701, 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'id': 702, 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'f', 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'id': 703, 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'id': 704, 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'f', 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'id': 705, 'def': 'a computer input device that controls an on-screen pointer (does not include trackpads / touchpads)', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'id': 706, 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 'mousepad'}, {'frequency': 'c', 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'id': 707, 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'synset': 'mug.n.04', 'synonyms': ['mug'], 'id': 708, 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'id': 709, 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'id': 710, 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'c', 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'id': 711, 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'id': 712, 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'f', 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'id': 713, 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 'name': 'napkin'}, {'frequency': 'r', 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'id': 714, 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'id': 715, 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'id': 716, 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'c', 'synset': 'needle.n.03', 'synonyms': ['needle'], 'id': 717, 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'synset': 'nest.n.01', 'synonyms': ['nest'], 'id': 718, 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'f', 'synset': 'newspaper.n.01', 'synonyms': ['newspaper', 'paper_(newspaper)'], 'id': 719, 'def': 'a daily or weekly publication on folded sheets containing news, articles, and advertisements', 'name': 'newspaper'}, {'frequency': 'c', 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'id': 720, 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'id': 721, 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'id': 722, 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'c', 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'id': 723, 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'id': 724, 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 'synset': 'notepad.n.01', 'synonyms': ['notepad'], 'id': 725, 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'f', 'synset': 'nut.n.03', 'synonyms': ['nut'], 'id': 726, 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'id': 727, 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'f', 'synset': 'oar.n.01', 'synonyms': ['oar'], 'id': 728, 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'id': 729, 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'id': 730, 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'id': 731, 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'id': 732, 'def': 'oil from olives', 'name': 'olive_oil'}, {'frequency': 'r', 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'id': 733, 'def': 'beaten eggs cooked until just set; may be folded around e.g. ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'synset': 'onion.n.01', 'synonyms': ['onion'], 'id': 734, 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'id': 735, 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'id': 736, 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'c', 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'id': 737, 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'f', 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'id': 738, 'def': 'a thick standalone cushion used as a seat or footrest, often next to a chair', 'name': 'ottoman'}, {'frequency': 'f', 'synset': 'oven.n.01', 'synonyms': ['oven'], 'id': 739, 'def': 'kitchen appliance used for baking or roasting', 'name': 'oven'}, {'frequency': 'c', 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'id': 740, 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'synset': 'owl.n.01', 'synonyms': ['owl'], 'id': 741, 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'synset': 'packet.n.03', 'synonyms': ['packet'], 'id': 742, 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'id': 743, 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'synset': 'pad.n.04', 'synonyms': ['pad'], 'id': 744, 'def': 'mostly arm/knee pads labeled', 'name': 'pad'}, {'frequency': 'f', 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'id': 745, 'def': 'a short light oar used without an oarlock to propel a canoe or small boat', 'name': 'paddle'}, {'frequency': 'c', 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'id': 746, 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'c', 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'id': 747, 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'synset': 'painting.n.01', 'synonyms': ['painting'], 'id': 748, 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'f', 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'id': 749, 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'id': 750, 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'id': 751, 'def': 'cooking utensil consisting of a wide metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'id': 752, 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'id': 753, 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'id': 754, 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'synset': 'papaya.n.02', 'synonyms': ['papaya'], 'id': 755, 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'f', 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'id': 756, 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'id': 757, 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'id': 758, 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'id': 759, 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'id': 760, 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'c', 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'id': 761, 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'id': 762, 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'c', 'synset': 'parasol.n.01', 'synonyms': ['parasol', 'sunshade'], 'id': 763, 'def': 'a handheld collapsible source of shade', 'name': 'parasol'}, {'frequency': 'r', 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'id': 764, 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'c', 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'id': 765, 'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 'f', 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'id': 766, 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'id': 767, 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'id': 768, 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'id': 769, 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'c', 'synset': 'passport.n.02', 'synonyms': ['passport'], 'id': 770, 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home country', 'name': 'passport'}, {'frequency': 'f', 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'id': 771, 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'id': 772, 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'id': 773, 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'synset': 'peach.n.03', 'synonyms': ['peach'], 'id': 774, 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'id': 775, 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'f', 'synset': 'pear.n.01', 'synonyms': ['pear'], 'id': 776, 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'c', 'synset': 'peeler.n.03', 'synonyms': ['peeler_(tool_for_fruit_and_vegetables)'], 'id': 777, 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'synset': 'peg.n.04', 'synonyms': ['wooden_leg', 'pegleg'], 'id': 778, 'def': 'a prosthesis that replaces a missing leg', 'name': 'wooden_leg'}, {'frequency': 'r', 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'id': 779, 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'id': 780, 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'synset': 'pen.n.01', 'synonyms': ['pen'], 'id': 781, 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'f', 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'id': 782, 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'id': 783, 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'id': 784, 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'synset': 'pendulum.n.01', 'synonyms': ['pendulum'], 'id': 785, 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'id': 786, 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'id': 787, 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'id': 788, 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, {'frequency': 'f', 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'id': 789, 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'id': 790, 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'id': 791, 'def': 'a toiletry that emits and diffuses a fragrant odor', 'name': 'perfume'}, {'frequency': 'r', 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'id': 792, 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'synset': 'person.n.01', 'synonyms': ['person', 'baby', 'child', 'boy', 'girl', 'man', 'woman', 'human'], 'id': 793, 'def': 'a human being', 'name': 'person'}, {'frequency': 'c', 'synset': 'pet.n.01', 'synonyms': ['pet'], 'id': 794, 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'c', 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'id': 795, 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'id': 796, 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'id': 797, 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'f', 'synset': 'piano.n.01', 'synonyms': ['piano'], 'id': 798, 'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'id': 799, 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'id': 800, 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'synset': 'pie.n.01', 'synonyms': ['pie'], 'id': 801, 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'id': 802, 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'id': 803, 'def': "a child's coin bank (often shaped like a pig)", 'name': 'piggy_bank'}, {'frequency': 'f', 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'id': 804, 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'id': 805, 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'id': 806, 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'id': 807, 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'id': 808, 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'id': 809, 'def': 'a toy consisting of vanes of colored paper or plastic that is pinned to a stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'id': 810, 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'id': 811, 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'id': 812, 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'c', 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'id': 813, 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'id': 814, 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'id': 815, 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'id': 816, 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'id': 817, 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'synset': 'plate.n.04', 'synonyms': ['plate'], 'id': 818, 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'synset': 'platter.n.01', 'synonyms': ['platter'], 'id': 819, 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'id': 820, 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'id': 821, 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'id': 822, 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'synset': 'plume.n.02', 'synonyms': ['plume'], 'id': 823, 'def': 'a feather or cluster of feathers worn as an ornament', 'name': 'plume'}, {'frequency': 'r', 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'id': 824, 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'id': 825, 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'id': 826, 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'synset': 'pole.n.01', 'synonyms': ['pole', 'post'], 'id': 827, 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'f', 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'id': 828, 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'id': 829, 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'synset': 'pony.n.05', 'synonyms': ['pony'], 'id': 830, 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'id': 831, 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'id': 832, 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'c', 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'id': 833, 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'id': 834, 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'id': 835, 'def': 'a sign posted in a public place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'synset': 'pot.n.01', 'synonyms': ['pot'], 'id': 836, 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'id': 837, 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'synset': 'potato.n.01', 'synonyms': ['potato'], 'id': 838, 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'id': 839, 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'id': 840, 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'id': 841, 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'c', 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'id': 842, 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'id': 843, 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'c', 'synset': 'pretzel.n.01', 'synonyms': ['pretzel'], 'id': 844, 'def': 'glazed and salted cracker typically in the shape of a loose knot', 'name': 'pretzel'}, {'frequency': 'f', 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'id': 845, 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'id': 846, 'def': 'a weapon that is forcibly thrown or projected at a targets', 'name': 'projectile_(weapon)'}, {'frequency': 'c', 'synset': 'projector.n.02', 'synonyms': ['projector'], 'id': 847, 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'synset': 'propeller.n.01', 'synonyms': ['propeller', 'propellor'], 'id': 848, 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'synset': 'prune.n.01', 'synonyms': ['prune'], 'id': 849, 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'id': 850, 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'id': 851, 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'id': 852, 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'id': 853, 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'id': 854, 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'id': 855, 'def': 'a tool for making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'id': 856, 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'c', 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'id': 857, 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'id': 858, 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'id': 859, 'def': 'a tart filled with rich unsweetened custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'id': 860, 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'id': 861, 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'id': 862, 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'id': 863, 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'synset': 'radar.n.01', 'synonyms': ['radar'], 'id': 864, 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'f', 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'id': 865, 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, {'frequency': 'c', 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'id': 866, 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'id': 867, 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'synset': 'raft.n.01', 'synonyms': ['raft'], 'id': 868, 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'id': 869, 'def': 'a cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'id': 870, 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'id': 871, 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'id': 872, 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'synset': 'rat.n.01', 'synonyms': ['rat'], 'id': 873, 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'id': 874, 'def': 'a blade that has very sharp edge', 'name': 'razorblade'}, {'frequency': 'c', 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'id': 875, 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'id': 876, 'def': 'vehicle mirror (side or rearview)', 'name': 'rearview_mirror'}, {'frequency': 'c', 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'id': 877, 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'id': 878, 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'c', 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'id': 879, 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified acoustically or electronically', 'name': 'record_player'}, {'frequency': 'f', 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'id': 880, 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'id': 881, 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'id': 882, 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'id': 883, 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'c', 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'id': 884, 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'synset': 'ring.n.08', 'synonyms': ['ring'], 'id': 885, 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'id': 886, 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'id': 887, 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'synset': 'robe.n.01', 'synonyms': ['robe'], 'id': 888, 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'id': 889, 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'synset': 'rodent.n.01', 'synonyms': ['rodent'], 'id': 890, 'def': 'relatively small placental mammals having a single pair of constantly growing incisor teeth specialized for gnawing', 'name': 'rodent'}, {'frequency': 'r', 'synset': 'roller_skate.n.01', 'synonyms': ['roller_skate'], 'id': 891, 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'id': 892, 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'id': 893, 'def': 'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'id': 894, 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'id': 895, 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'id': 896, 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'id': 897, 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'id': 898, 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'id': 899, 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'id': 900, 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'id': 901, 'def': 'a large bag (or pair of bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'id': 902, 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'f', 'synset': 'sail.n.01', 'synonyms': ['sail'], 'id': 903, 'def': 'a large piece of fabric by means of which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'f', 'synset': 'salad.n.01', 'synonyms': ['salad'], 'id': 904, 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'id': 905, 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'c', 'synset': 'salami.n.01', 'synonyms': ['salami'], 'id': 906, 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'c', 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'id': 907, 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'id': 908, 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'c', 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'id': 909, 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'id': 910, 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'id': 911, 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, {'frequency': 'f', 'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'id': 912, 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'id': 913, 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'id': 914, 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'id': 915, 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'id': 916, 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'id': 917, 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'id': 918, 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'id': 919, 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'id': 920, 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'id': 921, 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'id': 922, 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, {'frequency': 'f', 'synset': 'scissors.n.01', 'synonyms': ['scissors'], 'id': 923, 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'f', 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'id': 924, 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'r', 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'id': 925, 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'c', 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'id': 926, 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'f', 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'id': 927, 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'id': 928, 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'c', 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'id': 929, 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'c', 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'id': 930, 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'id': 931, 'def': 'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'synset': 'seashell.n.01', 'synonyms': ['seashell'], 'id': 932, 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'c', 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'id': 933, 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'c', 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'id': 934, 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'id': 935, 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'c', 'synset': 'shark.n.01', 'synonyms': ['shark'], 'id': 936, 'def': 'typically large carnivorous fishes with sharpe teeth', 'name': 'shark'}, {'frequency': 'r', 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'id': 937, 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'id': 938, 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'synset': 'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'id': 939, 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'id': 940, 'def': 'toiletry consisting that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'id': 941, 'def': 'cloak consisting of an oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'synset': 'shears.n.01', 'synonyms': ['shears'], 'id': 942, 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'id': 943, 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'id': 944, 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'id': 945, 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'c', 'synset': 'shield.n.02', 'synonyms': ['shield'], 'id': 946, 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'id': 947, 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'id': 948, 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'f', 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'id': 949, 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'id': 950, 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'synset': 'short_pants.n.01', 'synonyms': ['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'id': 951, 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'id': 952, 'def': 'a small glass adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'f', 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'id': 953, 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'id': 954, 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'id': 955, 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'r', 'synset': 'shower_cap.n.01', 'synonyms': ['shower_cap'], 'id': 956, 'def': 'a tight cap worn to keep hair dry while showering', 'name': 'shower_cap'}, {'frequency': 'f', 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'id': 957, 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'id': 958, 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'f', 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'id': 959, 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'synset': 'silo.n.01', 'synonyms': ['silo'], 'id': 960, 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'synset': 'sink.n.01', 'synonyms': ['sink'], 'id': 961, 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'id': 962, 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'id': 963, 'def': 'a long pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'synset': 'ski.n.01', 'synonyms': ['ski'], 'id': 964, 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'id': 965, 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'id': 966, 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'id': 967, 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'id': 968, 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'r', 'synset': 'skullcap.n.01', 'synonyms': ['skullcap'], 'id': 969, 'def': 'rounded brimless cap fitting the crown of the head', 'name': 'skullcap'}, {'frequency': 'c', 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'id': 970, 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'synset': 'sleeping_bag.n.01', 'synonyms': ['sleeping_bag'], 'id': 971, 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'id': 972, 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'id': 973, 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'id': 974, 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'synset': 'snake.n.01', 'synonyms': ['snake', 'serpent'], 'id': 975, 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'id': 976, 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'id': 977, 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'id': 978, 'def': 'tracked vehicle for travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'synset': 'soap.n.01', 'synonyms': ['soap'], 'id': 979, 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'id': 980, 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'synset': 'sock.n.01', 'synonyms': ['sock'], 'id': 981, 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'f', 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'id': 982, 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'synset': 'softball.n.01', 'synonyms': ['softball'], 'id': 983, 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'synset': 'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'id': 984, 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'id': 985, 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'f', 'synset': 'soup.n.01', 'synonyms': ['soup'], 'id': 986, 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'id': 987, 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'id': 988, 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'id': 989, 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 'soybean_milk', 'soymilk'], 'id': 990, 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'id': 991, 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'id': 992, 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'id': 993, 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'id': 994, 'def': 'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 'eyeglasses', 'glasses'], 'id': 995, 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'id': 996, 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'c', 'synset': 'spider.n.01', 'synonyms': ['spider'], 'id': 997, 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'r', 'synset': 'spiny_lobster.n.02', 'synonyms': ['crawfish', 'crayfish'], 'id': 998, 'def': 'large edible marine crustacean having a spiny carapace but lacking the large pincers of true lobsters', 'name': 'crawfish'}, {'frequency': 'c', 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'id': 999, 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'id': 1000, 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'id': 1001, 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'id': 1002, 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'synset': 'squid.n.01', 'synonyms': ['squid_(food)', 'calamari', 'calamary'], 'id': 1003, 'def': '(Italian cuisine) squid prepared as food', 'name': 'squid_(food)'}, {'frequency': 'c', 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'id': 1004, 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'r', 'synset': 'stagecoach.n.01', 'synonyms': ['stagecoach'], 'id': 1005, 'def': 'a large coach-and-four formerly used to carry passengers and mail on regular routes between towns', 'name': 'stagecoach'}, {'frequency': 'c', 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'id': 1006, 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'c', 'synset': 'starfish.n.01', 'synonyms': ['starfish', 'sea_star'], 'id': 1007, 'def': 'echinoderms characterized by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'id': 1008, 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'id': 1009, 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'id': 1010, 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'f', 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'id': 1011, 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'id': 1012, 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'id': 1013, 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'id': 1014, 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'synset': 'stew.n.02', 'synonyms': ['stew'], 'id': 1015, 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'id': 1016, 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'id': 1017, 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'f', 'synset': 'stool.n.01', 'synonyms': ['stool'], 'id': 1018, 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'id': 1019, 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'id': 1020, 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'synset': 'stove.n.01', 'synonyms': ['stove', 'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'id': 1021, 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'id': 1022, 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'synset': 'strap.n.01', 'synonyms': ['strap'], 'id': 1023, 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'id': 1024, 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'id': 1025, 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'id': 1026, 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'id': 1027, 'def': 'a lamp supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'id': 1028, 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'id': 1029, 'def': 'a pointed tool for writing or drawing or engraving, including pens', 'name': 'stylus'}, {'frequency': 'r', 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'id': 1030, 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'id': 1031, 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'synset': 'sugarcane.n.01', 'synonyms': ['sugarcane_(plant)'], 'id': 1032, 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'f', 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'id': 1033, 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'id': 1034, 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'id': 1035, 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'id': 1036, 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'f', 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'id': 1037, 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'id': 1038, 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'synset': 'swab.n.02', 'synonyms': ['mop'], 'id': 1039, 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'id': 1040, 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'synset': 'sweatband.n.02', 'synonyms': ['sweatband'], 'id': 1041, 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'id': 1042, 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'id': 1043, 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'id': 1044, 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'id': 1045, 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 'c', 'synset': 'sword.n.01', 'synonyms': ['sword'], 'id': 1046, 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'id': 1047, 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'id': 1048, 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'id': 1049, 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'synset': 'table.n.02', 'synonyms': ['table'], 'id': 1050, 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'id': 1051, 'def': 'a lamp that sits on a table', 'name': 'table_lamp'}, {'frequency': 'f', 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'id': 1052, 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'id': 1053, 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'synset': 'taco.n.02', 'synonyms': ['taco'], 'id': 1054, 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'synset': 'tag.n.02', 'synonyms': ['tag'], 'id': 1055, 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'id': 1056, 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'id': 1057, 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'id': 1058, 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'f', 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'id': 1059, 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'id': 1060, 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'f', 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'id': 1061, 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 'c', 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'id': 1062, 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'id': 1063, 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 'id': 1064, 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'id': 1065, 'def': 'a cloth having a crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'id': 1066, 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'c', 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'id': 1067, 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'id': 1068, 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'id': 1069, 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'f', 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'id': 1070, 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'id': 1071, 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'id': 1072, 'def': 'electronic device for communicating by voice over long distances (includes wired and wireless/cell phones)', 'name': 'telephone'}, {'frequency': 'c', 'synset': 'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'id': 1073, 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'id': 1074, 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'id': 1075, 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'id': 1076, 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'id': 1077, 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'id': 1078, 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'id': 1079, 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'id': 1080, 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'id': 1081, 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'id': 1082, 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'f', 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'id': 1083, 'def': 'a regulator for automatically regulating temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'id': 1084, 'def': 'a small metal cap to protect the finger while sewing; can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'id': 1085, 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'id': 1086, 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'id': 1087, 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'id': 1088, 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'id': 1089, 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'id': 1090, 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'id': 1091, 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'c', 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'id': 1092, 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'id': 1093, 'def': 'a soft thin (usually translucent) paper', 'name': 'tissue_paper'}, {'frequency': 'c', 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'id': 1094, 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'id': 1095, 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'f', 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'id': 1096, 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'id': 1097, 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'id': 1098, 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'id': 1099, 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'f', 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'id': 1100, 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'id': 1101, 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'id': 1102, 'def': 'small brush; has long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'id': 1103, 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, {'frequency': 'f', 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'id': 1104, 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 'name': 'toothpick'}, {'frequency': 'f', 'synset': 'top.n.09', 'synonyms': ['cover'], 'id': 1105, 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'id': 1106, 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'id': 1107, 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'synset': 'towel.n.01', 'synonyms': ['towel'], 'id': 1108, 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'id': 1109, 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'synset': 'toy.n.03', 'synonyms': ['toy'], 'id': 1110, 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'id': 1111, 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'id': 1112, 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'c', 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'id': 1113, 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'f', 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'id': 1114, 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, {'frequency': 'f', 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'id': 1115, 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'id': 1116, 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'synset': 'tray.n.01', 'synonyms': ['tray'], 'id': 1117, 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'id': 1118, 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'id': 1119, 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'c', 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'id': 1120, 'def': 'a vehicle with three wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'f', 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'id': 1121, 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'id': 1122, 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'synset': 'truck.n.01', 'synonyms': ['truck'], 'id': 1123, 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'synset': 'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'id': 1124, 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'id': 1125, 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'synset': 'tub.n.02', 'synonyms': ['vat'], 'id': 1126, 'def': 'a large vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'synset': 'turban.n.01', 'synonyms': ['turban'], 'id': 1127, 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'c', 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'id': 1128, 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'id': 1129, 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'id': 1130, 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'c', 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'id': 1131, 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'c', 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'id': 1132, 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'id': 1133, 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, {'frequency': 'f', 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'id': 1134, 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'id': 1135, 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'f', 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'id': 1136, 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'c', 'synset': 'urn.n.01', 'synonyms': ['urn'], 'id': 1137, 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'id': 1138, 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'f', 'synset': 'vase.n.01', 'synonyms': ['vase'], 'id': 1139, 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'synset': 'vending_machine.n.01', 'synonyms': ['vending_machine'], 'id': 1140, 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'id': 1141, 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'f', 'synset': 'vest.n.01', 'synonyms': ['vest', 'waistcoat'], 'id': 1142, 'def': "a man's sleeveless garment worn underneath a coat", 'name': 'vest'}, {'frequency': 'c', 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'id': 1143, 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'id': 1144, 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, {'frequency': 'r', 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'id': 1145, 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'id': 1146, 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'c', 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'id': 1147, 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'id': 1148, 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'id': 1149, 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'id': 1150, 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'id': 1151, 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'id': 1152, 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'id': 1153, 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'id': 1154, 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'id': 1155, 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, {'frequency': 'f', 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'id': 1156, 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'id': 1157, 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'id': 1158, 'def': 'a tall piece of furniture that provides storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'synset': 'washbasin.n.01', 'synonyms': ['washbasin', 'basin_(for_washing)', 'washbowl', 'washstand', 'handbasin'], 'id': 1159, 'def': 'a bathroom sink that is permanently installed and connected to a water supply and drainpipe; where you can wash your hands and face', 'name': 'washbasin'}, {'frequency': 'c', 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'id': 1160, 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'id': 1161, 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'id': 1162, 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'id': 1163, 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'id': 1164, 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'synset': 'water_heater.n.01', 'synonyms': ['water_heater', 'hot-water_heater'], 'id': 1165, 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'c', 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'id': 1166, 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'id': 1167, 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'id': 1168, 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'id': 1169, 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'id': 1170, 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'id': 1171, 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'f', 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'id': 1172, 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'id': 1173, 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'id': 1174, 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'synset': 'wedding_cake.n.01', 'synonyms': ['wedding_cake', 'bridecake'], 'id': 1175, 'def': 'a rich cake with two or more tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'id': 1176, 'def': 'a ring given to the bride and/or groom at the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'id': 1177, 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'id': 1178, 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'id': 1179, 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'id': 1180, 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'c', 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'id': 1181, 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'c', 'synset': 'wig.n.01', 'synonyms': ['wig'], 'id': 1182, 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'id': 1183, 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'id': 1184, 'def': 'A mill or turbine that is powered by wind', 'name': 'windmill'}, {'frequency': 'c', 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'id': 1185, 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'id': 1186, 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'id': 1187, 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'id': 1188, 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'c', 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'id': 1189, 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'id': 1190, 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'f', 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 'id': 1191, 'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'synset': 'wok.n.01', 'synonyms': ['wok'], 'id': 1192, 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'id': 1193, 'def': 'a wild carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'id': 1194, 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'id': 1195, 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 'name': 'wreath'}, {'frequency': 'c', 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'id': 1196, 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'f', 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'id': 1197, 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'id': 1198, 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'c', 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'id': 1199, 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'c', 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'id': 1200, 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'c', 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'id': 1201, 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'id': 1202, 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'id': 1203, 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}] # noqa # fmt: on ================================================ FILE: annotator/oneformer/detectron2/data/datasets/lvis_v1_category_image_count.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Autogen with # with open("lvis_v1_train.json", "r") as f: # a = json.load(f) # c = a["categories"] # for x in c: # del x["name"] # del x["instance_count"] # del x["def"] # del x["synonyms"] # del x["frequency"] # del x["synset"] # LVIS_CATEGORY_IMAGE_COUNT = repr(c) + " # noqa" # with open("/tmp/lvis_category_image_count.py", "wt") as f: # f.write(f"LVIS_CATEGORY_IMAGE_COUNT = {LVIS_CATEGORY_IMAGE_COUNT}") # Then paste the contents of that file below # fmt: off LVIS_CATEGORY_IMAGE_COUNT = [{'id': 1, 'image_count': 64}, {'id': 2, 'image_count': 364}, {'id': 3, 'image_count': 1911}, {'id': 4, 'image_count': 149}, {'id': 5, 'image_count': 29}, {'id': 6, 'image_count': 26}, {'id': 7, 'image_count': 59}, {'id': 8, 'image_count': 22}, {'id': 9, 'image_count': 12}, {'id': 10, 'image_count': 28}, {'id': 11, 'image_count': 505}, {'id': 12, 'image_count': 1207}, {'id': 13, 'image_count': 4}, {'id': 14, 'image_count': 10}, {'id': 15, 'image_count': 500}, {'id': 16, 'image_count': 33}, {'id': 17, 'image_count': 3}, {'id': 18, 'image_count': 44}, {'id': 19, 'image_count': 561}, {'id': 20, 'image_count': 8}, {'id': 21, 'image_count': 9}, {'id': 22, 'image_count': 33}, {'id': 23, 'image_count': 1883}, {'id': 24, 'image_count': 98}, {'id': 25, 'image_count': 70}, {'id': 26, 'image_count': 46}, {'id': 27, 'image_count': 117}, {'id': 28, 'image_count': 41}, {'id': 29, 'image_count': 1395}, {'id': 30, 'image_count': 7}, {'id': 31, 'image_count': 1}, {'id': 32, 'image_count': 314}, {'id': 33, 'image_count': 31}, {'id': 34, 'image_count': 1905}, {'id': 35, 'image_count': 1859}, {'id': 36, 'image_count': 1623}, {'id': 37, 'image_count': 47}, {'id': 38, 'image_count': 3}, {'id': 39, 'image_count': 3}, {'id': 40, 'image_count': 1}, {'id': 41, 'image_count': 305}, {'id': 42, 'image_count': 6}, {'id': 43, 'image_count': 210}, {'id': 44, 'image_count': 36}, {'id': 45, 'image_count': 1787}, {'id': 46, 'image_count': 17}, {'id': 47, 'image_count': 51}, {'id': 48, 'image_count': 138}, {'id': 49, 'image_count': 3}, {'id': 50, 'image_count': 1470}, {'id': 51, 'image_count': 3}, {'id': 52, 'image_count': 2}, {'id': 53, 'image_count': 186}, {'id': 54, 'image_count': 76}, {'id': 55, 'image_count': 26}, {'id': 56, 'image_count': 303}, {'id': 57, 'image_count': 738}, {'id': 58, 'image_count': 1799}, {'id': 59, 'image_count': 1934}, {'id': 60, 'image_count': 1609}, {'id': 61, 'image_count': 1622}, {'id': 62, 'image_count': 41}, {'id': 63, 'image_count': 4}, {'id': 64, 'image_count': 11}, {'id': 65, 'image_count': 270}, {'id': 66, 'image_count': 349}, {'id': 67, 'image_count': 42}, {'id': 68, 'image_count': 823}, {'id': 69, 'image_count': 6}, {'id': 70, 'image_count': 48}, {'id': 71, 'image_count': 3}, {'id': 72, 'image_count': 42}, {'id': 73, 'image_count': 24}, {'id': 74, 'image_count': 16}, {'id': 75, 'image_count': 605}, {'id': 76, 'image_count': 646}, {'id': 77, 'image_count': 1765}, {'id': 78, 'image_count': 2}, {'id': 79, 'image_count': 125}, {'id': 80, 'image_count': 1420}, {'id': 81, 'image_count': 140}, {'id': 82, 'image_count': 4}, {'id': 83, 'image_count': 322}, {'id': 84, 'image_count': 60}, {'id': 85, 'image_count': 2}, {'id': 86, 'image_count': 231}, {'id': 87, 'image_count': 333}, {'id': 88, 'image_count': 1941}, {'id': 89, 'image_count': 367}, {'id': 90, 'image_count': 1922}, {'id': 91, 'image_count': 18}, {'id': 92, 'image_count': 81}, {'id': 93, 'image_count': 1}, {'id': 94, 'image_count': 1852}, {'id': 95, 'image_count': 430}, {'id': 96, 'image_count': 247}, {'id': 97, 'image_count': 94}, {'id': 98, 'image_count': 21}, {'id': 99, 'image_count': 1821}, {'id': 100, 'image_count': 16}, {'id': 101, 'image_count': 12}, {'id': 102, 'image_count': 25}, {'id': 103, 'image_count': 41}, {'id': 104, 'image_count': 244}, {'id': 105, 'image_count': 7}, {'id': 106, 'image_count': 1}, {'id': 107, 'image_count': 40}, {'id': 108, 'image_count': 40}, {'id': 109, 'image_count': 104}, {'id': 110, 'image_count': 1671}, {'id': 111, 'image_count': 49}, {'id': 112, 'image_count': 243}, {'id': 113, 'image_count': 2}, {'id': 114, 'image_count': 242}, {'id': 115, 'image_count': 271}, {'id': 116, 'image_count': 104}, {'id': 117, 'image_count': 8}, {'id': 118, 'image_count': 1758}, {'id': 119, 'image_count': 1}, {'id': 120, 'image_count': 48}, {'id': 121, 'image_count': 14}, {'id': 122, 'image_count': 40}, {'id': 123, 'image_count': 1}, {'id': 124, 'image_count': 37}, {'id': 125, 'image_count': 1510}, {'id': 126, 'image_count': 6}, {'id': 127, 'image_count': 1903}, {'id': 128, 'image_count': 70}, {'id': 129, 'image_count': 86}, {'id': 130, 'image_count': 7}, {'id': 131, 'image_count': 5}, {'id': 132, 'image_count': 1406}, {'id': 133, 'image_count': 1901}, {'id': 134, 'image_count': 15}, {'id': 135, 'image_count': 28}, {'id': 136, 'image_count': 6}, {'id': 137, 'image_count': 494}, {'id': 138, 'image_count': 234}, {'id': 139, 'image_count': 1922}, {'id': 140, 'image_count': 1}, {'id': 141, 'image_count': 35}, {'id': 142, 'image_count': 5}, {'id': 143, 'image_count': 1828}, {'id': 144, 'image_count': 8}, {'id': 145, 'image_count': 63}, {'id': 146, 'image_count': 1668}, {'id': 147, 'image_count': 4}, {'id': 148, 'image_count': 95}, {'id': 149, 'image_count': 17}, {'id': 150, 'image_count': 1567}, {'id': 151, 'image_count': 2}, {'id': 152, 'image_count': 103}, {'id': 153, 'image_count': 50}, {'id': 154, 'image_count': 1309}, {'id': 155, 'image_count': 6}, {'id': 156, 'image_count': 92}, {'id': 157, 'image_count': 19}, {'id': 158, 'image_count': 37}, {'id': 159, 'image_count': 4}, {'id': 160, 'image_count': 709}, {'id': 161, 'image_count': 9}, {'id': 162, 'image_count': 82}, {'id': 163, 'image_count': 15}, {'id': 164, 'image_count': 3}, {'id': 165, 'image_count': 61}, {'id': 166, 'image_count': 51}, {'id': 167, 'image_count': 5}, {'id': 168, 'image_count': 13}, {'id': 169, 'image_count': 642}, {'id': 170, 'image_count': 24}, {'id': 171, 'image_count': 255}, {'id': 172, 'image_count': 9}, {'id': 173, 'image_count': 1808}, {'id': 174, 'image_count': 31}, {'id': 175, 'image_count': 158}, {'id': 176, 'image_count': 80}, {'id': 177, 'image_count': 1884}, {'id': 178, 'image_count': 158}, {'id': 179, 'image_count': 2}, {'id': 180, 'image_count': 12}, {'id': 181, 'image_count': 1659}, {'id': 182, 'image_count': 7}, {'id': 183, 'image_count': 834}, {'id': 184, 'image_count': 57}, {'id': 185, 'image_count': 174}, {'id': 186, 'image_count': 95}, {'id': 187, 'image_count': 27}, {'id': 188, 'image_count': 22}, {'id': 189, 'image_count': 1391}, {'id': 190, 'image_count': 90}, {'id': 191, 'image_count': 40}, {'id': 192, 'image_count': 445}, {'id': 193, 'image_count': 21}, {'id': 194, 'image_count': 1132}, {'id': 195, 'image_count': 177}, {'id': 196, 'image_count': 4}, {'id': 197, 'image_count': 17}, {'id': 198, 'image_count': 84}, {'id': 199, 'image_count': 55}, {'id': 200, 'image_count': 30}, {'id': 201, 'image_count': 25}, {'id': 202, 'image_count': 2}, {'id': 203, 'image_count': 125}, {'id': 204, 'image_count': 1135}, {'id': 205, 'image_count': 19}, {'id': 206, 'image_count': 72}, {'id': 207, 'image_count': 1926}, {'id': 208, 'image_count': 159}, {'id': 209, 'image_count': 7}, {'id': 210, 'image_count': 1}, {'id': 211, 'image_count': 13}, {'id': 212, 'image_count': 35}, {'id': 213, 'image_count': 18}, {'id': 214, 'image_count': 8}, {'id': 215, 'image_count': 6}, {'id': 216, 'image_count': 35}, {'id': 217, 'image_count': 1222}, {'id': 218, 'image_count': 103}, {'id': 219, 'image_count': 28}, {'id': 220, 'image_count': 63}, {'id': 221, 'image_count': 28}, {'id': 222, 'image_count': 5}, {'id': 223, 'image_count': 7}, {'id': 224, 'image_count': 14}, {'id': 225, 'image_count': 1918}, {'id': 226, 'image_count': 133}, {'id': 227, 'image_count': 16}, {'id': 228, 'image_count': 27}, {'id': 229, 'image_count': 110}, {'id': 230, 'image_count': 1895}, {'id': 231, 'image_count': 4}, {'id': 232, 'image_count': 1927}, {'id': 233, 'image_count': 8}, {'id': 234, 'image_count': 1}, {'id': 235, 'image_count': 263}, {'id': 236, 'image_count': 10}, {'id': 237, 'image_count': 2}, {'id': 238, 'image_count': 3}, {'id': 239, 'image_count': 87}, {'id': 240, 'image_count': 9}, {'id': 241, 'image_count': 71}, {'id': 242, 'image_count': 13}, {'id': 243, 'image_count': 18}, {'id': 244, 'image_count': 2}, {'id': 245, 'image_count': 5}, {'id': 246, 'image_count': 45}, {'id': 247, 'image_count': 1}, {'id': 248, 'image_count': 23}, {'id': 249, 'image_count': 32}, {'id': 250, 'image_count': 4}, {'id': 251, 'image_count': 1}, {'id': 252, 'image_count': 858}, {'id': 253, 'image_count': 661}, {'id': 254, 'image_count': 168}, {'id': 255, 'image_count': 210}, {'id': 256, 'image_count': 65}, {'id': 257, 'image_count': 4}, {'id': 258, 'image_count': 2}, {'id': 259, 'image_count': 159}, {'id': 260, 'image_count': 31}, {'id': 261, 'image_count': 811}, {'id': 262, 'image_count': 1}, {'id': 263, 'image_count': 42}, {'id': 264, 'image_count': 27}, {'id': 265, 'image_count': 2}, {'id': 266, 'image_count': 5}, {'id': 267, 'image_count': 95}, {'id': 268, 'image_count': 32}, {'id': 269, 'image_count': 1}, {'id': 270, 'image_count': 1}, {'id': 271, 'image_count': 1844}, {'id': 272, 'image_count': 897}, {'id': 273, 'image_count': 31}, {'id': 274, 'image_count': 23}, {'id': 275, 'image_count': 1}, {'id': 276, 'image_count': 202}, {'id': 277, 'image_count': 746}, {'id': 278, 'image_count': 44}, {'id': 279, 'image_count': 14}, {'id': 280, 'image_count': 26}, {'id': 281, 'image_count': 1}, {'id': 282, 'image_count': 2}, {'id': 283, 'image_count': 25}, {'id': 284, 'image_count': 238}, {'id': 285, 'image_count': 592}, {'id': 286, 'image_count': 26}, {'id': 287, 'image_count': 5}, {'id': 288, 'image_count': 42}, {'id': 289, 'image_count': 13}, {'id': 290, 'image_count': 46}, {'id': 291, 'image_count': 1}, {'id': 292, 'image_count': 8}, {'id': 293, 'image_count': 34}, {'id': 294, 'image_count': 5}, {'id': 295, 'image_count': 1}, {'id': 296, 'image_count': 1871}, {'id': 297, 'image_count': 717}, {'id': 298, 'image_count': 1010}, {'id': 299, 'image_count': 679}, {'id': 300, 'image_count': 3}, {'id': 301, 'image_count': 4}, {'id': 302, 'image_count': 1}, {'id': 303, 'image_count': 166}, {'id': 304, 'image_count': 2}, {'id': 305, 'image_count': 266}, {'id': 306, 'image_count': 101}, {'id': 307, 'image_count': 6}, {'id': 308, 'image_count': 14}, {'id': 309, 'image_count': 133}, {'id': 310, 'image_count': 2}, {'id': 311, 'image_count': 38}, {'id': 312, 'image_count': 95}, {'id': 313, 'image_count': 1}, {'id': 314, 'image_count': 12}, {'id': 315, 'image_count': 49}, {'id': 316, 'image_count': 5}, {'id': 317, 'image_count': 5}, {'id': 318, 'image_count': 16}, {'id': 319, 'image_count': 216}, {'id': 320, 'image_count': 12}, {'id': 321, 'image_count': 1}, {'id': 322, 'image_count': 54}, {'id': 323, 'image_count': 5}, {'id': 324, 'image_count': 245}, {'id': 325, 'image_count': 12}, {'id': 326, 'image_count': 7}, {'id': 327, 'image_count': 35}, {'id': 328, 'image_count': 36}, {'id': 329, 'image_count': 32}, {'id': 330, 'image_count': 1027}, {'id': 331, 'image_count': 10}, {'id': 332, 'image_count': 12}, {'id': 333, 'image_count': 1}, {'id': 334, 'image_count': 67}, {'id': 335, 'image_count': 71}, {'id': 336, 'image_count': 30}, {'id': 337, 'image_count': 48}, {'id': 338, 'image_count': 249}, {'id': 339, 'image_count': 13}, {'id': 340, 'image_count': 29}, {'id': 341, 'image_count': 14}, {'id': 342, 'image_count': 236}, {'id': 343, 'image_count': 15}, {'id': 344, 'image_count': 1521}, {'id': 345, 'image_count': 25}, {'id': 346, 'image_count': 249}, {'id': 347, 'image_count': 139}, {'id': 348, 'image_count': 2}, {'id': 349, 'image_count': 2}, {'id': 350, 'image_count': 1890}, {'id': 351, 'image_count': 1240}, {'id': 352, 'image_count': 1}, {'id': 353, 'image_count': 9}, {'id': 354, 'image_count': 1}, {'id': 355, 'image_count': 3}, {'id': 356, 'image_count': 11}, {'id': 357, 'image_count': 4}, {'id': 358, 'image_count': 236}, {'id': 359, 'image_count': 44}, {'id': 360, 'image_count': 19}, {'id': 361, 'image_count': 1100}, {'id': 362, 'image_count': 7}, {'id': 363, 'image_count': 69}, {'id': 364, 'image_count': 2}, {'id': 365, 'image_count': 8}, {'id': 366, 'image_count': 5}, {'id': 367, 'image_count': 227}, {'id': 368, 'image_count': 6}, {'id': 369, 'image_count': 106}, {'id': 370, 'image_count': 81}, {'id': 371, 'image_count': 17}, {'id': 372, 'image_count': 134}, {'id': 373, 'image_count': 312}, {'id': 374, 'image_count': 8}, {'id': 375, 'image_count': 271}, {'id': 376, 'image_count': 2}, {'id': 377, 'image_count': 103}, {'id': 378, 'image_count': 1938}, {'id': 379, 'image_count': 574}, {'id': 380, 'image_count': 120}, {'id': 381, 'image_count': 2}, {'id': 382, 'image_count': 2}, {'id': 383, 'image_count': 13}, {'id': 384, 'image_count': 29}, {'id': 385, 'image_count': 1710}, {'id': 386, 'image_count': 66}, {'id': 387, 'image_count': 1008}, {'id': 388, 'image_count': 1}, {'id': 389, 'image_count': 3}, {'id': 390, 'image_count': 1942}, {'id': 391, 'image_count': 19}, {'id': 392, 'image_count': 1488}, {'id': 393, 'image_count': 46}, {'id': 394, 'image_count': 106}, {'id': 395, 'image_count': 115}, {'id': 396, 'image_count': 19}, {'id': 397, 'image_count': 2}, {'id': 398, 'image_count': 1}, {'id': 399, 'image_count': 28}, {'id': 400, 'image_count': 9}, {'id': 401, 'image_count': 192}, {'id': 402, 'image_count': 12}, {'id': 403, 'image_count': 21}, {'id': 404, 'image_count': 247}, {'id': 405, 'image_count': 6}, {'id': 406, 'image_count': 64}, {'id': 407, 'image_count': 7}, {'id': 408, 'image_count': 40}, {'id': 409, 'image_count': 542}, {'id': 410, 'image_count': 2}, {'id': 411, 'image_count': 1898}, {'id': 412, 'image_count': 36}, {'id': 413, 'image_count': 4}, {'id': 414, 'image_count': 1}, {'id': 415, 'image_count': 191}, {'id': 416, 'image_count': 6}, {'id': 417, 'image_count': 41}, {'id': 418, 'image_count': 39}, {'id': 419, 'image_count': 46}, {'id': 420, 'image_count': 1}, {'id': 421, 'image_count': 1451}, {'id': 422, 'image_count': 1878}, {'id': 423, 'image_count': 11}, {'id': 424, 'image_count': 82}, {'id': 425, 'image_count': 18}, {'id': 426, 'image_count': 1}, {'id': 427, 'image_count': 7}, {'id': 428, 'image_count': 3}, {'id': 429, 'image_count': 575}, {'id': 430, 'image_count': 1907}, {'id': 431, 'image_count': 8}, {'id': 432, 'image_count': 4}, {'id': 433, 'image_count': 32}, {'id': 434, 'image_count': 11}, {'id': 435, 'image_count': 4}, {'id': 436, 'image_count': 54}, {'id': 437, 'image_count': 202}, {'id': 438, 'image_count': 32}, {'id': 439, 'image_count': 3}, {'id': 440, 'image_count': 130}, {'id': 441, 'image_count': 119}, {'id': 442, 'image_count': 141}, {'id': 443, 'image_count': 29}, {'id': 444, 'image_count': 525}, {'id': 445, 'image_count': 1323}, {'id': 446, 'image_count': 2}, {'id': 447, 'image_count': 113}, {'id': 448, 'image_count': 16}, {'id': 449, 'image_count': 7}, {'id': 450, 'image_count': 35}, {'id': 451, 'image_count': 1908}, {'id': 452, 'image_count': 353}, {'id': 453, 'image_count': 18}, {'id': 454, 'image_count': 14}, {'id': 455, 'image_count': 77}, {'id': 456, 'image_count': 8}, {'id': 457, 'image_count': 37}, {'id': 458, 'image_count': 1}, {'id': 459, 'image_count': 346}, {'id': 460, 'image_count': 19}, {'id': 461, 'image_count': 1779}, {'id': 462, 'image_count': 23}, {'id': 463, 'image_count': 25}, {'id': 464, 'image_count': 67}, {'id': 465, 'image_count': 19}, {'id': 466, 'image_count': 28}, {'id': 467, 'image_count': 4}, {'id': 468, 'image_count': 27}, {'id': 469, 'image_count': 1861}, {'id': 470, 'image_count': 11}, {'id': 471, 'image_count': 13}, {'id': 472, 'image_count': 13}, {'id': 473, 'image_count': 32}, {'id': 474, 'image_count': 1767}, {'id': 475, 'image_count': 42}, {'id': 476, 'image_count': 17}, {'id': 477, 'image_count': 128}, {'id': 478, 'image_count': 1}, {'id': 479, 'image_count': 9}, {'id': 480, 'image_count': 10}, {'id': 481, 'image_count': 4}, {'id': 482, 'image_count': 9}, {'id': 483, 'image_count': 18}, {'id': 484, 'image_count': 41}, {'id': 485, 'image_count': 28}, {'id': 486, 'image_count': 3}, {'id': 487, 'image_count': 65}, {'id': 488, 'image_count': 9}, {'id': 489, 'image_count': 23}, {'id': 490, 'image_count': 24}, {'id': 491, 'image_count': 1}, {'id': 492, 'image_count': 2}, {'id': 493, 'image_count': 59}, {'id': 494, 'image_count': 48}, {'id': 495, 'image_count': 17}, {'id': 496, 'image_count': 1877}, {'id': 497, 'image_count': 18}, {'id': 498, 'image_count': 1920}, {'id': 499, 'image_count': 50}, {'id': 500, 'image_count': 1890}, {'id': 501, 'image_count': 99}, {'id': 502, 'image_count': 1530}, {'id': 503, 'image_count': 3}, {'id': 504, 'image_count': 11}, {'id': 505, 'image_count': 19}, {'id': 506, 'image_count': 3}, {'id': 507, 'image_count': 63}, {'id': 508, 'image_count': 5}, {'id': 509, 'image_count': 6}, {'id': 510, 'image_count': 233}, {'id': 511, 'image_count': 54}, {'id': 512, 'image_count': 36}, {'id': 513, 'image_count': 10}, {'id': 514, 'image_count': 124}, {'id': 515, 'image_count': 101}, {'id': 516, 'image_count': 3}, {'id': 517, 'image_count': 363}, {'id': 518, 'image_count': 3}, {'id': 519, 'image_count': 30}, {'id': 520, 'image_count': 18}, {'id': 521, 'image_count': 199}, {'id': 522, 'image_count': 97}, {'id': 523, 'image_count': 32}, {'id': 524, 'image_count': 121}, {'id': 525, 'image_count': 16}, {'id': 526, 'image_count': 12}, {'id': 527, 'image_count': 2}, {'id': 528, 'image_count': 214}, {'id': 529, 'image_count': 48}, {'id': 530, 'image_count': 26}, {'id': 531, 'image_count': 13}, {'id': 532, 'image_count': 4}, {'id': 533, 'image_count': 11}, {'id': 534, 'image_count': 123}, {'id': 535, 'image_count': 7}, {'id': 536, 'image_count': 200}, {'id': 537, 'image_count': 91}, {'id': 538, 'image_count': 9}, {'id': 539, 'image_count': 72}, {'id': 540, 'image_count': 1886}, {'id': 541, 'image_count': 4}, {'id': 542, 'image_count': 1}, {'id': 543, 'image_count': 1}, {'id': 544, 'image_count': 1932}, {'id': 545, 'image_count': 4}, {'id': 546, 'image_count': 56}, {'id': 547, 'image_count': 854}, {'id': 548, 'image_count': 755}, {'id': 549, 'image_count': 1843}, {'id': 550, 'image_count': 96}, {'id': 551, 'image_count': 7}, {'id': 552, 'image_count': 74}, {'id': 553, 'image_count': 66}, {'id': 554, 'image_count': 57}, {'id': 555, 'image_count': 44}, {'id': 556, 'image_count': 1905}, {'id': 557, 'image_count': 4}, {'id': 558, 'image_count': 90}, {'id': 559, 'image_count': 1635}, {'id': 560, 'image_count': 8}, {'id': 561, 'image_count': 5}, {'id': 562, 'image_count': 50}, {'id': 563, 'image_count': 545}, {'id': 564, 'image_count': 20}, {'id': 565, 'image_count': 193}, {'id': 566, 'image_count': 285}, {'id': 567, 'image_count': 3}, {'id': 568, 'image_count': 1}, {'id': 569, 'image_count': 1904}, {'id': 570, 'image_count': 294}, {'id': 571, 'image_count': 3}, {'id': 572, 'image_count': 5}, {'id': 573, 'image_count': 24}, {'id': 574, 'image_count': 2}, {'id': 575, 'image_count': 2}, {'id': 576, 'image_count': 16}, {'id': 577, 'image_count': 8}, {'id': 578, 'image_count': 154}, {'id': 579, 'image_count': 66}, {'id': 580, 'image_count': 1}, {'id': 581, 'image_count': 24}, {'id': 582, 'image_count': 1}, {'id': 583, 'image_count': 4}, {'id': 584, 'image_count': 75}, {'id': 585, 'image_count': 6}, {'id': 586, 'image_count': 126}, {'id': 587, 'image_count': 24}, {'id': 588, 'image_count': 22}, {'id': 589, 'image_count': 1872}, {'id': 590, 'image_count': 16}, {'id': 591, 'image_count': 423}, {'id': 592, 'image_count': 1927}, {'id': 593, 'image_count': 38}, {'id': 594, 'image_count': 3}, {'id': 595, 'image_count': 1945}, {'id': 596, 'image_count': 35}, {'id': 597, 'image_count': 1}, {'id': 598, 'image_count': 13}, {'id': 599, 'image_count': 9}, {'id': 600, 'image_count': 14}, {'id': 601, 'image_count': 37}, {'id': 602, 'image_count': 3}, {'id': 603, 'image_count': 4}, {'id': 604, 'image_count': 100}, {'id': 605, 'image_count': 195}, {'id': 606, 'image_count': 1}, {'id': 607, 'image_count': 12}, {'id': 608, 'image_count': 24}, {'id': 609, 'image_count': 489}, {'id': 610, 'image_count': 10}, {'id': 611, 'image_count': 1689}, {'id': 612, 'image_count': 42}, {'id': 613, 'image_count': 81}, {'id': 614, 'image_count': 894}, {'id': 615, 'image_count': 1868}, {'id': 616, 'image_count': 7}, {'id': 617, 'image_count': 1567}, {'id': 618, 'image_count': 10}, {'id': 619, 'image_count': 8}, {'id': 620, 'image_count': 7}, {'id': 621, 'image_count': 629}, {'id': 622, 'image_count': 89}, {'id': 623, 'image_count': 15}, {'id': 624, 'image_count': 134}, {'id': 625, 'image_count': 4}, {'id': 626, 'image_count': 1802}, {'id': 627, 'image_count': 595}, {'id': 628, 'image_count': 1210}, {'id': 629, 'image_count': 48}, {'id': 630, 'image_count': 418}, {'id': 631, 'image_count': 1846}, {'id': 632, 'image_count': 5}, {'id': 633, 'image_count': 221}, {'id': 634, 'image_count': 10}, {'id': 635, 'image_count': 7}, {'id': 636, 'image_count': 76}, {'id': 637, 'image_count': 22}, {'id': 638, 'image_count': 10}, {'id': 639, 'image_count': 341}, {'id': 640, 'image_count': 1}, {'id': 641, 'image_count': 705}, {'id': 642, 'image_count': 1900}, {'id': 643, 'image_count': 188}, {'id': 644, 'image_count': 227}, {'id': 645, 'image_count': 861}, {'id': 646, 'image_count': 6}, {'id': 647, 'image_count': 115}, {'id': 648, 'image_count': 5}, {'id': 649, 'image_count': 43}, {'id': 650, 'image_count': 14}, {'id': 651, 'image_count': 6}, {'id': 652, 'image_count': 15}, {'id': 653, 'image_count': 1167}, {'id': 654, 'image_count': 15}, {'id': 655, 'image_count': 994}, {'id': 656, 'image_count': 28}, {'id': 657, 'image_count': 2}, {'id': 658, 'image_count': 338}, {'id': 659, 'image_count': 334}, {'id': 660, 'image_count': 15}, {'id': 661, 'image_count': 102}, {'id': 662, 'image_count': 1}, {'id': 663, 'image_count': 8}, {'id': 664, 'image_count': 1}, {'id': 665, 'image_count': 1}, {'id': 666, 'image_count': 28}, {'id': 667, 'image_count': 91}, {'id': 668, 'image_count': 260}, {'id': 669, 'image_count': 131}, {'id': 670, 'image_count': 128}, {'id': 671, 'image_count': 3}, {'id': 672, 'image_count': 10}, {'id': 673, 'image_count': 39}, {'id': 674, 'image_count': 2}, {'id': 675, 'image_count': 925}, {'id': 676, 'image_count': 354}, {'id': 677, 'image_count': 31}, {'id': 678, 'image_count': 10}, {'id': 679, 'image_count': 215}, {'id': 680, 'image_count': 71}, {'id': 681, 'image_count': 43}, {'id': 682, 'image_count': 28}, {'id': 683, 'image_count': 34}, {'id': 684, 'image_count': 16}, {'id': 685, 'image_count': 273}, {'id': 686, 'image_count': 2}, {'id': 687, 'image_count': 999}, {'id': 688, 'image_count': 4}, {'id': 689, 'image_count': 107}, {'id': 690, 'image_count': 2}, {'id': 691, 'image_count': 1}, {'id': 692, 'image_count': 454}, {'id': 693, 'image_count': 9}, {'id': 694, 'image_count': 1901}, {'id': 695, 'image_count': 61}, {'id': 696, 'image_count': 91}, {'id': 697, 'image_count': 46}, {'id': 698, 'image_count': 1402}, {'id': 699, 'image_count': 74}, {'id': 700, 'image_count': 421}, {'id': 701, 'image_count': 226}, {'id': 702, 'image_count': 10}, {'id': 703, 'image_count': 1720}, {'id': 704, 'image_count': 261}, {'id': 705, 'image_count': 1337}, {'id': 706, 'image_count': 293}, {'id': 707, 'image_count': 62}, {'id': 708, 'image_count': 814}, {'id': 709, 'image_count': 407}, {'id': 710, 'image_count': 6}, {'id': 711, 'image_count': 16}, {'id': 712, 'image_count': 7}, {'id': 713, 'image_count': 1791}, {'id': 714, 'image_count': 2}, {'id': 715, 'image_count': 1915}, {'id': 716, 'image_count': 1940}, {'id': 717, 'image_count': 13}, {'id': 718, 'image_count': 16}, {'id': 719, 'image_count': 448}, {'id': 720, 'image_count': 12}, {'id': 721, 'image_count': 18}, {'id': 722, 'image_count': 4}, {'id': 723, 'image_count': 71}, {'id': 724, 'image_count': 189}, {'id': 725, 'image_count': 74}, {'id': 726, 'image_count': 103}, {'id': 727, 'image_count': 3}, {'id': 728, 'image_count': 110}, {'id': 729, 'image_count': 5}, {'id': 730, 'image_count': 9}, {'id': 731, 'image_count': 15}, {'id': 732, 'image_count': 25}, {'id': 733, 'image_count': 7}, {'id': 734, 'image_count': 647}, {'id': 735, 'image_count': 824}, {'id': 736, 'image_count': 100}, {'id': 737, 'image_count': 47}, {'id': 738, 'image_count': 121}, {'id': 739, 'image_count': 731}, {'id': 740, 'image_count': 73}, {'id': 741, 'image_count': 49}, {'id': 742, 'image_count': 23}, {'id': 743, 'image_count': 4}, {'id': 744, 'image_count': 62}, {'id': 745, 'image_count': 118}, {'id': 746, 'image_count': 99}, {'id': 747, 'image_count': 40}, {'id': 748, 'image_count': 1036}, {'id': 749, 'image_count': 105}, {'id': 750, 'image_count': 21}, {'id': 751, 'image_count': 229}, {'id': 752, 'image_count': 7}, {'id': 753, 'image_count': 72}, {'id': 754, 'image_count': 9}, {'id': 755, 'image_count': 10}, {'id': 756, 'image_count': 328}, {'id': 757, 'image_count': 468}, {'id': 758, 'image_count': 1}, {'id': 759, 'image_count': 2}, {'id': 760, 'image_count': 24}, {'id': 761, 'image_count': 11}, {'id': 762, 'image_count': 72}, {'id': 763, 'image_count': 17}, {'id': 764, 'image_count': 10}, {'id': 765, 'image_count': 17}, {'id': 766, 'image_count': 489}, {'id': 767, 'image_count': 47}, {'id': 768, 'image_count': 93}, {'id': 769, 'image_count': 1}, {'id': 770, 'image_count': 12}, {'id': 771, 'image_count': 228}, {'id': 772, 'image_count': 5}, {'id': 773, 'image_count': 76}, {'id': 774, 'image_count': 71}, {'id': 775, 'image_count': 30}, {'id': 776, 'image_count': 109}, {'id': 777, 'image_count': 14}, {'id': 778, 'image_count': 1}, {'id': 779, 'image_count': 8}, {'id': 780, 'image_count': 26}, {'id': 781, 'image_count': 339}, {'id': 782, 'image_count': 153}, {'id': 783, 'image_count': 2}, {'id': 784, 'image_count': 3}, {'id': 785, 'image_count': 8}, {'id': 786, 'image_count': 47}, {'id': 787, 'image_count': 8}, {'id': 788, 'image_count': 6}, {'id': 789, 'image_count': 116}, {'id': 790, 'image_count': 69}, {'id': 791, 'image_count': 13}, {'id': 792, 'image_count': 6}, {'id': 793, 'image_count': 1928}, {'id': 794, 'image_count': 79}, {'id': 795, 'image_count': 14}, {'id': 796, 'image_count': 7}, {'id': 797, 'image_count': 20}, {'id': 798, 'image_count': 114}, {'id': 799, 'image_count': 221}, {'id': 800, 'image_count': 502}, {'id': 801, 'image_count': 62}, {'id': 802, 'image_count': 87}, {'id': 803, 'image_count': 4}, {'id': 804, 'image_count': 1912}, {'id': 805, 'image_count': 7}, {'id': 806, 'image_count': 186}, {'id': 807, 'image_count': 18}, {'id': 808, 'image_count': 4}, {'id': 809, 'image_count': 3}, {'id': 810, 'image_count': 7}, {'id': 811, 'image_count': 1413}, {'id': 812, 'image_count': 7}, {'id': 813, 'image_count': 12}, {'id': 814, 'image_count': 248}, {'id': 815, 'image_count': 4}, {'id': 816, 'image_count': 1881}, {'id': 817, 'image_count': 529}, {'id': 818, 'image_count': 1932}, {'id': 819, 'image_count': 50}, {'id': 820, 'image_count': 3}, {'id': 821, 'image_count': 28}, {'id': 822, 'image_count': 10}, {'id': 823, 'image_count': 5}, {'id': 824, 'image_count': 5}, {'id': 825, 'image_count': 18}, {'id': 826, 'image_count': 14}, {'id': 827, 'image_count': 1890}, {'id': 828, 'image_count': 660}, {'id': 829, 'image_count': 8}, {'id': 830, 'image_count': 25}, {'id': 831, 'image_count': 10}, {'id': 832, 'image_count': 218}, {'id': 833, 'image_count': 36}, {'id': 834, 'image_count': 16}, {'id': 835, 'image_count': 808}, {'id': 836, 'image_count': 479}, {'id': 837, 'image_count': 1404}, {'id': 838, 'image_count': 307}, {'id': 839, 'image_count': 57}, {'id': 840, 'image_count': 28}, {'id': 841, 'image_count': 80}, {'id': 842, 'image_count': 11}, {'id': 843, 'image_count': 92}, {'id': 844, 'image_count': 20}, {'id': 845, 'image_count': 194}, {'id': 846, 'image_count': 23}, {'id': 847, 'image_count': 52}, {'id': 848, 'image_count': 673}, {'id': 849, 'image_count': 2}, {'id': 850, 'image_count': 2}, {'id': 851, 'image_count': 1}, {'id': 852, 'image_count': 2}, {'id': 853, 'image_count': 8}, {'id': 854, 'image_count': 80}, {'id': 855, 'image_count': 3}, {'id': 856, 'image_count': 3}, {'id': 857, 'image_count': 15}, {'id': 858, 'image_count': 2}, {'id': 859, 'image_count': 10}, {'id': 860, 'image_count': 386}, {'id': 861, 'image_count': 65}, {'id': 862, 'image_count': 3}, {'id': 863, 'image_count': 35}, {'id': 864, 'image_count': 5}, {'id': 865, 'image_count': 180}, {'id': 866, 'image_count': 99}, {'id': 867, 'image_count': 49}, {'id': 868, 'image_count': 28}, {'id': 869, 'image_count': 1}, {'id': 870, 'image_count': 52}, {'id': 871, 'image_count': 36}, {'id': 872, 'image_count': 70}, {'id': 873, 'image_count': 6}, {'id': 874, 'image_count': 29}, {'id': 875, 'image_count': 24}, {'id': 876, 'image_count': 1115}, {'id': 877, 'image_count': 61}, {'id': 878, 'image_count': 18}, {'id': 879, 'image_count': 18}, {'id': 880, 'image_count': 665}, {'id': 881, 'image_count': 1096}, {'id': 882, 'image_count': 29}, {'id': 883, 'image_count': 8}, {'id': 884, 'image_count': 14}, {'id': 885, 'image_count': 1622}, {'id': 886, 'image_count': 2}, {'id': 887, 'image_count': 3}, {'id': 888, 'image_count': 32}, {'id': 889, 'image_count': 55}, {'id': 890, 'image_count': 1}, {'id': 891, 'image_count': 10}, {'id': 892, 'image_count': 10}, {'id': 893, 'image_count': 47}, {'id': 894, 'image_count': 3}, {'id': 895, 'image_count': 29}, {'id': 896, 'image_count': 342}, {'id': 897, 'image_count': 25}, {'id': 898, 'image_count': 1469}, {'id': 899, 'image_count': 521}, {'id': 900, 'image_count': 347}, {'id': 901, 'image_count': 35}, {'id': 902, 'image_count': 7}, {'id': 903, 'image_count': 207}, {'id': 904, 'image_count': 108}, {'id': 905, 'image_count': 2}, {'id': 906, 'image_count': 34}, {'id': 907, 'image_count': 12}, {'id': 908, 'image_count': 10}, {'id': 909, 'image_count': 13}, {'id': 910, 'image_count': 361}, {'id': 911, 'image_count': 1023}, {'id': 912, 'image_count': 782}, {'id': 913, 'image_count': 2}, {'id': 914, 'image_count': 5}, {'id': 915, 'image_count': 247}, {'id': 916, 'image_count': 221}, {'id': 917, 'image_count': 4}, {'id': 918, 'image_count': 8}, {'id': 919, 'image_count': 158}, {'id': 920, 'image_count': 3}, {'id': 921, 'image_count': 752}, {'id': 922, 'image_count': 64}, {'id': 923, 'image_count': 707}, {'id': 924, 'image_count': 143}, {'id': 925, 'image_count': 1}, {'id': 926, 'image_count': 49}, {'id': 927, 'image_count': 126}, {'id': 928, 'image_count': 76}, {'id': 929, 'image_count': 11}, {'id': 930, 'image_count': 11}, {'id': 931, 'image_count': 4}, {'id': 932, 'image_count': 39}, {'id': 933, 'image_count': 11}, {'id': 934, 'image_count': 13}, {'id': 935, 'image_count': 91}, {'id': 936, 'image_count': 14}, {'id': 937, 'image_count': 5}, {'id': 938, 'image_count': 3}, {'id': 939, 'image_count': 10}, {'id': 940, 'image_count': 18}, {'id': 941, 'image_count': 9}, {'id': 942, 'image_count': 6}, {'id': 943, 'image_count': 951}, {'id': 944, 'image_count': 2}, {'id': 945, 'image_count': 1}, {'id': 946, 'image_count': 19}, {'id': 947, 'image_count': 1942}, {'id': 948, 'image_count': 1916}, {'id': 949, 'image_count': 139}, {'id': 950, 'image_count': 43}, {'id': 951, 'image_count': 1969}, {'id': 952, 'image_count': 5}, {'id': 953, 'image_count': 134}, {'id': 954, 'image_count': 74}, {'id': 955, 'image_count': 381}, {'id': 956, 'image_count': 1}, {'id': 957, 'image_count': 381}, {'id': 958, 'image_count': 6}, {'id': 959, 'image_count': 1826}, {'id': 960, 'image_count': 28}, {'id': 961, 'image_count': 1635}, {'id': 962, 'image_count': 1967}, {'id': 963, 'image_count': 16}, {'id': 964, 'image_count': 1926}, {'id': 965, 'image_count': 1789}, {'id': 966, 'image_count': 401}, {'id': 967, 'image_count': 1968}, {'id': 968, 'image_count': 1167}, {'id': 969, 'image_count': 1}, {'id': 970, 'image_count': 56}, {'id': 971, 'image_count': 17}, {'id': 972, 'image_count': 1}, {'id': 973, 'image_count': 58}, {'id': 974, 'image_count': 9}, {'id': 975, 'image_count': 8}, {'id': 976, 'image_count': 1124}, {'id': 977, 'image_count': 31}, {'id': 978, 'image_count': 16}, {'id': 979, 'image_count': 491}, {'id': 980, 'image_count': 432}, {'id': 981, 'image_count': 1945}, {'id': 982, 'image_count': 1899}, {'id': 983, 'image_count': 5}, {'id': 984, 'image_count': 28}, {'id': 985, 'image_count': 7}, {'id': 986, 'image_count': 146}, {'id': 987, 'image_count': 1}, {'id': 988, 'image_count': 25}, {'id': 989, 'image_count': 22}, {'id': 990, 'image_count': 1}, {'id': 991, 'image_count': 10}, {'id': 992, 'image_count': 9}, {'id': 993, 'image_count': 308}, {'id': 994, 'image_count': 4}, {'id': 995, 'image_count': 1969}, {'id': 996, 'image_count': 45}, {'id': 997, 'image_count': 12}, {'id': 998, 'image_count': 1}, {'id': 999, 'image_count': 85}, {'id': 1000, 'image_count': 1127}, {'id': 1001, 'image_count': 11}, {'id': 1002, 'image_count': 60}, {'id': 1003, 'image_count': 1}, {'id': 1004, 'image_count': 16}, {'id': 1005, 'image_count': 1}, {'id': 1006, 'image_count': 65}, {'id': 1007, 'image_count': 13}, {'id': 1008, 'image_count': 655}, {'id': 1009, 'image_count': 51}, {'id': 1010, 'image_count': 1}, {'id': 1011, 'image_count': 673}, {'id': 1012, 'image_count': 5}, {'id': 1013, 'image_count': 36}, {'id': 1014, 'image_count': 54}, {'id': 1015, 'image_count': 5}, {'id': 1016, 'image_count': 8}, {'id': 1017, 'image_count': 305}, {'id': 1018, 'image_count': 297}, {'id': 1019, 'image_count': 1053}, {'id': 1020, 'image_count': 223}, {'id': 1021, 'image_count': 1037}, {'id': 1022, 'image_count': 63}, {'id': 1023, 'image_count': 1881}, {'id': 1024, 'image_count': 507}, {'id': 1025, 'image_count': 333}, {'id': 1026, 'image_count': 1911}, {'id': 1027, 'image_count': 1765}, {'id': 1028, 'image_count': 1}, {'id': 1029, 'image_count': 5}, {'id': 1030, 'image_count': 1}, {'id': 1031, 'image_count': 9}, {'id': 1032, 'image_count': 2}, {'id': 1033, 'image_count': 151}, {'id': 1034, 'image_count': 82}, {'id': 1035, 'image_count': 1931}, {'id': 1036, 'image_count': 41}, {'id': 1037, 'image_count': 1895}, {'id': 1038, 'image_count': 24}, {'id': 1039, 'image_count': 22}, {'id': 1040, 'image_count': 35}, {'id': 1041, 'image_count': 69}, {'id': 1042, 'image_count': 962}, {'id': 1043, 'image_count': 588}, {'id': 1044, 'image_count': 21}, {'id': 1045, 'image_count': 825}, {'id': 1046, 'image_count': 52}, {'id': 1047, 'image_count': 5}, {'id': 1048, 'image_count': 5}, {'id': 1049, 'image_count': 5}, {'id': 1050, 'image_count': 1860}, {'id': 1051, 'image_count': 56}, {'id': 1052, 'image_count': 1582}, {'id': 1053, 'image_count': 7}, {'id': 1054, 'image_count': 2}, {'id': 1055, 'image_count': 1562}, {'id': 1056, 'image_count': 1885}, {'id': 1057, 'image_count': 1}, {'id': 1058, 'image_count': 5}, {'id': 1059, 'image_count': 137}, {'id': 1060, 'image_count': 1094}, {'id': 1061, 'image_count': 134}, {'id': 1062, 'image_count': 29}, {'id': 1063, 'image_count': 22}, {'id': 1064, 'image_count': 522}, {'id': 1065, 'image_count': 50}, {'id': 1066, 'image_count': 68}, {'id': 1067, 'image_count': 16}, {'id': 1068, 'image_count': 40}, {'id': 1069, 'image_count': 35}, {'id': 1070, 'image_count': 135}, {'id': 1071, 'image_count': 1413}, {'id': 1072, 'image_count': 772}, {'id': 1073, 'image_count': 50}, {'id': 1074, 'image_count': 1015}, {'id': 1075, 'image_count': 1}, {'id': 1076, 'image_count': 65}, {'id': 1077, 'image_count': 1900}, {'id': 1078, 'image_count': 1302}, {'id': 1079, 'image_count': 1977}, {'id': 1080, 'image_count': 2}, {'id': 1081, 'image_count': 29}, {'id': 1082, 'image_count': 36}, {'id': 1083, 'image_count': 138}, {'id': 1084, 'image_count': 4}, {'id': 1085, 'image_count': 67}, {'id': 1086, 'image_count': 26}, {'id': 1087, 'image_count': 25}, {'id': 1088, 'image_count': 33}, {'id': 1089, 'image_count': 37}, {'id': 1090, 'image_count': 50}, {'id': 1091, 'image_count': 270}, {'id': 1092, 'image_count': 12}, {'id': 1093, 'image_count': 316}, {'id': 1094, 'image_count': 41}, {'id': 1095, 'image_count': 224}, {'id': 1096, 'image_count': 105}, {'id': 1097, 'image_count': 1925}, {'id': 1098, 'image_count': 1021}, {'id': 1099, 'image_count': 1213}, {'id': 1100, 'image_count': 172}, {'id': 1101, 'image_count': 28}, {'id': 1102, 'image_count': 745}, {'id': 1103, 'image_count': 187}, {'id': 1104, 'image_count': 147}, {'id': 1105, 'image_count': 136}, {'id': 1106, 'image_count': 34}, {'id': 1107, 'image_count': 41}, {'id': 1108, 'image_count': 636}, {'id': 1109, 'image_count': 570}, {'id': 1110, 'image_count': 1149}, {'id': 1111, 'image_count': 61}, {'id': 1112, 'image_count': 1890}, {'id': 1113, 'image_count': 18}, {'id': 1114, 'image_count': 143}, {'id': 1115, 'image_count': 1517}, {'id': 1116, 'image_count': 7}, {'id': 1117, 'image_count': 943}, {'id': 1118, 'image_count': 6}, {'id': 1119, 'image_count': 1}, {'id': 1120, 'image_count': 11}, {'id': 1121, 'image_count': 101}, {'id': 1122, 'image_count': 1909}, {'id': 1123, 'image_count': 800}, {'id': 1124, 'image_count': 1}, {'id': 1125, 'image_count': 44}, {'id': 1126, 'image_count': 3}, {'id': 1127, 'image_count': 44}, {'id': 1128, 'image_count': 31}, {'id': 1129, 'image_count': 7}, {'id': 1130, 'image_count': 20}, {'id': 1131, 'image_count': 11}, {'id': 1132, 'image_count': 13}, {'id': 1133, 'image_count': 1924}, {'id': 1134, 'image_count': 113}, {'id': 1135, 'image_count': 2}, {'id': 1136, 'image_count': 139}, {'id': 1137, 'image_count': 12}, {'id': 1138, 'image_count': 37}, {'id': 1139, 'image_count': 1866}, {'id': 1140, 'image_count': 47}, {'id': 1141, 'image_count': 1468}, {'id': 1142, 'image_count': 729}, {'id': 1143, 'image_count': 24}, {'id': 1144, 'image_count': 1}, {'id': 1145, 'image_count': 10}, {'id': 1146, 'image_count': 3}, {'id': 1147, 'image_count': 14}, {'id': 1148, 'image_count': 4}, {'id': 1149, 'image_count': 29}, {'id': 1150, 'image_count': 4}, {'id': 1151, 'image_count': 70}, {'id': 1152, 'image_count': 46}, {'id': 1153, 'image_count': 14}, {'id': 1154, 'image_count': 48}, {'id': 1155, 'image_count': 1855}, {'id': 1156, 'image_count': 113}, {'id': 1157, 'image_count': 1}, {'id': 1158, 'image_count': 1}, {'id': 1159, 'image_count': 10}, {'id': 1160, 'image_count': 54}, {'id': 1161, 'image_count': 1923}, {'id': 1162, 'image_count': 630}, {'id': 1163, 'image_count': 31}, {'id': 1164, 'image_count': 69}, {'id': 1165, 'image_count': 7}, {'id': 1166, 'image_count': 11}, {'id': 1167, 'image_count': 1}, {'id': 1168, 'image_count': 30}, {'id': 1169, 'image_count': 50}, {'id': 1170, 'image_count': 45}, {'id': 1171, 'image_count': 28}, {'id': 1172, 'image_count': 114}, {'id': 1173, 'image_count': 193}, {'id': 1174, 'image_count': 21}, {'id': 1175, 'image_count': 91}, {'id': 1176, 'image_count': 31}, {'id': 1177, 'image_count': 1469}, {'id': 1178, 'image_count': 1924}, {'id': 1179, 'image_count': 87}, {'id': 1180, 'image_count': 77}, {'id': 1181, 'image_count': 11}, {'id': 1182, 'image_count': 47}, {'id': 1183, 'image_count': 21}, {'id': 1184, 'image_count': 47}, {'id': 1185, 'image_count': 70}, {'id': 1186, 'image_count': 1838}, {'id': 1187, 'image_count': 19}, {'id': 1188, 'image_count': 531}, {'id': 1189, 'image_count': 11}, {'id': 1190, 'image_count': 941}, {'id': 1191, 'image_count': 113}, {'id': 1192, 'image_count': 26}, {'id': 1193, 'image_count': 5}, {'id': 1194, 'image_count': 56}, {'id': 1195, 'image_count': 73}, {'id': 1196, 'image_count': 32}, {'id': 1197, 'image_count': 128}, {'id': 1198, 'image_count': 623}, {'id': 1199, 'image_count': 12}, {'id': 1200, 'image_count': 52}, {'id': 1201, 'image_count': 11}, {'id': 1202, 'image_count': 1674}, {'id': 1203, 'image_count': 81}] # noqa # fmt: on ================================================ FILE: annotator/oneformer/detectron2/data/datasets/pascal_voc.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import numpy as np import os import xml.etree.ElementTree as ET from typing import List, Tuple, Union from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.structures import BoxMode from annotator.oneformer.detectron2.utils.file_io import PathManager __all__ = ["load_voc_instances", "register_pascal_voc"] # fmt: off CLASS_NAMES = ( "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor" ) # fmt: on def load_voc_instances(dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]): """ Load Pascal VOC detection annotations to Detectron2 format. Args: dirname: Contain "Annotations", "ImageSets", "JPEGImages" split (str): one of "train", "test", "val", "trainval" class_names: list or tuple of class names """ with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f: fileids = np.loadtxt(f, dtype=np.str) # Needs to read many small annotation files. Makes sense at local annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/")) dicts = [] for fileid in fileids: anno_file = os.path.join(annotation_dirname, fileid + ".xml") jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg") with PathManager.open(anno_file) as f: tree = ET.parse(f) r = { "file_name": jpeg_file, "image_id": fileid, "height": int(tree.findall("./size/height")[0].text), "width": int(tree.findall("./size/width")[0].text), } instances = [] for obj in tree.findall("object"): cls = obj.find("name").text # We include "difficult" samples in training. # Based on limited experiments, they don't hurt accuracy. # difficult = int(obj.find("difficult").text) # if difficult == 1: # continue bbox = obj.find("bndbox") bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]] # Original annotations are integers in the range [1, W or H] # Assuming they mean 1-based pixel indices (inclusive), # a box with annotation (xmin=1, xmax=W) covers the whole image. # In coordinate space this is represented by (xmin=0, xmax=W) bbox[0] -= 1.0 bbox[1] -= 1.0 instances.append( {"category_id": class_names.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS} ) r["annotations"] = instances dicts.append(r) return dicts def register_pascal_voc(name, dirname, split, year, class_names=CLASS_NAMES): DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split, class_names)) MetadataCatalog.get(name).set( thing_classes=list(class_names), dirname=dirname, year=year, split=split ) ================================================ FILE: annotator/oneformer/detectron2/data/datasets/register_coco.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .coco import register_coco_instances # noqa from .coco_panoptic import register_coco_panoptic_separated # noqa ================================================ FILE: annotator/oneformer/detectron2/data/detection_utils.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. """ Common data processing utilities that are used in a typical object detection data pipeline. """ import logging import numpy as np from typing import List, Union import annotator.oneformer.pycocotools.mask as mask_util import torch from PIL import Image from annotator.oneformer.detectron2.structures import ( BitMasks, Boxes, BoxMode, Instances, Keypoints, PolygonMasks, RotatedBoxes, polygons_to_bitmask, ) from annotator.oneformer.detectron2.utils.file_io import PathManager from . import transforms as T from .catalog import MetadataCatalog __all__ = [ "SizeMismatchError", "convert_image_to_rgb", "check_image_size", "transform_proposals", "transform_instance_annotations", "annotations_to_instances", "annotations_to_instances_rotated", "build_augmentation", "build_transform_gen", "create_keypoint_hflip_indices", "filter_empty_instances", "read_image", ] class SizeMismatchError(ValueError): """ When loaded image has difference width/height compared with annotation. """ # https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601 _M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]] _M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]] # https://www.exiv2.org/tags.html _EXIF_ORIENT = 274 # exif 'Orientation' tag def convert_PIL_to_numpy(image, format): """ Convert PIL image to numpy array of target format. Args: image (PIL.Image): a PIL image format (str): the format of output image Returns: (np.ndarray): also see `read_image` """ if format is not None: # PIL only supports RGB, so convert to RGB and flip channels over below conversion_format = format if format in ["BGR", "YUV-BT.601"]: conversion_format = "RGB" image = image.convert(conversion_format) image = np.asarray(image) # PIL squeezes out the channel dimension for "L", so make it HWC if format == "L": image = np.expand_dims(image, -1) # handle formats not supported by PIL elif format == "BGR": # flip channels if needed image = image[:, :, ::-1] elif format == "YUV-BT.601": image = image / 255.0 image = np.dot(image, np.array(_M_RGB2YUV).T) return image def convert_image_to_rgb(image, format): """ Convert an image from given format to RGB. Args: image (np.ndarray or Tensor): an HWC image format (str): the format of input image, also see `read_image` Returns: (np.ndarray): (H,W,3) RGB image in 0-255 range, can be either float or uint8 """ if isinstance(image, torch.Tensor): image = image.cpu().numpy() if format == "BGR": image = image[:, :, [2, 1, 0]] elif format == "YUV-BT.601": image = np.dot(image, np.array(_M_YUV2RGB).T) image = image * 255.0 else: if format == "L": image = image[:, :, 0] image = image.astype(np.uint8) image = np.asarray(Image.fromarray(image, mode=format).convert("RGB")) return image def _apply_exif_orientation(image): """ Applies the exif orientation correctly. This code exists per the bug: https://github.com/python-pillow/Pillow/issues/3973 with the function `ImageOps.exif_transpose`. The Pillow source raises errors with various methods, especially `tobytes` Function based on: https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59 https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527 Args: image (PIL.Image): a PIL image Returns: (PIL.Image): the PIL image with exif orientation applied, if applicable """ if not hasattr(image, "getexif"): return image try: exif = image.getexif() except Exception: # https://github.com/facebookresearch/detectron2/issues/1885 exif = None if exif is None: return image orientation = exif.get(_EXIF_ORIENT) method = { 2: Image.FLIP_LEFT_RIGHT, 3: Image.ROTATE_180, 4: Image.FLIP_TOP_BOTTOM, 5: Image.TRANSPOSE, 6: Image.ROTATE_270, 7: Image.TRANSVERSE, 8: Image.ROTATE_90, }.get(orientation) if method is not None: return image.transpose(method) return image def read_image(file_name, format=None): """ Read an image into the given format. Will apply rotation and flipping if the image has such exif information. Args: file_name (str): image file path format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601". Returns: image (np.ndarray): an HWC image in the given format, which is 0-255, uint8 for supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601. """ with PathManager.open(file_name, "rb") as f: image = Image.open(f) # work around this bug: https://github.com/python-pillow/Pillow/issues/3973 image = _apply_exif_orientation(image) return convert_PIL_to_numpy(image, format) def check_image_size(dataset_dict, image): """ Raise an error if the image does not match the size specified in the dict. """ if "width" in dataset_dict or "height" in dataset_dict: image_wh = (image.shape[1], image.shape[0]) expected_wh = (dataset_dict["width"], dataset_dict["height"]) if not image_wh == expected_wh: raise SizeMismatchError( "Mismatched image shape{}, got {}, expect {}.".format( " for image " + dataset_dict["file_name"] if "file_name" in dataset_dict else "", image_wh, expected_wh, ) + " Please check the width/height in your annotation." ) # To ensure bbox always remap to original image size if "width" not in dataset_dict: dataset_dict["width"] = image.shape[1] if "height" not in dataset_dict: dataset_dict["height"] = image.shape[0] def transform_proposals(dataset_dict, image_shape, transforms, *, proposal_topk, min_box_size=0): """ Apply transformations to the proposals in dataset_dict, if any. Args: dataset_dict (dict): a dict read from the dataset, possibly contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode" image_shape (tuple): height, width transforms (TransformList): proposal_topk (int): only keep top-K scoring proposals min_box_size (int): proposals with either side smaller than this threshold are removed The input dict is modified in-place, with abovementioned keys removed. A new key "proposals" will be added. Its value is an `Instances` object which contains the transformed proposals in its field "proposal_boxes" and "objectness_logits". """ if "proposal_boxes" in dataset_dict: # Transform proposal boxes boxes = transforms.apply_box( BoxMode.convert( dataset_dict.pop("proposal_boxes"), dataset_dict.pop("proposal_bbox_mode"), BoxMode.XYXY_ABS, ) ) boxes = Boxes(boxes) objectness_logits = torch.as_tensor( dataset_dict.pop("proposal_objectness_logits").astype("float32") ) boxes.clip(image_shape) keep = boxes.nonempty(threshold=min_box_size) boxes = boxes[keep] objectness_logits = objectness_logits[keep] proposals = Instances(image_shape) proposals.proposal_boxes = boxes[:proposal_topk] proposals.objectness_logits = objectness_logits[:proposal_topk] dataset_dict["proposals"] = proposals def get_bbox(annotation): """ Get bbox from data Args: annotation (dict): dict of instance annotations for a single instance. Returns: bbox (ndarray): x1, y1, x2, y2 coordinates """ # bbox is 1d (per-instance bounding box) bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) return bbox def transform_instance_annotations( annotation, transforms, image_size, *, keypoint_hflip_indices=None ): """ Apply transforms to box, segmentation and keypoints annotations of a single instance. It will use `transforms.apply_box` for the box, and `transforms.apply_coords` for segmentation polygons & keypoints. If you need anything more specially designed for each data structure, you'll need to implement your own version of this function or the transforms. Args: annotation (dict): dict of instance annotations for a single instance. It will be modified in-place. transforms (TransformList or list[Transform]): image_size (tuple): the height, width of the transformed image keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. Returns: dict: the same input dict with fields "bbox", "segmentation", "keypoints" transformed according to `transforms`. The "bbox_mode" field will be set to XYXY_ABS. """ if isinstance(transforms, (tuple, list)): transforms = T.TransformList(transforms) # bbox is 1d (per-instance bounding box) bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) # clip transformed bbox to image size bbox = transforms.apply_box(np.array([bbox]))[0].clip(min=0) annotation["bbox"] = np.minimum(bbox, list(image_size + image_size)[::-1]) annotation["bbox_mode"] = BoxMode.XYXY_ABS if "segmentation" in annotation: # each instance contains 1 or more polygons segm = annotation["segmentation"] if isinstance(segm, list): # polygons polygons = [np.asarray(p).reshape(-1, 2) for p in segm] annotation["segmentation"] = [ p.reshape(-1) for p in transforms.apply_polygons(polygons) ] elif isinstance(segm, dict): # RLE mask = mask_util.decode(segm) mask = transforms.apply_segmentation(mask) assert tuple(mask.shape[:2]) == image_size annotation["segmentation"] = mask else: raise ValueError( "Cannot transform segmentation of type '{}'!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict.".format(type(segm)) ) if "keypoints" in annotation: keypoints = transform_keypoint_annotations( annotation["keypoints"], transforms, image_size, keypoint_hflip_indices ) annotation["keypoints"] = keypoints return annotation def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None): """ Transform keypoint annotations of an image. If a keypoint is transformed out of image boundary, it will be marked "unlabeled" (visibility=0) Args: keypoints (list[float]): Nx3 float in Detectron2's Dataset format. Each point is represented by (x, y, visibility). transforms (TransformList): image_size (tuple): the height, width of the transformed image keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. When `transforms` includes horizontal flip, will use the index mapping to flip keypoints. """ # (N*3,) -> (N, 3) keypoints = np.asarray(keypoints, dtype="float64").reshape(-1, 3) keypoints_xy = transforms.apply_coords(keypoints[:, :2]) # Set all out-of-boundary points to "unlabeled" inside = (keypoints_xy >= np.array([0, 0])) & (keypoints_xy <= np.array(image_size[::-1])) inside = inside.all(axis=1) keypoints[:, :2] = keypoints_xy keypoints[:, 2][~inside] = 0 # This assumes that HorizFlipTransform is the only one that does flip do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1 # Alternative way: check if probe points was horizontally flipped. # probe = np.asarray([[0.0, 0.0], [image_width, 0.0]]) # probe_aug = transforms.apply_coords(probe.copy()) # do_hflip = np.sign(probe[1][0] - probe[0][0]) != np.sign(probe_aug[1][0] - probe_aug[0][0]) # noqa # If flipped, swap each keypoint with its opposite-handed equivalent if do_hflip: if keypoint_hflip_indices is None: raise ValueError("Cannot flip keypoints without providing flip indices!") if len(keypoints) != len(keypoint_hflip_indices): raise ValueError( "Keypoint data has {} points, but metadata " "contains {} points!".format(len(keypoints), len(keypoint_hflip_indices)) ) keypoints = keypoints[np.asarray(keypoint_hflip_indices, dtype=np.int32), :] # Maintain COCO convention that if visibility == 0 (unlabeled), then x, y = 0 keypoints[keypoints[:, 2] == 0] = 0 return keypoints def annotations_to_instances(annos, image_size, mask_format="polygon"): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: It will contain fields "gt_boxes", "gt_classes", "gt_masks", "gt_keypoints", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = ( np.stack( [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] ) if len(annos) else np.zeros((0, 4)) ) target = Instances(image_size) target.gt_boxes = Boxes(boxes) classes = [int(obj["category_id"]) for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes if len(annos) and "segmentation" in annos[0]: segms = [obj["segmentation"] for obj in annos] if mask_format == "polygon": try: masks = PolygonMasks(segms) except ValueError as e: raise ValueError( "Failed to use mask_format=='polygon' from the given annotations!" ) from e else: assert mask_format == "bitmask", mask_format masks = [] for segm in segms: if isinstance(segm, list): # polygon masks.append(polygons_to_bitmask(segm, *image_size)) elif isinstance(segm, dict): # COCO RLE masks.append(mask_util.decode(segm)) elif isinstance(segm, np.ndarray): assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( segm.ndim ) # mask array masks.append(segm) else: raise ValueError( "Cannot convert segmentation of type '{}' to BitMasks!" "Supported types are: polygons as list[list[float] or ndarray]," " COCO-style RLE as a dict, or a binary segmentation mask " " in a 2D numpy array of shape HxW.".format(type(segm)) ) # torch.from_numpy does not support array with negative stride. masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks]) ) target.gt_masks = masks if len(annos) and "keypoints" in annos[0]: kpts = [obj.get("keypoints", []) for obj in annos] target.gt_keypoints = Keypoints(kpts) return target def annotations_to_instances_rotated(annos, image_size): """ Create an :class:`Instances` object used by the models, from instance annotations in the dataset dict. Compared to `annotations_to_instances`, this function is for rotated boxes only Args: annos (list[dict]): a list of instance annotations in one image, each element for one instance. image_size (tuple): height, width Returns: Instances: Containing fields "gt_boxes", "gt_classes", if they can be obtained from `annos`. This is the format that builtin models expect. """ boxes = [obj["bbox"] for obj in annos] target = Instances(image_size) boxes = target.gt_boxes = RotatedBoxes(boxes) boxes.clip(image_size) classes = [obj["category_id"] for obj in annos] classes = torch.tensor(classes, dtype=torch.int64) target.gt_classes = classes return target def filter_empty_instances( instances, by_box=True, by_mask=True, box_threshold=1e-5, return_mask=False ): """ Filter out empty instances in an `Instances` object. Args: instances (Instances): by_box (bool): whether to filter out instances with empty boxes by_mask (bool): whether to filter out instances with empty masks box_threshold (float): minimum width and height to be considered non-empty return_mask (bool): whether to return boolean mask of filtered instances Returns: Instances: the filtered instances. tensor[bool], optional: boolean mask of filtered instances """ assert by_box or by_mask r = [] if by_box: r.append(instances.gt_boxes.nonempty(threshold=box_threshold)) if instances.has("gt_masks") and by_mask: r.append(instances.gt_masks.nonempty()) # TODO: can also filter visible keypoints if not r: return instances m = r[0] for x in r[1:]: m = m & x if return_mask: return instances[m], m return instances[m] def create_keypoint_hflip_indices(dataset_names: Union[str, List[str]]) -> List[int]: """ Args: dataset_names: list of dataset names Returns: list[int]: a list of size=#keypoints, storing the horizontally-flipped keypoint indices. """ if isinstance(dataset_names, str): dataset_names = [dataset_names] check_metadata_consistency("keypoint_names", dataset_names) check_metadata_consistency("keypoint_flip_map", dataset_names) meta = MetadataCatalog.get(dataset_names[0]) names = meta.keypoint_names # TODO flip -> hflip flip_map = dict(meta.keypoint_flip_map) flip_map.update({v: k for k, v in flip_map.items()}) flipped_names = [i if i not in flip_map else flip_map[i] for i in names] flip_indices = [names.index(i) for i in flipped_names] return flip_indices def get_fed_loss_cls_weights(dataset_names: Union[str, List[str]], freq_weight_power=1.0): """ Get frequency weight for each class sorted by class id. We now calcualte freqency weight using image_count to the power freq_weight_power. Args: dataset_names: list of dataset names freq_weight_power: power value """ if isinstance(dataset_names, str): dataset_names = [dataset_names] check_metadata_consistency("class_image_count", dataset_names) meta = MetadataCatalog.get(dataset_names[0]) class_freq_meta = meta.class_image_count class_freq = torch.tensor( [c["image_count"] for c in sorted(class_freq_meta, key=lambda x: x["id"])] ) class_freq_weight = class_freq.float() ** freq_weight_power return class_freq_weight def gen_crop_transform_with_instance(crop_size, image_size, instance): """ Generate a CropTransform so that the cropping region contains the center of the given instance. Args: crop_size (tuple): h, w in pixels image_size (tuple): h, w instance (dict): an annotation dict of one instance, in Detectron2's dataset format. """ crop_size = np.asarray(crop_size, dtype=np.int32) bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS) center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5 assert ( image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1] ), "The annotation bounding box is outside of the image!" assert ( image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1] ), "Crop size is larger than image size!" min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0) max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0) max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32)) y0 = np.random.randint(min_yx[0], max_yx[0] + 1) x0 = np.random.randint(min_yx[1], max_yx[1] + 1) return T.CropTransform(x0, y0, crop_size[1], crop_size[0]) def check_metadata_consistency(key, dataset_names): """ Check that the datasets have consistent metadata. Args: key (str): a metadata key dataset_names (list[str]): a list of dataset names Raises: AttributeError: if the key does not exist in the metadata ValueError: if the given datasets do not have the same metadata values defined by key """ if len(dataset_names) == 0: return logger = logging.getLogger(__name__) entries_per_dataset = [getattr(MetadataCatalog.get(d), key) for d in dataset_names] for idx, entry in enumerate(entries_per_dataset): if entry != entries_per_dataset[0]: logger.error( "Metadata '{}' for dataset '{}' is '{}'".format(key, dataset_names[idx], str(entry)) ) logger.error( "Metadata '{}' for dataset '{}' is '{}'".format( key, dataset_names[0], str(entries_per_dataset[0]) ) ) raise ValueError("Datasets have different metadata '{}'!".format(key)) def build_augmentation(cfg, is_train): """ Create a list of default :class:`Augmentation` from config. Now it includes resizing and flipping. Returns: list[Augmentation] """ if is_train: min_size = cfg.INPUT.MIN_SIZE_TRAIN max_size = cfg.INPUT.MAX_SIZE_TRAIN sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING else: min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST sample_style = "choice" augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)] if is_train and cfg.INPUT.RANDOM_FLIP != "none": augmentation.append( T.RandomFlip( horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", vertical=cfg.INPUT.RANDOM_FLIP == "vertical", ) ) return augmentation build_transform_gen = build_augmentation """ Alias for backward-compatibility. """ ================================================ FILE: annotator/oneformer/detectron2/data/samplers/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .distributed_sampler import ( InferenceSampler, RandomSubsetTrainingSampler, RepeatFactorTrainingSampler, TrainingSampler, ) from .grouped_batch_sampler import GroupedBatchSampler __all__ = [ "GroupedBatchSampler", "TrainingSampler", "RandomSubsetTrainingSampler", "InferenceSampler", "RepeatFactorTrainingSampler", ] ================================================ FILE: annotator/oneformer/detectron2/data/samplers/distributed_sampler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import itertools import logging import math from collections import defaultdict from typing import Optional import torch from torch.utils.data.sampler import Sampler from annotator.oneformer.detectron2.utils import comm logger = logging.getLogger(__name__) class TrainingSampler(Sampler): """ In training, we only care about the "infinite stream" of training data. So this sampler produces an infinite stream of indices and all workers cooperate to correctly shuffle the indices and sample different indices. The samplers in each worker effectively produces `indices[worker_id::num_workers]` where `indices` is an infinite stream of indices consisting of `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True) or `range(size) + range(size) + ...` (if shuffle is False) Note that this sampler does not shard based on pytorch DataLoader worker id. A sampler passed to pytorch DataLoader is used only with map-style dataset and will not be executed inside workers. But if this sampler is used in a way that it gets execute inside a dataloader worker, then extra work needs to be done to shard its outputs based on worker id. This is required so that workers don't produce identical data. :class:`ToIterableDataset` implements this logic. This note is true for all samplers in detectron2. """ def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None): """ Args: size (int): the total number of data of the underlying dataset to sample from shuffle (bool): whether to shuffle the indices or not seed (int): the initial seed of the shuffle. Must be the same across all workers. If None, will use a random seed shared among workers (require synchronization among all workers). """ if not isinstance(size, int): raise TypeError(f"TrainingSampler(size=) expects an int. Got type {type(size)}.") if size <= 0: raise ValueError(f"TrainingSampler(size=) expects a positive int. Got {size}.") self._size = size self._shuffle = shuffle if seed is None: seed = comm.shared_random_seed() self._seed = int(seed) self._rank = comm.get_rank() self._world_size = comm.get_world_size() def __iter__(self): start = self._rank yield from itertools.islice(self._infinite_indices(), start, None, self._world_size) def _infinite_indices(self): g = torch.Generator() g.manual_seed(self._seed) while True: if self._shuffle: yield from torch.randperm(self._size, generator=g).tolist() else: yield from torch.arange(self._size).tolist() class RandomSubsetTrainingSampler(TrainingSampler): """ Similar to TrainingSampler, but only sample a random subset of indices. This is useful when you want to estimate the accuracy vs data-number curves by training the model with different subset_ratio. """ def __init__( self, size: int, subset_ratio: float, shuffle: bool = True, seed_shuffle: Optional[int] = None, seed_subset: Optional[int] = None, ): """ Args: size (int): the total number of data of the underlying dataset to sample from subset_ratio (float): the ratio of subset data to sample from the underlying dataset shuffle (bool): whether to shuffle the indices or not seed_shuffle (int): the initial seed of the shuffle. Must be the same across all workers. If None, will use a random seed shared among workers (require synchronization among all workers). seed_subset (int): the seed to randomize the subset to be sampled. Must be the same across all workers. If None, will use a random seed shared among workers (require synchronization among all workers). """ super().__init__(size=size, shuffle=shuffle, seed=seed_shuffle) assert 0.0 < subset_ratio <= 1.0 self._size_subset = int(size * subset_ratio) assert self._size_subset > 0 if seed_subset is None: seed_subset = comm.shared_random_seed() self._seed_subset = int(seed_subset) # randomly generate the subset indexes to be sampled from g = torch.Generator() g.manual_seed(self._seed_subset) indexes_randperm = torch.randperm(self._size, generator=g) self._indexes_subset = indexes_randperm[: self._size_subset] logger.info("Using RandomSubsetTrainingSampler......") logger.info(f"Randomly sample {self._size_subset} data from the original {self._size} data") def _infinite_indices(self): g = torch.Generator() g.manual_seed(self._seed) # self._seed equals seed_shuffle from __init__() while True: if self._shuffle: # generate a random permutation to shuffle self._indexes_subset randperm = torch.randperm(self._size_subset, generator=g) yield from self._indexes_subset[randperm].tolist() else: yield from self._indexes_subset.tolist() class RepeatFactorTrainingSampler(Sampler): """ Similar to TrainingSampler, but a sample may appear more times than others based on its "repeat factor". This is suitable for training on class imbalanced datasets like LVIS. """ def __init__(self, repeat_factors, *, shuffle=True, seed=None): """ Args: repeat_factors (Tensor): a float vector, the repeat factor for each indice. When it's full of ones, it is equivalent to ``TrainingSampler(len(repeat_factors), ...)``. shuffle (bool): whether to shuffle the indices or not seed (int): the initial seed of the shuffle. Must be the same across all workers. If None, will use a random seed shared among workers (require synchronization among all workers). """ self._shuffle = shuffle if seed is None: seed = comm.shared_random_seed() self._seed = int(seed) self._rank = comm.get_rank() self._world_size = comm.get_world_size() # Split into whole number (_int_part) and fractional (_frac_part) parts. self._int_part = torch.trunc(repeat_factors) self._frac_part = repeat_factors - self._int_part @staticmethod def repeat_factors_from_category_frequency(dataset_dicts, repeat_thresh): """ Compute (fractional) per-image repeat factors based on category frequency. The repeat factor for an image is a function of the frequency of the rarest category labeled in that image. The "frequency of category c" in [0, 1] is defined as the fraction of images in the training set (without repeats) in which category c appears. See :paper:`lvis` (>= v2) Appendix B.2. Args: dataset_dicts (list[dict]): annotations in Detectron2 dataset format. repeat_thresh (float): frequency threshold below which data is repeated. If the frequency is half of `repeat_thresh`, the image will be repeated twice. Returns: torch.Tensor: the i-th element is the repeat factor for the dataset image at index i. """ # 1. For each category c, compute the fraction of images that contain it: f(c) category_freq = defaultdict(int) for dataset_dict in dataset_dicts: # For each image (without repeats) cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} for cat_id in cat_ids: category_freq[cat_id] += 1 num_images = len(dataset_dicts) for k, v in category_freq.items(): category_freq[k] = v / num_images # 2. For each category c, compute the category-level repeat factor: # r(c) = max(1, sqrt(t / f(c))) category_rep = { cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) for cat_id, cat_freq in category_freq.items() } # 3. For each image I, compute the image-level repeat factor: # r(I) = max_{c in I} r(c) rep_factors = [] for dataset_dict in dataset_dicts: cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0) rep_factors.append(rep_factor) return torch.tensor(rep_factors, dtype=torch.float32) def _get_epoch_indices(self, generator): """ Create a list of dataset indices (with repeats) to use for one epoch. Args: generator (torch.Generator): pseudo random number generator used for stochastic rounding. Returns: torch.Tensor: list of dataset indices to use in one epoch. Each index is repeated based on its calculated repeat factor. """ # Since repeat factors are fractional, we use stochastic rounding so # that the target repeat factor is achieved in expectation over the # course of training rands = torch.rand(len(self._frac_part), generator=generator) rep_factors = self._int_part + (rands < self._frac_part).float() # Construct a list of indices in which we repeat images as specified indices = [] for dataset_index, rep_factor in enumerate(rep_factors): indices.extend([dataset_index] * int(rep_factor.item())) return torch.tensor(indices, dtype=torch.int64) def __iter__(self): start = self._rank yield from itertools.islice(self._infinite_indices(), start, None, self._world_size) def _infinite_indices(self): g = torch.Generator() g.manual_seed(self._seed) while True: # Sample indices with repeats determined by stochastic rounding; each # "epoch" may have a slightly different size due to the rounding. indices = self._get_epoch_indices(g) if self._shuffle: randperm = torch.randperm(len(indices), generator=g) yield from indices[randperm].tolist() else: yield from indices.tolist() class InferenceSampler(Sampler): """ Produce indices for inference across all workers. Inference needs to run on the __exact__ set of samples, therefore when the total number of samples is not divisible by the number of workers, this sampler produces different number of samples on different workers. """ def __init__(self, size: int): """ Args: size (int): the total number of data of the underlying dataset to sample from """ self._size = size assert size > 0 self._rank = comm.get_rank() self._world_size = comm.get_world_size() self._local_indices = self._get_local_indices(size, self._world_size, self._rank) @staticmethod def _get_local_indices(total_size, world_size, rank): shard_size = total_size // world_size left = total_size % world_size shard_sizes = [shard_size + int(r < left) for r in range(world_size)] begin = sum(shard_sizes[:rank]) end = min(sum(shard_sizes[: rank + 1]), total_size) return range(begin, end) def __iter__(self): yield from self._local_indices def __len__(self): return len(self._local_indices) ================================================ FILE: annotator/oneformer/detectron2/data/samplers/grouped_batch_sampler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import numpy as np from torch.utils.data.sampler import BatchSampler, Sampler class GroupedBatchSampler(BatchSampler): """ Wraps another sampler to yield a mini-batch of indices. It enforces that the batch only contain elements from the same group. It also tries to provide mini-batches which follows an ordering which is as close as possible to the ordering from the original sampler. """ def __init__(self, sampler, group_ids, batch_size): """ Args: sampler (Sampler): Base sampler. group_ids (list[int]): If the sampler produces indices in range [0, N), `group_ids` must be a list of `N` ints which contains the group id of each sample. The group ids must be a set of integers in the range [0, num_groups). batch_size (int): Size of mini-batch. """ if not isinstance(sampler, Sampler): raise ValueError( "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler) ) self.sampler = sampler self.group_ids = np.asarray(group_ids) assert self.group_ids.ndim == 1 self.batch_size = batch_size groups = np.unique(self.group_ids).tolist() # buffer the indices of each group until batch size is reached self.buffer_per_group = {k: [] for k in groups} def __iter__(self): for idx in self.sampler: group_id = self.group_ids[idx] group_buffer = self.buffer_per_group[group_id] group_buffer.append(idx) if len(group_buffer) == self.batch_size: yield group_buffer[:] # yield a copy of the list del group_buffer[:] def __len__(self): raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.") ================================================ FILE: annotator/oneformer/detectron2/data/transforms/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from fvcore.transforms.transform import Transform, TransformList # order them first from fvcore.transforms.transform import * from .transform import * from .augmentation import * from .augmentation_impl import * __all__ = [k for k in globals().keys() if not k.startswith("_")] from annotator.oneformer.detectron2.utils.env import fixup_module_metadata fixup_module_metadata(__name__, globals(), __all__) del fixup_module_metadata ================================================ FILE: annotator/oneformer/detectron2/data/transforms/augmentation.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import inspect import numpy as np import pprint from typing import Any, List, Optional, Tuple, Union from fvcore.transforms.transform import Transform, TransformList """ See "Data Augmentation" tutorial for an overview of the system: https://detectron2.readthedocs.io/tutorials/augmentation.html """ __all__ = [ "Augmentation", "AugmentationList", "AugInput", "TransformGen", "apply_transform_gens", "StandardAugInput", "apply_augmentations", ] def _check_img_dtype(img): assert isinstance(img, np.ndarray), "[Augmentation] Needs an numpy array, but got a {}!".format( type(img) ) assert not isinstance(img.dtype, np.integer) or ( img.dtype == np.uint8 ), "[Augmentation] Got image of type {}, use uint8 or floating points instead!".format( img.dtype ) assert img.ndim in [2, 3], img.ndim def _get_aug_input_args(aug, aug_input) -> List[Any]: """ Get the arguments to be passed to ``aug.get_transform`` from the input ``aug_input``. """ if aug.input_args is None: # Decide what attributes are needed automatically prms = list(inspect.signature(aug.get_transform).parameters.items()) # The default behavior is: if there is one parameter, then its "image" # (work automatically for majority of use cases, and also avoid BC breaking), # Otherwise, use the argument names. if len(prms) == 1: names = ("image",) else: names = [] for name, prm in prms: if prm.kind in ( inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD, ): raise TypeError( f""" \ The default implementation of `{type(aug)}.__call__` does not allow \ `{type(aug)}.get_transform` to use variable-length arguments (*args, **kwargs)! \ If arguments are unknown, reimplement `__call__` instead. \ """ ) names.append(name) aug.input_args = tuple(names) args = [] for f in aug.input_args: try: args.append(getattr(aug_input, f)) except AttributeError as e: raise AttributeError( f"{type(aug)}.get_transform needs input attribute '{f}', " f"but it is not an attribute of {type(aug_input)}!" ) from e return args class Augmentation: """ Augmentation defines (often random) policies/strategies to generate :class:`Transform` from data. It is often used for pre-processing of input data. A "policy" that generates a :class:`Transform` may, in the most general case, need arbitrary information from input data in order to determine what transforms to apply. Therefore, each :class:`Augmentation` instance defines the arguments needed by its :meth:`get_transform` method. When called with the positional arguments, the :meth:`get_transform` method executes the policy. Note that :class:`Augmentation` defines the policies to create a :class:`Transform`, but not how to execute the actual transform operations to those data. Its :meth:`__call__` method will use :meth:`AugInput.transform` to execute the transform. The returned `Transform` object is meant to describe deterministic transformation, which means it can be re-applied on associated data, e.g. the geometry of an image and its segmentation masks need to be transformed together. (If such re-application is not needed, then determinism is not a crucial requirement.) """ input_args: Optional[Tuple[str]] = None """ Stores the attribute names needed by :meth:`get_transform`, e.g. ``("image", "sem_seg")``. By default, it is just a tuple of argument names in :meth:`self.get_transform`, which often only contain "image". As long as the argument name convention is followed, there is no need for users to touch this attribute. """ def _init(self, params=None): if params: for k, v in params.items(): if k != "self" and not k.startswith("_"): setattr(self, k, v) def get_transform(self, *args) -> Transform: """ Execute the policy based on input data, and decide what transform to apply to inputs. Args: args: Any fixed-length positional arguments. By default, the name of the arguments should exist in the :class:`AugInput` to be used. Returns: Transform: Returns the deterministic transform to apply to the input. Examples: :: class MyAug: # if a policy needs to know both image and semantic segmentation def get_transform(image, sem_seg) -> T.Transform: pass tfm: Transform = MyAug().get_transform(image, sem_seg) new_image = tfm.apply_image(image) Notes: Users can freely use arbitrary new argument names in custom :meth:`get_transform` method, as long as they are available in the input data. In detectron2 we use the following convention: * image: (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or floating point in range [0, 1] or [0, 255]. * boxes: (N,4) ndarray of float32. It represents the instance bounding boxes of N instances. Each is in XYXY format in unit of absolute coordinates. * sem_seg: (H,W) ndarray of type uint8. Each element is an integer label of pixel. We do not specify convention for other types and do not include builtin :class:`Augmentation` that uses other types in detectron2. """ raise NotImplementedError def __call__(self, aug_input) -> Transform: """ Augment the given `aug_input` **in-place**, and return the transform that's used. This method will be called to apply the augmentation. In most augmentation, it is enough to use the default implementation, which calls :meth:`get_transform` using the inputs. But a subclass can overwrite it to have more complicated logic. Args: aug_input (AugInput): an object that has attributes needed by this augmentation (defined by ``self.get_transform``). Its ``transform`` method will be called to in-place transform it. Returns: Transform: the transform that is applied on the input. """ args = _get_aug_input_args(self, aug_input) tfm = self.get_transform(*args) assert isinstance(tfm, (Transform, TransformList)), ( f"{type(self)}.get_transform must return an instance of Transform! " f"Got {type(tfm)} instead." ) aug_input.transform(tfm) return tfm def _rand_range(self, low=1.0, high=None, size=None): """ Uniform float random number between low and high. """ if high is None: low, high = 0, low if size is None: size = [] return np.random.uniform(low, high, size) def __repr__(self): """ Produce something like: "MyAugmentation(field1={self.field1}, field2={self.field2})" """ try: sig = inspect.signature(self.__init__) classname = type(self).__name__ argstr = [] for name, param in sig.parameters.items(): assert ( param.kind != param.VAR_POSITIONAL and param.kind != param.VAR_KEYWORD ), "The default __repr__ doesn't support *args or **kwargs" assert hasattr(self, name), ( "Attribute {} not found! " "Default __repr__ only works if attributes match the constructor.".format(name) ) attr = getattr(self, name) default = param.default if default is attr: continue attr_str = pprint.pformat(attr) if "\n" in attr_str: # don't show it if pformat decides to use >1 lines attr_str = "..." argstr.append("{}={}".format(name, attr_str)) return "{}({})".format(classname, ", ".join(argstr)) except AssertionError: return super().__repr__() __str__ = __repr__ class _TransformToAug(Augmentation): def __init__(self, tfm: Transform): self.tfm = tfm def get_transform(self, *args): return self.tfm def __repr__(self): return repr(self.tfm) __str__ = __repr__ def _transform_to_aug(tfm_or_aug): """ Wrap Transform into Augmentation. Private, used internally to implement augmentations. """ assert isinstance(tfm_or_aug, (Transform, Augmentation)), tfm_or_aug if isinstance(tfm_or_aug, Augmentation): return tfm_or_aug else: return _TransformToAug(tfm_or_aug) class AugmentationList(Augmentation): """ Apply a sequence of augmentations. It has ``__call__`` method to apply the augmentations. Note that :meth:`get_transform` method is impossible (will throw error if called) for :class:`AugmentationList`, because in order to apply a sequence of augmentations, the kth augmentation must be applied first, to provide inputs needed by the (k+1)th augmentation. """ def __init__(self, augs): """ Args: augs (list[Augmentation or Transform]): """ super().__init__() self.augs = [_transform_to_aug(x) for x in augs] def __call__(self, aug_input) -> TransformList: tfms = [] for x in self.augs: tfm = x(aug_input) tfms.append(tfm) return TransformList(tfms) def __repr__(self): msgs = [str(x) for x in self.augs] return "AugmentationList[{}]".format(", ".join(msgs)) __str__ = __repr__ class AugInput: """ Input that can be used with :meth:`Augmentation.__call__`. This is a standard implementation for the majority of use cases. This class provides the standard attributes **"image", "boxes", "sem_seg"** defined in :meth:`__init__` and they may be needed by different augmentations. Most augmentation policies do not need attributes beyond these three. After applying augmentations to these attributes (using :meth:`AugInput.transform`), the returned transforms can then be used to transform other data structures that users have. Examples: :: input = AugInput(image, boxes=boxes) tfms = augmentation(input) transformed_image = input.image transformed_boxes = input.boxes transformed_other_data = tfms.apply_other(other_data) An extended project that works with new data types may implement augmentation policies that need other inputs. An algorithm may need to transform inputs in a way different from the standard approach defined in this class. In those rare situations, users can implement a class similar to this class, that satify the following condition: * The input must provide access to these data in the form of attribute access (``getattr``). For example, if an :class:`Augmentation` to be applied needs "image" and "sem_seg" arguments, its input must have the attribute "image" and "sem_seg". * The input must have a ``transform(tfm: Transform) -> None`` method which in-place transforms all its attributes. """ # TODO maybe should support more builtin data types here def __init__( self, image: np.ndarray, *, boxes: Optional[np.ndarray] = None, sem_seg: Optional[np.ndarray] = None, ): """ Args: image (ndarray): (H,W) or (H,W,C) ndarray of type uint8 in range [0, 255], or floating point in range [0, 1] or [0, 255]. The meaning of C is up to users. boxes (ndarray or None): Nx4 float32 boxes in XYXY_ABS mode sem_seg (ndarray or None): HxW uint8 semantic segmentation mask. Each element is an integer label of pixel. """ _check_img_dtype(image) self.image = image self.boxes = boxes self.sem_seg = sem_seg def transform(self, tfm: Transform) -> None: """ In-place transform all attributes of this class. By "in-place", it means after calling this method, accessing an attribute such as ``self.image`` will return transformed data. """ self.image = tfm.apply_image(self.image) if self.boxes is not None: self.boxes = tfm.apply_box(self.boxes) if self.sem_seg is not None: self.sem_seg = tfm.apply_segmentation(self.sem_seg) def apply_augmentations( self, augmentations: List[Union[Augmentation, Transform]] ) -> TransformList: """ Equivalent of ``AugmentationList(augmentations)(self)`` """ return AugmentationList(augmentations)(self) def apply_augmentations(augmentations: List[Union[Transform, Augmentation]], inputs): """ Use ``T.AugmentationList(augmentations)(inputs)`` instead. """ if isinstance(inputs, np.ndarray): # handle the common case of image-only Augmentation, also for backward compatibility image_only = True inputs = AugInput(inputs) else: image_only = False tfms = inputs.apply_augmentations(augmentations) return inputs.image if image_only else inputs, tfms apply_transform_gens = apply_augmentations """ Alias for backward-compatibility. """ TransformGen = Augmentation """ Alias for Augmentation, since it is something that generates :class:`Transform`s """ StandardAugInput = AugInput """ Alias for compatibility. It's not worth the complexity to have two classes. """ ================================================ FILE: annotator/oneformer/detectron2/data/transforms/augmentation_impl.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. """ Implement many useful :class:`Augmentation`. """ import numpy as np import sys from numpy import random from typing import Tuple import torch from fvcore.transforms.transform import ( BlendTransform, CropTransform, HFlipTransform, NoOpTransform, PadTransform, Transform, TransformList, VFlipTransform, ) from PIL import Image from annotator.oneformer.detectron2.structures import Boxes, pairwise_iou from .augmentation import Augmentation, _transform_to_aug from .transform import ExtentTransform, ResizeTransform, RotationTransform __all__ = [ "FixedSizeCrop", "RandomApply", "RandomBrightness", "RandomContrast", "RandomCrop", "RandomExtent", "RandomFlip", "RandomSaturation", "RandomLighting", "RandomRotation", "Resize", "ResizeScale", "ResizeShortestEdge", "RandomCrop_CategoryAreaConstraint", "RandomResize", "MinIoURandomCrop", ] class RandomApply(Augmentation): """ Randomly apply an augmentation with a given probability. """ def __init__(self, tfm_or_aug, prob=0.5): """ Args: tfm_or_aug (Transform, Augmentation): the transform or augmentation to be applied. It can either be a `Transform` or `Augmentation` instance. prob (float): probability between 0.0 and 1.0 that the wrapper transformation is applied """ super().__init__() self.aug = _transform_to_aug(tfm_or_aug) assert 0.0 <= prob <= 1.0, f"Probablity must be between 0.0 and 1.0 (given: {prob})" self.prob = prob def get_transform(self, *args): do = self._rand_range() < self.prob if do: return self.aug.get_transform(*args) else: return NoOpTransform() def __call__(self, aug_input): do = self._rand_range() < self.prob if do: return self.aug(aug_input) else: return NoOpTransform() class RandomFlip(Augmentation): """ Flip the image horizontally or vertically with the given probability. """ def __init__(self, prob=0.5, *, horizontal=True, vertical=False): """ Args: prob (float): probability of flip. horizontal (boolean): whether to apply horizontal flipping vertical (boolean): whether to apply vertical flipping """ super().__init__() if horizontal and vertical: raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.") if not horizontal and not vertical: raise ValueError("At least one of horiz or vert has to be True!") self._init(locals()) def get_transform(self, image): h, w = image.shape[:2] do = self._rand_range() < self.prob if do: if self.horizontal: return HFlipTransform(w) elif self.vertical: return VFlipTransform(h) else: return NoOpTransform() class Resize(Augmentation): """Resize image to a fixed target size""" def __init__(self, shape, interp=Image.BILINEAR): """ Args: shape: (h, w) tuple or a int interp: PIL interpolation method """ if isinstance(shape, int): shape = (shape, shape) shape = tuple(shape) self._init(locals()) def get_transform(self, image): return ResizeTransform( image.shape[0], image.shape[1], self.shape[0], self.shape[1], self.interp ) class ResizeShortestEdge(Augmentation): """ Resize the image while keeping the aspect ratio unchanged. It attempts to scale the shorter edge to the given `short_edge_length`, as long as the longer edge does not exceed `max_size`. If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. """ @torch.jit.unused def __init__( self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR ): """ Args: short_edge_length (list[int]): If ``sample_style=="range"``, a [min, max] interval from which to sample the shortest edge length. If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. max_size (int): maximum allowed longest edge length. sample_style (str): either "range" or "choice". """ super().__init__() assert sample_style in ["range", "choice"], sample_style self.is_range = sample_style == "range" if isinstance(short_edge_length, int): short_edge_length = (short_edge_length, short_edge_length) if self.is_range: assert len(short_edge_length) == 2, ( "short_edge_length must be two values using 'range' sample style." f" Got {short_edge_length}!" ) self._init(locals()) @torch.jit.unused def get_transform(self, image): h, w = image.shape[:2] if self.is_range: size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) else: size = np.random.choice(self.short_edge_length) if size == 0: return NoOpTransform() newh, neww = ResizeShortestEdge.get_output_shape(h, w, size, self.max_size) return ResizeTransform(h, w, newh, neww, self.interp) @staticmethod def get_output_shape( oldh: int, oldw: int, short_edge_length: int, max_size: int ) -> Tuple[int, int]: """ Compute the output size given input size and target short edge length. """ h, w = oldh, oldw size = short_edge_length * 1.0 scale = size / min(h, w) if h < w: newh, neww = size, scale * w else: newh, neww = scale * h, size if max(newh, neww) > max_size: scale = max_size * 1.0 / max(newh, neww) newh = newh * scale neww = neww * scale neww = int(neww + 0.5) newh = int(newh + 0.5) return (newh, neww) class ResizeScale(Augmentation): """ Takes target size as input and randomly scales the given target size between `min_scale` and `max_scale`. It then scales the input image such that it fits inside the scaled target box, keeping the aspect ratio constant. This implements the resize part of the Google's 'resize_and_crop' data augmentation: https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/input_utils.py#L127 """ def __init__( self, min_scale: float, max_scale: float, target_height: int, target_width: int, interp: int = Image.BILINEAR, ): """ Args: min_scale: minimum image scale range. max_scale: maximum image scale range. target_height: target image height. target_width: target image width. interp: image interpolation method. """ super().__init__() self._init(locals()) def _get_resize(self, image: np.ndarray, scale: float) -> Transform: input_size = image.shape[:2] # Compute new target size given a scale. target_size = (self.target_height, self.target_width) target_scale_size = np.multiply(target_size, scale) # Compute actual rescaling applied to input image and output size. output_scale = np.minimum( target_scale_size[0] / input_size[0], target_scale_size[1] / input_size[1] ) output_size = np.round(np.multiply(input_size, output_scale)).astype(int) return ResizeTransform( input_size[0], input_size[1], output_size[0], output_size[1], self.interp ) def get_transform(self, image: np.ndarray) -> Transform: random_scale = np.random.uniform(self.min_scale, self.max_scale) return self._get_resize(image, random_scale) class RandomRotation(Augmentation): """ This method returns a copy of this image, rotated the given number of degrees counter clockwise around the given center. """ def __init__(self, angle, expand=True, center=None, sample_style="range", interp=None): """ Args: angle (list[float]): If ``sample_style=="range"``, a [min, max] interval from which to sample the angle (in degrees). If ``sample_style=="choice"``, a list of angles to sample from expand (bool): choose if the image should be resized to fit the whole rotated image (default), or simply cropped center (list[[float, float]]): If ``sample_style=="range"``, a [[minx, miny], [maxx, maxy]] relative interval from which to sample the center, [0, 0] being the top left of the image and [1, 1] the bottom right. If ``sample_style=="choice"``, a list of centers to sample from Default: None, which means that the center of rotation is the center of the image center has no effect if expand=True because it only affects shifting """ super().__init__() assert sample_style in ["range", "choice"], sample_style self.is_range = sample_style == "range" if isinstance(angle, (float, int)): angle = (angle, angle) if center is not None and isinstance(center[0], (float, int)): center = (center, center) self._init(locals()) def get_transform(self, image): h, w = image.shape[:2] center = None if self.is_range: angle = np.random.uniform(self.angle[0], self.angle[1]) if self.center is not None: center = ( np.random.uniform(self.center[0][0], self.center[1][0]), np.random.uniform(self.center[0][1], self.center[1][1]), ) else: angle = np.random.choice(self.angle) if self.center is not None: center = np.random.choice(self.center) if center is not None: center = (w * center[0], h * center[1]) # Convert to absolute coordinates if angle % 360 == 0: return NoOpTransform() return RotationTransform(h, w, angle, expand=self.expand, center=center, interp=self.interp) class FixedSizeCrop(Augmentation): """ If `crop_size` is smaller than the input image size, then it uses a random crop of the crop size. If `crop_size` is larger than the input image size, then it pads the right and the bottom of the image to the crop size if `pad` is True, otherwise it returns the smaller image. """ def __init__( self, crop_size: Tuple[int], pad: bool = True, pad_value: float = 128.0, seg_pad_value: int = 255, ): """ Args: crop_size: target image (height, width). pad: if True, will pad images smaller than `crop_size` up to `crop_size` pad_value: the padding value to the image. seg_pad_value: the padding value to the segmentation mask. """ super().__init__() self._init(locals()) def _get_crop(self, image: np.ndarray) -> Transform: # Compute the image scale and scaled size. input_size = image.shape[:2] output_size = self.crop_size # Add random crop if the image is scaled up. max_offset = np.subtract(input_size, output_size) max_offset = np.maximum(max_offset, 0) offset = np.multiply(max_offset, np.random.uniform(0.0, 1.0)) offset = np.round(offset).astype(int) return CropTransform( offset[1], offset[0], output_size[1], output_size[0], input_size[1], input_size[0] ) def _get_pad(self, image: np.ndarray) -> Transform: # Compute the image scale and scaled size. input_size = image.shape[:2] output_size = self.crop_size # Add padding if the image is scaled down. pad_size = np.subtract(output_size, input_size) pad_size = np.maximum(pad_size, 0) original_size = np.minimum(input_size, output_size) return PadTransform( 0, 0, pad_size[1], pad_size[0], original_size[1], original_size[0], self.pad_value, self.seg_pad_value, ) def get_transform(self, image: np.ndarray) -> TransformList: transforms = [self._get_crop(image)] if self.pad: transforms.append(self._get_pad(image)) return TransformList(transforms) class RandomCrop(Augmentation): """ Randomly crop a rectangle region out of an image. """ def __init__(self, crop_type: str, crop_size): """ Args: crop_type (str): one of "relative_range", "relative", "absolute", "absolute_range". crop_size (tuple[float, float]): two floats, explained below. - "relative": crop a (H * crop_size[0], W * crop_size[1]) region from an input image of size (H, W). crop size should be in (0, 1] - "relative_range": uniformly sample two values from [crop_size[0], 1] and [crop_size[1]], 1], and use them as in "relative" crop type. - "absolute" crop a (crop_size[0], crop_size[1]) region from input image. crop_size must be smaller than the input image size. - "absolute_range", for an input of size (H, W), uniformly sample H_crop in [crop_size[0], min(H, crop_size[1])] and W_crop in [crop_size[0], min(W, crop_size[1])]. Then crop a region (H_crop, W_crop). """ # TODO style of relative_range and absolute_range are not consistent: # one takes (h, w) but another takes (min, max) super().__init__() assert crop_type in ["relative_range", "relative", "absolute", "absolute_range"] self._init(locals()) def get_transform(self, image): h, w = image.shape[:2] croph, cropw = self.get_crop_size((h, w)) assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self) h0 = np.random.randint(h - croph + 1) w0 = np.random.randint(w - cropw + 1) return CropTransform(w0, h0, cropw, croph) def get_crop_size(self, image_size): """ Args: image_size (tuple): height, width Returns: crop_size (tuple): height, width in absolute pixels """ h, w = image_size if self.crop_type == "relative": ch, cw = self.crop_size return int(h * ch + 0.5), int(w * cw + 0.5) elif self.crop_type == "relative_range": crop_size = np.asarray(self.crop_size, dtype=np.float32) ch, cw = crop_size + np.random.rand(2) * (1 - crop_size) return int(h * ch + 0.5), int(w * cw + 0.5) elif self.crop_type == "absolute": return (min(self.crop_size[0], h), min(self.crop_size[1], w)) elif self.crop_type == "absolute_range": assert self.crop_size[0] <= self.crop_size[1] ch = np.random.randint(min(h, self.crop_size[0]), min(h, self.crop_size[1]) + 1) cw = np.random.randint(min(w, self.crop_size[0]), min(w, self.crop_size[1]) + 1) return ch, cw else: raise NotImplementedError("Unknown crop type {}".format(self.crop_type)) class RandomCrop_CategoryAreaConstraint(Augmentation): """ Similar to :class:`RandomCrop`, but find a cropping window such that no single category occupies a ratio of more than `single_category_max_area` in semantic segmentation ground truth, which can cause unstability in training. The function attempts to find such a valid cropping window for at most 10 times. """ def __init__( self, crop_type: str, crop_size, single_category_max_area: float = 1.0, ignored_category: int = None, ): """ Args: crop_type, crop_size: same as in :class:`RandomCrop` single_category_max_area: the maximum allowed area ratio of a category. Set to 1.0 to disable ignored_category: allow this category in the semantic segmentation ground truth to exceed the area ratio. Usually set to the category that's ignored in training. """ self.crop_aug = RandomCrop(crop_type, crop_size) self._init(locals()) def get_transform(self, image, sem_seg): if self.single_category_max_area >= 1.0: return self.crop_aug.get_transform(image) else: h, w = sem_seg.shape for _ in range(10): crop_size = self.crop_aug.get_crop_size((h, w)) y0 = np.random.randint(h - crop_size[0] + 1) x0 = np.random.randint(w - crop_size[1] + 1) sem_seg_temp = sem_seg[y0 : y0 + crop_size[0], x0 : x0 + crop_size[1]] labels, cnt = np.unique(sem_seg_temp, return_counts=True) if self.ignored_category is not None: cnt = cnt[labels != self.ignored_category] if len(cnt) > 1 and np.max(cnt) < np.sum(cnt) * self.single_category_max_area: break crop_tfm = CropTransform(x0, y0, crop_size[1], crop_size[0]) return crop_tfm class RandomExtent(Augmentation): """ Outputs an image by cropping a random "subrect" of the source image. The subrect can be parameterized to include pixels outside the source image, in which case they will be set to zeros (i.e. black). The size of the output image will vary with the size of the random subrect. """ def __init__(self, scale_range, shift_range): """ Args: output_size (h, w): Dimensions of output image scale_range (l, h): Range of input-to-output size scaling factor shift_range (x, y): Range of shifts of the cropped subrect. The rect is shifted by [w / 2 * Uniform(-x, x), h / 2 * Uniform(-y, y)], where (w, h) is the (width, height) of the input image. Set each component to zero to crop at the image's center. """ super().__init__() self._init(locals()) def get_transform(self, image): img_h, img_w = image.shape[:2] # Initialize src_rect to fit the input image. src_rect = np.array([-0.5 * img_w, -0.5 * img_h, 0.5 * img_w, 0.5 * img_h]) # Apply a random scaling to the src_rect. src_rect *= np.random.uniform(self.scale_range[0], self.scale_range[1]) # Apply a random shift to the coordinates origin. src_rect[0::2] += self.shift_range[0] * img_w * (np.random.rand() - 0.5) src_rect[1::2] += self.shift_range[1] * img_h * (np.random.rand() - 0.5) # Map src_rect coordinates into image coordinates (center at corner). src_rect[0::2] += 0.5 * img_w src_rect[1::2] += 0.5 * img_h return ExtentTransform( src_rect=(src_rect[0], src_rect[1], src_rect[2], src_rect[3]), output_size=(int(src_rect[3] - src_rect[1]), int(src_rect[2] - src_rect[0])), ) class RandomContrast(Augmentation): """ Randomly transforms image contrast. Contrast intensity is uniformly sampled in (intensity_min, intensity_max). - intensity < 1 will reduce contrast - intensity = 1 will preserve the input image - intensity > 1 will increase contrast See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html """ def __init__(self, intensity_min, intensity_max): """ Args: intensity_min (float): Minimum augmentation intensity_max (float): Maximum augmentation """ super().__init__() self._init(locals()) def get_transform(self, image): w = np.random.uniform(self.intensity_min, self.intensity_max) return BlendTransform(src_image=image.mean(), src_weight=1 - w, dst_weight=w) class RandomBrightness(Augmentation): """ Randomly transforms image brightness. Brightness intensity is uniformly sampled in (intensity_min, intensity_max). - intensity < 1 will reduce brightness - intensity = 1 will preserve the input image - intensity > 1 will increase brightness See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html """ def __init__(self, intensity_min, intensity_max): """ Args: intensity_min (float): Minimum augmentation intensity_max (float): Maximum augmentation """ super().__init__() self._init(locals()) def get_transform(self, image): w = np.random.uniform(self.intensity_min, self.intensity_max) return BlendTransform(src_image=0, src_weight=1 - w, dst_weight=w) class RandomSaturation(Augmentation): """ Randomly transforms saturation of an RGB image. Input images are assumed to have 'RGB' channel order. Saturation intensity is uniformly sampled in (intensity_min, intensity_max). - intensity < 1 will reduce saturation (make the image more grayscale) - intensity = 1 will preserve the input image - intensity > 1 will increase saturation See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html """ def __init__(self, intensity_min, intensity_max): """ Args: intensity_min (float): Minimum augmentation (1 preserves input). intensity_max (float): Maximum augmentation (1 preserves input). """ super().__init__() self._init(locals()) def get_transform(self, image): assert image.shape[-1] == 3, "RandomSaturation only works on RGB images" w = np.random.uniform(self.intensity_min, self.intensity_max) grayscale = image.dot([0.299, 0.587, 0.114])[:, :, np.newaxis] return BlendTransform(src_image=grayscale, src_weight=1 - w, dst_weight=w) class RandomLighting(Augmentation): """ The "lighting" augmentation described in AlexNet, using fixed PCA over ImageNet. Input images are assumed to have 'RGB' channel order. The degree of color jittering is randomly sampled via a normal distribution, with standard deviation given by the scale parameter. """ def __init__(self, scale): """ Args: scale (float): Standard deviation of principal component weighting. """ super().__init__() self._init(locals()) self.eigen_vecs = np.array( [[-0.5675, 0.7192, 0.4009], [-0.5808, -0.0045, -0.8140], [-0.5836, -0.6948, 0.4203]] ) self.eigen_vals = np.array([0.2175, 0.0188, 0.0045]) def get_transform(self, image): assert image.shape[-1] == 3, "RandomLighting only works on RGB images" weights = np.random.normal(scale=self.scale, size=3) return BlendTransform( src_image=self.eigen_vecs.dot(weights * self.eigen_vals), src_weight=1.0, dst_weight=1.0 ) class RandomResize(Augmentation): """Randomly resize image to a target size in shape_list""" def __init__(self, shape_list, interp=Image.BILINEAR): """ Args: shape_list: a list of shapes in (h, w) interp: PIL interpolation method """ self.shape_list = shape_list self._init(locals()) def get_transform(self, image): shape_idx = np.random.randint(low=0, high=len(self.shape_list)) h, w = self.shape_list[shape_idx] return ResizeTransform(image.shape[0], image.shape[1], h, w, self.interp) class MinIoURandomCrop(Augmentation): """Random crop the image & bboxes, the cropped patches have minimum IoU requirement with original image & bboxes, the IoU threshold is randomly selected from min_ious. Args: min_ious (tuple): minimum IoU threshold for all intersections with bounding boxes min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, where a >= min_crop_size) mode_trials: number of trials for sampling min_ious threshold crop_trials: number of trials for sampling crop_size after cropping """ def __init__( self, min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3, mode_trials=1000, crop_trials=50, ): self.min_ious = min_ious self.sample_mode = (1, *min_ious, 0) self.min_crop_size = min_crop_size self.mode_trials = mode_trials self.crop_trials = crop_trials def get_transform(self, image, boxes): """Call function to crop images and bounding boxes with minimum IoU constraint. Args: boxes: ground truth boxes in (x1, y1, x2, y2) format """ if boxes is None: return NoOpTransform() h, w, c = image.shape for _ in range(self.mode_trials): mode = random.choice(self.sample_mode) self.mode = mode if mode == 1: return NoOpTransform() min_iou = mode for _ in range(self.crop_trials): new_w = random.uniform(self.min_crop_size * w, w) new_h = random.uniform(self.min_crop_size * h, h) # h / w in [0.5, 2] if new_h / new_w < 0.5 or new_h / new_w > 2: continue left = random.uniform(w - new_w) top = random.uniform(h - new_h) patch = np.array((int(left), int(top), int(left + new_w), int(top + new_h))) # Line or point crop is not allowed if patch[2] == patch[0] or patch[3] == patch[1]: continue overlaps = pairwise_iou( Boxes(patch.reshape(-1, 4)), Boxes(boxes.reshape(-1, 4)) ).reshape(-1) if len(overlaps) > 0 and overlaps.min() < min_iou: continue # center of boxes should inside the crop img # only adjust boxes and instance masks when the gt is not empty if len(overlaps) > 0: # adjust boxes def is_center_of_bboxes_in_patch(boxes, patch): center = (boxes[:, :2] + boxes[:, 2:]) / 2 mask = ( (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * (center[:, 0] < patch[2]) * (center[:, 1] < patch[3]) ) return mask mask = is_center_of_bboxes_in_patch(boxes, patch) if not mask.any(): continue return CropTransform(int(left), int(top), int(new_w), int(new_h)) ================================================ FILE: annotator/oneformer/detectron2/data/transforms/transform.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. """ See "Data Augmentation" tutorial for an overview of the system: https://detectron2.readthedocs.io/tutorials/augmentation.html """ import numpy as np import torch import torch.nn.functional as F from fvcore.transforms.transform import ( CropTransform, HFlipTransform, NoOpTransform, Transform, TransformList, ) from PIL import Image try: import cv2 # noqa except ImportError: # OpenCV is an optional dependency at the moment pass __all__ = [ "ExtentTransform", "ResizeTransform", "RotationTransform", "ColorTransform", "PILColorTransform", ] class ExtentTransform(Transform): """ Extracts a subregion from the source image and scales it to the output size. The fill color is used to map pixels from the source rect that fall outside the source image. See: https://pillow.readthedocs.io/en/latest/PIL.html#PIL.ImageTransform.ExtentTransform """ def __init__(self, src_rect, output_size, interp=Image.LINEAR, fill=0): """ Args: src_rect (x0, y0, x1, y1): src coordinates output_size (h, w): dst image size interp: PIL interpolation methods fill: Fill color used when src_rect extends outside image """ super().__init__() self._set_attributes(locals()) def apply_image(self, img, interp=None): h, w = self.output_size if len(img.shape) > 2 and img.shape[2] == 1: pil_image = Image.fromarray(img[:, :, 0], mode="L") else: pil_image = Image.fromarray(img) pil_image = pil_image.transform( size=(w, h), method=Image.EXTENT, data=self.src_rect, resample=interp if interp else self.interp, fill=self.fill, ) ret = np.asarray(pil_image) if len(img.shape) > 2 and img.shape[2] == 1: ret = np.expand_dims(ret, -1) return ret def apply_coords(self, coords): # Transform image center from source coordinates into output coordinates # and then map the new origin to the corner of the output image. h, w = self.output_size x0, y0, x1, y1 = self.src_rect new_coords = coords.astype(np.float32) new_coords[:, 0] -= 0.5 * (x0 + x1) new_coords[:, 1] -= 0.5 * (y0 + y1) new_coords[:, 0] *= w / (x1 - x0) new_coords[:, 1] *= h / (y1 - y0) new_coords[:, 0] += 0.5 * w new_coords[:, 1] += 0.5 * h return new_coords def apply_segmentation(self, segmentation): segmentation = self.apply_image(segmentation, interp=Image.NEAREST) return segmentation class ResizeTransform(Transform): """ Resize the image to a target size. """ def __init__(self, h, w, new_h, new_w, interp=None): """ Args: h, w (int): original image size new_h, new_w (int): new image size interp: PIL interpolation methods, defaults to bilinear. """ # TODO decide on PIL vs opencv super().__init__() if interp is None: interp = Image.BILINEAR self._set_attributes(locals()) def apply_image(self, img, interp=None): assert img.shape[:2] == (self.h, self.w) assert len(img.shape) <= 4 interp_method = interp if interp is not None else self.interp if img.dtype == np.uint8: if len(img.shape) > 2 and img.shape[2] == 1: pil_image = Image.fromarray(img[:, :, 0], mode="L") else: pil_image = Image.fromarray(img) pil_image = pil_image.resize((self.new_w, self.new_h), interp_method) ret = np.asarray(pil_image) if len(img.shape) > 2 and img.shape[2] == 1: ret = np.expand_dims(ret, -1) else: # PIL only supports uint8 if any(x < 0 for x in img.strides): img = np.ascontiguousarray(img) img = torch.from_numpy(img) shape = list(img.shape) shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:] img = img.view(shape_4d).permute(2, 3, 0, 1) # hw(c) -> nchw _PIL_RESIZE_TO_INTERPOLATE_MODE = { Image.NEAREST: "nearest", Image.BILINEAR: "bilinear", Image.BICUBIC: "bicubic", } mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[interp_method] align_corners = None if mode == "nearest" else False img = F.interpolate( img, (self.new_h, self.new_w), mode=mode, align_corners=align_corners ) shape[:2] = (self.new_h, self.new_w) ret = img.permute(2, 3, 0, 1).view(shape).numpy() # nchw -> hw(c) return ret def apply_coords(self, coords): coords[:, 0] = coords[:, 0] * (self.new_w * 1.0 / self.w) coords[:, 1] = coords[:, 1] * (self.new_h * 1.0 / self.h) return coords def apply_segmentation(self, segmentation): segmentation = self.apply_image(segmentation, interp=Image.NEAREST) return segmentation def inverse(self): return ResizeTransform(self.new_h, self.new_w, self.h, self.w, self.interp) class RotationTransform(Transform): """ This method returns a copy of this image, rotated the given number of degrees counter clockwise around its center. """ def __init__(self, h, w, angle, expand=True, center=None, interp=None): """ Args: h, w (int): original image size angle (float): degrees for rotation expand (bool): choose if the image should be resized to fit the whole rotated image (default), or simply cropped center (tuple (width, height)): coordinates of the rotation center if left to None, the center will be fit to the center of each image center has no effect if expand=True because it only affects shifting interp: cv2 interpolation method, default cv2.INTER_LINEAR """ super().__init__() image_center = np.array((w / 2, h / 2)) if center is None: center = image_center if interp is None: interp = cv2.INTER_LINEAR abs_cos, abs_sin = (abs(np.cos(np.deg2rad(angle))), abs(np.sin(np.deg2rad(angle)))) if expand: # find the new width and height bounds bound_w, bound_h = np.rint( [h * abs_sin + w * abs_cos, h * abs_cos + w * abs_sin] ).astype(int) else: bound_w, bound_h = w, h self._set_attributes(locals()) self.rm_coords = self.create_rotation_matrix() # Needed because of this problem https://github.com/opencv/opencv/issues/11784 self.rm_image = self.create_rotation_matrix(offset=-0.5) def apply_image(self, img, interp=None): """ img should be a numpy array, formatted as Height * Width * Nchannels """ if len(img) == 0 or self.angle % 360 == 0: return img assert img.shape[:2] == (self.h, self.w) interp = interp if interp is not None else self.interp return cv2.warpAffine(img, self.rm_image, (self.bound_w, self.bound_h), flags=interp) def apply_coords(self, coords): """ coords should be a N * 2 array-like, containing N couples of (x, y) points """ coords = np.asarray(coords, dtype=float) if len(coords) == 0 or self.angle % 360 == 0: return coords return cv2.transform(coords[:, np.newaxis, :], self.rm_coords)[:, 0, :] def apply_segmentation(self, segmentation): segmentation = self.apply_image(segmentation, interp=cv2.INTER_NEAREST) return segmentation def create_rotation_matrix(self, offset=0): center = (self.center[0] + offset, self.center[1] + offset) rm = cv2.getRotationMatrix2D(tuple(center), self.angle, 1) if self.expand: # Find the coordinates of the center of rotation in the new image # The only point for which we know the future coordinates is the center of the image rot_im_center = cv2.transform(self.image_center[None, None, :] + offset, rm)[0, 0, :] new_center = np.array([self.bound_w / 2, self.bound_h / 2]) + offset - rot_im_center # shift the rotation center to the new coordinates rm[:, 2] += new_center return rm def inverse(self): """ The inverse is to rotate it back with expand, and crop to get the original shape. """ if not self.expand: # Not possible to inverse if a part of the image is lost raise NotImplementedError() rotation = RotationTransform( self.bound_h, self.bound_w, -self.angle, True, None, self.interp ) crop = CropTransform( (rotation.bound_w - self.w) // 2, (rotation.bound_h - self.h) // 2, self.w, self.h ) return TransformList([rotation, crop]) class ColorTransform(Transform): """ Generic wrapper for any photometric transforms. These transformations should only affect the color space and not the coordinate space of the image (e.g. annotation coordinates such as bounding boxes should not be changed) """ def __init__(self, op): """ Args: op (Callable): operation to be applied to the image, which takes in an ndarray and returns an ndarray. """ if not callable(op): raise ValueError("op parameter should be callable") super().__init__() self._set_attributes(locals()) def apply_image(self, img): return self.op(img) def apply_coords(self, coords): return coords def inverse(self): return NoOpTransform() def apply_segmentation(self, segmentation): return segmentation class PILColorTransform(ColorTransform): """ Generic wrapper for PIL Photometric image transforms, which affect the color space and not the coordinate space of the image """ def __init__(self, op): """ Args: op (Callable): operation to be applied to the image, which takes in a PIL Image and returns a transformed PIL Image. For reference on possible operations see: - https://pillow.readthedocs.io/en/stable/ """ if not callable(op): raise ValueError("op parameter should be callable") super().__init__(op) def apply_image(self, img): img = Image.fromarray(img) return np.asarray(super().apply_image(img)) def HFlip_rotated_box(transform, rotated_boxes): """ Apply the horizontal flip transform on rotated boxes. Args: rotated_boxes (ndarray): Nx5 floating point array of (x_center, y_center, width, height, angle_degrees) format in absolute coordinates. """ # Transform x_center rotated_boxes[:, 0] = transform.width - rotated_boxes[:, 0] # Transform angle rotated_boxes[:, 4] = -rotated_boxes[:, 4] return rotated_boxes def Resize_rotated_box(transform, rotated_boxes): """ Apply the resizing transform on rotated boxes. For details of how these (approximation) formulas are derived, please refer to :meth:`RotatedBoxes.scale`. Args: rotated_boxes (ndarray): Nx5 floating point array of (x_center, y_center, width, height, angle_degrees) format in absolute coordinates. """ scale_factor_x = transform.new_w * 1.0 / transform.w scale_factor_y = transform.new_h * 1.0 / transform.h rotated_boxes[:, 0] *= scale_factor_x rotated_boxes[:, 1] *= scale_factor_y theta = rotated_boxes[:, 4] * np.pi / 180.0 c = np.cos(theta) s = np.sin(theta) rotated_boxes[:, 2] *= np.sqrt(np.square(scale_factor_x * c) + np.square(scale_factor_y * s)) rotated_boxes[:, 3] *= np.sqrt(np.square(scale_factor_x * s) + np.square(scale_factor_y * c)) rotated_boxes[:, 4] = np.arctan2(scale_factor_x * s, scale_factor_y * c) * 180 / np.pi return rotated_boxes HFlipTransform.register_type("rotated_box", HFlip_rotated_box) ResizeTransform.register_type("rotated_box", Resize_rotated_box) # not necessary any more with latest fvcore NoOpTransform.register_type("rotated_box", lambda t, x: x) ================================================ FILE: annotator/oneformer/detectron2/engine/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .launch import * from .train_loop import * __all__ = [k for k in globals().keys() if not k.startswith("_")] # prefer to let hooks and defaults live in separate namespaces (therefore not in __all__) # but still make them available here from .hooks import * from .defaults import * ================================================ FILE: annotator/oneformer/detectron2/engine/defaults.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. """ This file contains components with some default boilerplate logic user may need in training / testing. They will not work for everyone, but many users may find them useful. The behavior of functions/classes in this file is subject to change, since they are meant to represent the "common default behavior" people need in their projects. """ import argparse import logging import os import sys import weakref from collections import OrderedDict from typing import Optional import torch from fvcore.nn.precise_bn import get_bn_modules from omegaconf import OmegaConf from torch.nn.parallel import DistributedDataParallel import annotator.oneformer.detectron2.data.transforms as T from annotator.oneformer.detectron2.checkpoint import DetectionCheckpointer from annotator.oneformer.detectron2.config import CfgNode, LazyConfig from annotator.oneformer.detectron2.data import ( MetadataCatalog, build_detection_test_loader, build_detection_train_loader, ) from annotator.oneformer.detectron2.evaluation import ( DatasetEvaluator, inference_on_dataset, print_csv_format, verify_results, ) from annotator.oneformer.detectron2.modeling import build_model from annotator.oneformer.detectron2.solver import build_lr_scheduler, build_optimizer from annotator.oneformer.detectron2.utils import comm from annotator.oneformer.detectron2.utils.collect_env import collect_env_info from annotator.oneformer.detectron2.utils.env import seed_all_rng from annotator.oneformer.detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.logger import setup_logger from . import hooks from .train_loop import AMPTrainer, SimpleTrainer, TrainerBase __all__ = [ "create_ddp_model", "default_argument_parser", "default_setup", "default_writers", "DefaultPredictor", "DefaultTrainer", ] def create_ddp_model(model, *, fp16_compression=False, **kwargs): """ Create a DistributedDataParallel model if there are >1 processes. Args: model: a torch.nn.Module fp16_compression: add fp16 compression hooks to the ddp object. See more at https://pytorch.org/docs/stable/ddp_comm_hooks.html#torch.distributed.algorithms.ddp_comm_hooks.default_hooks.fp16_compress_hook kwargs: other arguments of :module:`torch.nn.parallel.DistributedDataParallel`. """ # noqa if comm.get_world_size() == 1: return model if "device_ids" not in kwargs: kwargs["device_ids"] = [comm.get_local_rank()] ddp = DistributedDataParallel(model, **kwargs) if fp16_compression: from torch.distributed.algorithms.ddp_comm_hooks import default as comm_hooks ddp.register_comm_hook(state=None, hook=comm_hooks.fp16_compress_hook) return ddp def default_argument_parser(epilog=None): """ Create a parser with some common arguments used by detectron2 users. Args: epilog (str): epilog passed to ArgumentParser describing the usage. Returns: argparse.ArgumentParser: """ parser = argparse.ArgumentParser( epilog=epilog or f""" Examples: Run on single machine: $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml Change some config options: $ {sys.argv[0]} --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth SOLVER.BASE_LR 0.001 Run on multiple machines: (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url [--other-flags] (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url [--other-flags] """, formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") parser.add_argument( "--resume", action="store_true", help="Whether to attempt to resume from the checkpoint directory. " "See documentation of `DefaultTrainer.resume_or_load()` for what it means.", ) parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") parser.add_argument( "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" ) # PyTorch still may leave orphan processes in multi-gpu training. # Therefore we use a deterministic way to obtain port, # so that users are aware of orphan processes by seeing the port occupied. port = 2**15 + 2**14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2**14 parser.add_argument( "--dist-url", default="tcp://127.0.0.1:{}".format(port), help="initialization URL for pytorch distributed backend. See " "https://pytorch.org/docs/stable/distributed.html for details.", ) parser.add_argument( "opts", help=""" Modify config options at the end of the command. For Yacs configs, use space-separated "PATH.KEY VALUE" pairs. For python-based LazyConfig, use "path.key=value". """.strip(), default=None, nargs=argparse.REMAINDER, ) return parser def _try_get_key(cfg, *keys, default=None): """ Try select keys from cfg until the first key that exists. Otherwise return default. """ if isinstance(cfg, CfgNode): cfg = OmegaConf.create(cfg.dump()) for k in keys: none = object() p = OmegaConf.select(cfg, k, default=none) if p is not none: return p return default def _highlight(code, filename): try: import pygments except ImportError: return code from pygments.lexers import Python3Lexer, YamlLexer from pygments.formatters import Terminal256Formatter lexer = Python3Lexer() if filename.endswith(".py") else YamlLexer() code = pygments.highlight(code, lexer, Terminal256Formatter(style="monokai")) return code def default_setup(cfg, args): """ Perform some basic common setups at the beginning of a job, including: 1. Set up the detectron2 logger 2. Log basic information about environment, cmdline arguments, and config 3. Backup the config to the output directory Args: cfg (CfgNode or omegaconf.DictConfig): the full config to be used args (argparse.NameSpace): the command line arguments to be logged """ output_dir = _try_get_key(cfg, "OUTPUT_DIR", "output_dir", "train.output_dir") if comm.is_main_process() and output_dir: PathManager.mkdirs(output_dir) rank = comm.get_rank() setup_logger(output_dir, distributed_rank=rank, name="fvcore") logger = setup_logger(output_dir, distributed_rank=rank) logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size())) logger.info("Environment info:\n" + collect_env_info()) logger.info("Command line arguments: " + str(args)) if hasattr(args, "config_file") and args.config_file != "": logger.info( "Contents of args.config_file={}:\n{}".format( args.config_file, _highlight(PathManager.open(args.config_file, "r").read(), args.config_file), ) ) if comm.is_main_process() and output_dir: # Note: some of our scripts may expect the existence of # config.yaml in output directory path = os.path.join(output_dir, "config.yaml") if isinstance(cfg, CfgNode): logger.info("Running with full config:\n{}".format(_highlight(cfg.dump(), ".yaml"))) with PathManager.open(path, "w") as f: f.write(cfg.dump()) else: LazyConfig.save(cfg, path) logger.info("Full config saved to {}".format(path)) # make sure each worker has a different, yet deterministic seed if specified seed = _try_get_key(cfg, "SEED", "train.seed", default=-1) seed_all_rng(None if seed < 0 else seed + rank) # cudnn benchmark has large overhead. It shouldn't be used considering the small size of # typical validation set. if not (hasattr(args, "eval_only") and args.eval_only): torch.backends.cudnn.benchmark = _try_get_key( cfg, "CUDNN_BENCHMARK", "train.cudnn_benchmark", default=False ) def default_writers(output_dir: str, max_iter: Optional[int] = None): """ Build a list of :class:`EventWriter` to be used. It now consists of a :class:`CommonMetricPrinter`, :class:`TensorboardXWriter` and :class:`JSONWriter`. Args: output_dir: directory to store JSON metrics and tensorboard events max_iter: the total number of iterations Returns: list[EventWriter]: a list of :class:`EventWriter` objects. """ PathManager.mkdirs(output_dir) return [ # It may not always print what you want to see, since it prints "common" metrics only. CommonMetricPrinter(max_iter), JSONWriter(os.path.join(output_dir, "metrics.json")), TensorboardXWriter(output_dir), ] class DefaultPredictor: """ Create a simple end-to-end predictor with the given config that runs on single device for a single input image. Compared to using the model directly, this class does the following additions: 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. 2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`. 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. 4. Take one input image and produce a single output, instead of a batch. This is meant for simple demo purposes, so it does the above steps automatically. This is not meant for benchmarks or running complicated inference logic. If you'd like to do anything more complicated, please refer to its source code as examples to build and use the model manually. Attributes: metadata (Metadata): the metadata of the underlying dataset, obtained from cfg.DATASETS.TEST. Examples: :: pred = DefaultPredictor(cfg) inputs = cv2.imread("input.jpg") outputs = pred(inputs) """ def __init__(self, cfg): self.cfg = cfg.clone() # cfg can be modified by model self.model = build_model(self.cfg) self.model.eval() if len(cfg.DATASETS.TEST): self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) checkpointer = DetectionCheckpointer(self.model) checkpointer.load(cfg.MODEL.WEIGHTS) self.aug = T.ResizeShortestEdge( [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST ) self.input_format = cfg.INPUT.FORMAT assert self.input_format in ["RGB", "BGR"], self.input_format def __call__(self, original_image): """ Args: original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). Returns: predictions (dict): the output of the model for one image only. See :doc:`/tutorials/models` for details about the format. """ with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 # Apply pre-processing to image. if self.input_format == "RGB": # whether the model expects BGR inputs or RGB original_image = original_image[:, :, ::-1] height, width = original_image.shape[:2] image = self.aug.get_transform(original_image).apply_image(original_image) image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) inputs = {"image": image, "height": height, "width": width} predictions = self.model([inputs])[0] return predictions class DefaultTrainer(TrainerBase): """ A trainer with default training logic. It does the following: 1. Create a :class:`SimpleTrainer` using model, optimizer, dataloader defined by the given config. Create a LR scheduler defined by the config. 2. Load the last checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when `resume_or_load` is called. 3. Register a few common hooks defined by the config. It is created to simplify the **standard model training workflow** and reduce code boilerplate for users who only need the standard training workflow, with standard features. It means this class makes *many assumptions* about your training logic that may easily become invalid in a new research. In fact, any assumptions beyond those made in the :class:`SimpleTrainer` are too much for research. The code of this class has been annotated about restrictive assumptions it makes. When they do not work for you, you're encouraged to: 1. Overwrite methods of this class, OR: 2. Use :class:`SimpleTrainer`, which only does minimal SGD training and nothing else. You can then add your own hooks if needed. OR: 3. Write your own training loop similar to `tools/plain_train_net.py`. See the :doc:`/tutorials/training` tutorials for more details. Note that the behavior of this class, like other functions/classes in this file, is not stable, since it is meant to represent the "common default behavior". It is only guaranteed to work well with the standard models and training workflow in detectron2. To obtain more stable behavior, write your own training logic with other public APIs. Examples: :: trainer = DefaultTrainer(cfg) trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS trainer.train() Attributes: scheduler: checkpointer (DetectionCheckpointer): cfg (CfgNode): """ def __init__(self, cfg): """ Args: cfg (CfgNode): """ super().__init__() logger = logging.getLogger("detectron2") if not logger.isEnabledFor(logging.INFO): # setup_logger is not called for d2 setup_logger() cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size()) # Assume these objects must be constructed in this order. model = self.build_model(cfg) optimizer = self.build_optimizer(cfg, model) data_loader = self.build_train_loader(cfg) model = create_ddp_model(model, broadcast_buffers=False) self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)( model, data_loader, optimizer ) self.scheduler = self.build_lr_scheduler(cfg, optimizer) self.checkpointer = DetectionCheckpointer( # Assume you want to save checkpoints together with logs/statistics model, cfg.OUTPUT_DIR, trainer=weakref.proxy(self), ) self.start_iter = 0 self.max_iter = cfg.SOLVER.MAX_ITER self.cfg = cfg self.register_hooks(self.build_hooks()) def resume_or_load(self, resume=True): """ If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by a `last_checkpoint` file), resume from the file. Resuming means loading all available states (eg. optimizer and scheduler) and update iteration counter from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used. Otherwise, this is considered as an independent training. The method will load model weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start from iteration 0. Args: resume (bool): whether to do resume or not """ self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume) if resume and self.checkpointer.has_checkpoint(): # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration self.start_iter = self.iter + 1 def build_hooks(self): """ Build a list of default hooks, including timing, evaluation, checkpointing, lr scheduling, precise BN, writing events. Returns: list[HookBase]: """ cfg = self.cfg.clone() cfg.defrost() cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN ret = [ hooks.IterationTimer(), hooks.LRScheduler(), hooks.PreciseBN( # Run at the same freq as (but before) evaluation. cfg.TEST.EVAL_PERIOD, self.model, # Build a new data loader to not affect training self.build_train_loader(cfg), cfg.TEST.PRECISE_BN.NUM_ITER, ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) else None, ] # Do PreciseBN before checkpointer, because it updates the model and need to # be saved by checkpointer. # This is not always the best: if checkpointing has a different frequency, # some checkpoints may have more precise statistics than others. if comm.is_main_process(): ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) def test_and_save_results(): self._last_eval_results = self.test(self.cfg, self.model) return self._last_eval_results # Do evaluation after checkpointer, because then if it fails, # we can use the saved checkpoint to debug. ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) if comm.is_main_process(): # Here the default print/log frequency of each writer is used. # run writers in the end, so that evaluation metrics are written ret.append(hooks.PeriodicWriter(self.build_writers(), period=20)) return ret def build_writers(self): """ Build a list of writers to be used using :func:`default_writers()`. If you'd like a different list of writers, you can overwrite it in your trainer. Returns: list[EventWriter]: a list of :class:`EventWriter` objects. """ return default_writers(self.cfg.OUTPUT_DIR, self.max_iter) def train(self): """ Run training. Returns: OrderedDict of results, if evaluation is enabled. Otherwise None. """ super().train(self.start_iter, self.max_iter) if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process(): assert hasattr( self, "_last_eval_results" ), "No evaluation results obtained during training!" verify_results(self.cfg, self._last_eval_results) return self._last_eval_results def run_step(self): self._trainer.iter = self.iter self._trainer.run_step() def state_dict(self): ret = super().state_dict() ret["_trainer"] = self._trainer.state_dict() return ret def load_state_dict(self, state_dict): super().load_state_dict(state_dict) self._trainer.load_state_dict(state_dict["_trainer"]) @classmethod def build_model(cls, cfg): """ Returns: torch.nn.Module: It now calls :func:`detectron2.modeling.build_model`. Overwrite it if you'd like a different model. """ model = build_model(cfg) logger = logging.getLogger(__name__) logger.info("Model:\n{}".format(model)) return model @classmethod def build_optimizer(cls, cfg, model): """ Returns: torch.optim.Optimizer: It now calls :func:`detectron2.solver.build_optimizer`. Overwrite it if you'd like a different optimizer. """ return build_optimizer(cfg, model) @classmethod def build_lr_scheduler(cls, cfg, optimizer): """ It now calls :func:`detectron2.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. """ return build_lr_scheduler(cfg, optimizer) @classmethod def build_train_loader(cls, cfg): """ Returns: iterable It now calls :func:`detectron2.data.build_detection_train_loader`. Overwrite it if you'd like a different data loader. """ return build_detection_train_loader(cfg) @classmethod def build_test_loader(cls, cfg, dataset_name): """ Returns: iterable It now calls :func:`detectron2.data.build_detection_test_loader`. Overwrite it if you'd like a different data loader. """ return build_detection_test_loader(cfg, dataset_name) @classmethod def build_evaluator(cls, cfg, dataset_name): """ Returns: DatasetEvaluator or None It is not implemented by default. """ raise NotImplementedError( """ If you want DefaultTrainer to automatically run evaluation, please implement `build_evaluator()` in subclasses (see train_net.py for example). Alternatively, you can call evaluation functions yourself (see Colab balloon tutorial for example). """ ) @classmethod def test(cls, cfg, model, evaluators=None): """ Evaluate the given model. The given model is expected to already contain weights to evaluate. Args: cfg (CfgNode): model (nn.Module): evaluators (list[DatasetEvaluator] or None): if None, will call :meth:`build_evaluator`. Otherwise, must have the same length as ``cfg.DATASETS.TEST``. Returns: dict: a dict of result metrics """ logger = logging.getLogger(__name__) if isinstance(evaluators, DatasetEvaluator): evaluators = [evaluators] if evaluators is not None: assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format( len(cfg.DATASETS.TEST), len(evaluators) ) results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TEST): data_loader = cls.build_test_loader(cfg, dataset_name) # When evaluators are passed in as arguments, # implicitly assume that evaluators can be created before data_loader. if evaluators is not None: evaluator = evaluators[idx] else: try: evaluator = cls.build_evaluator(cfg, dataset_name) except NotImplementedError: logger.warn( "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, " "or implement its `build_evaluator` method." ) results[dataset_name] = {} continue results_i = inference_on_dataset(model, data_loader, evaluator) results[dataset_name] = results_i if comm.is_main_process(): assert isinstance( results_i, dict ), "Evaluator must return a dict on the main process. Got {} instead.".format( results_i ) logger.info("Evaluation results for {} in csv format:".format(dataset_name)) print_csv_format(results_i) if len(results) == 1: results = list(results.values())[0] return results @staticmethod def auto_scale_workers(cfg, num_workers: int): """ When the config is defined for certain number of workers (according to ``cfg.SOLVER.REFERENCE_WORLD_SIZE``) that's different from the number of workers currently in use, returns a new cfg where the total batch size is scaled so that the per-GPU batch size stays the same as the original ``IMS_PER_BATCH // REFERENCE_WORLD_SIZE``. Other config options are also scaled accordingly: * training steps and warmup steps are scaled inverse proportionally. * learning rate are scaled proportionally, following :paper:`ImageNet in 1h`. For example, with the original config like the following: .. code-block:: yaml IMS_PER_BATCH: 16 BASE_LR: 0.1 REFERENCE_WORLD_SIZE: 8 MAX_ITER: 5000 STEPS: (4000,) CHECKPOINT_PERIOD: 1000 When this config is used on 16 GPUs instead of the reference number 8, calling this method will return a new config with: .. code-block:: yaml IMS_PER_BATCH: 32 BASE_LR: 0.2 REFERENCE_WORLD_SIZE: 16 MAX_ITER: 2500 STEPS: (2000,) CHECKPOINT_PERIOD: 500 Note that both the original config and this new config can be trained on 16 GPUs. It's up to user whether to enable this feature (by setting ``REFERENCE_WORLD_SIZE``). Returns: CfgNode: a new config. Same as original if ``cfg.SOLVER.REFERENCE_WORLD_SIZE==0``. """ old_world_size = cfg.SOLVER.REFERENCE_WORLD_SIZE if old_world_size == 0 or old_world_size == num_workers: return cfg cfg = cfg.clone() frozen = cfg.is_frozen() cfg.defrost() assert ( cfg.SOLVER.IMS_PER_BATCH % old_world_size == 0 ), "Invalid REFERENCE_WORLD_SIZE in config!" scale = num_workers / old_world_size bs = cfg.SOLVER.IMS_PER_BATCH = int(round(cfg.SOLVER.IMS_PER_BATCH * scale)) lr = cfg.SOLVER.BASE_LR = cfg.SOLVER.BASE_LR * scale max_iter = cfg.SOLVER.MAX_ITER = int(round(cfg.SOLVER.MAX_ITER / scale)) warmup_iter = cfg.SOLVER.WARMUP_ITERS = int(round(cfg.SOLVER.WARMUP_ITERS / scale)) cfg.SOLVER.STEPS = tuple(int(round(s / scale)) for s in cfg.SOLVER.STEPS) cfg.TEST.EVAL_PERIOD = int(round(cfg.TEST.EVAL_PERIOD / scale)) cfg.SOLVER.CHECKPOINT_PERIOD = int(round(cfg.SOLVER.CHECKPOINT_PERIOD / scale)) cfg.SOLVER.REFERENCE_WORLD_SIZE = num_workers # maintain invariant logger = logging.getLogger(__name__) logger.info( f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, " f"max_iter={max_iter}, warmup={warmup_iter}." ) if frozen: cfg.freeze() return cfg # Access basic attributes from the underlying trainer for _attr in ["model", "data_loader", "optimizer"]: setattr( DefaultTrainer, _attr, property( # getter lambda self, x=_attr: getattr(self._trainer, x), # setter lambda self, value, x=_attr: setattr(self._trainer, x, value), ), ) ================================================ FILE: annotator/oneformer/detectron2/engine/hooks.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import datetime import itertools import logging import math import operator import os import tempfile import time import warnings from collections import Counter import torch from fvcore.common.checkpoint import Checkpointer from fvcore.common.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer from fvcore.common.param_scheduler import ParamScheduler from fvcore.common.timer import Timer from fvcore.nn.precise_bn import get_bn_modules, update_bn_stats import annotator.oneformer.detectron2.utils.comm as comm from annotator.oneformer.detectron2.evaluation.testing import flatten_results_dict from annotator.oneformer.detectron2.solver import LRMultiplier from annotator.oneformer.detectron2.solver import LRScheduler as _LRScheduler from annotator.oneformer.detectron2.utils.events import EventStorage, EventWriter from annotator.oneformer.detectron2.utils.file_io import PathManager from .train_loop import HookBase __all__ = [ "CallbackHook", "IterationTimer", "PeriodicWriter", "PeriodicCheckpointer", "BestCheckpointer", "LRScheduler", "AutogradProfiler", "EvalHook", "PreciseBN", "TorchProfiler", "TorchMemoryStats", ] """ Implement some common hooks. """ class CallbackHook(HookBase): """ Create a hook using callback functions provided by the user. """ def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None): """ Each argument is a function that takes one argument: the trainer. """ self._before_train = before_train self._before_step = before_step self._after_step = after_step self._after_train = after_train def before_train(self): if self._before_train: self._before_train(self.trainer) def after_train(self): if self._after_train: self._after_train(self.trainer) # The functions may be closures that hold reference to the trainer # Therefore, delete them to avoid circular reference. del self._before_train, self._after_train del self._before_step, self._after_step def before_step(self): if self._before_step: self._before_step(self.trainer) def after_step(self): if self._after_step: self._after_step(self.trainer) class IterationTimer(HookBase): """ Track the time spent for each iteration (each run_step call in the trainer). Print a summary in the end of training. This hook uses the time between the call to its :meth:`before_step` and :meth:`after_step` methods. Under the convention that :meth:`before_step` of all hooks should only take negligible amount of time, the :class:`IterationTimer` hook should be placed at the beginning of the list of hooks to obtain accurate timing. """ def __init__(self, warmup_iter=3): """ Args: warmup_iter (int): the number of iterations at the beginning to exclude from timing. """ self._warmup_iter = warmup_iter self._step_timer = Timer() self._start_time = time.perf_counter() self._total_timer = Timer() def before_train(self): self._start_time = time.perf_counter() self._total_timer.reset() self._total_timer.pause() def after_train(self): logger = logging.getLogger(__name__) total_time = time.perf_counter() - self._start_time total_time_minus_hooks = self._total_timer.seconds() hook_time = total_time - total_time_minus_hooks num_iter = self.trainer.storage.iter + 1 - self.trainer.start_iter - self._warmup_iter if num_iter > 0 and total_time_minus_hooks > 0: # Speed is meaningful only after warmup # NOTE this format is parsed by grep in some scripts logger.info( "Overall training speed: {} iterations in {} ({:.4f} s / it)".format( num_iter, str(datetime.timedelta(seconds=int(total_time_minus_hooks))), total_time_minus_hooks / num_iter, ) ) logger.info( "Total training time: {} ({} on hooks)".format( str(datetime.timedelta(seconds=int(total_time))), str(datetime.timedelta(seconds=int(hook_time))), ) ) def before_step(self): self._step_timer.reset() self._total_timer.resume() def after_step(self): # +1 because we're in after_step, the current step is done # but not yet counted iter_done = self.trainer.storage.iter - self.trainer.start_iter + 1 if iter_done >= self._warmup_iter: sec = self._step_timer.seconds() self.trainer.storage.put_scalars(time=sec) else: self._start_time = time.perf_counter() self._total_timer.reset() self._total_timer.pause() class PeriodicWriter(HookBase): """ Write events to EventStorage (by calling ``writer.write()``) periodically. It is executed every ``period`` iterations and after the last iteration. Note that ``period`` does not affect how data is smoothed by each writer. """ def __init__(self, writers, period=20): """ Args: writers (list[EventWriter]): a list of EventWriter objects period (int): """ self._writers = writers for w in writers: assert isinstance(w, EventWriter), w self._period = period def after_step(self): if (self.trainer.iter + 1) % self._period == 0 or ( self.trainer.iter == self.trainer.max_iter - 1 ): for writer in self._writers: writer.write() def after_train(self): for writer in self._writers: # If any new data is found (e.g. produced by other after_train), # write them before closing writer.write() writer.close() class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase): """ Same as :class:`detectron2.checkpoint.PeriodicCheckpointer`, but as a hook. Note that when used as a hook, it is unable to save additional data other than what's defined by the given `checkpointer`. It is executed every ``period`` iterations and after the last iteration. """ def before_train(self): self.max_iter = self.trainer.max_iter def after_step(self): # No way to use **kwargs self.step(self.trainer.iter) class BestCheckpointer(HookBase): """ Checkpoints best weights based off given metric. This hook should be used in conjunction to and executed after the hook that produces the metric, e.g. `EvalHook`. """ def __init__( self, eval_period: int, checkpointer: Checkpointer, val_metric: str, mode: str = "max", file_prefix: str = "model_best", ) -> None: """ Args: eval_period (int): the period `EvalHook` is set to run. checkpointer: the checkpointer object used to save checkpoints. val_metric (str): validation metric to track for best checkpoint, e.g. "bbox/AP50" mode (str): one of {'max', 'min'}. controls whether the chosen val metric should be maximized or minimized, e.g. for "bbox/AP50" it should be "max" file_prefix (str): the prefix of checkpoint's filename, defaults to "model_best" """ self._logger = logging.getLogger(__name__) self._period = eval_period self._val_metric = val_metric assert mode in [ "max", "min", ], f'Mode "{mode}" to `BestCheckpointer` is unknown. It should be one of {"max", "min"}.' if mode == "max": self._compare = operator.gt else: self._compare = operator.lt self._checkpointer = checkpointer self._file_prefix = file_prefix self.best_metric = None self.best_iter = None def _update_best(self, val, iteration): if math.isnan(val) or math.isinf(val): return False self.best_metric = val self.best_iter = iteration return True def _best_checking(self): metric_tuple = self.trainer.storage.latest().get(self._val_metric) if metric_tuple is None: self._logger.warning( f"Given val metric {self._val_metric} does not seem to be computed/stored." "Will not be checkpointing based on it." ) return else: latest_metric, metric_iter = metric_tuple if self.best_metric is None: if self._update_best(latest_metric, metric_iter): additional_state = {"iteration": metric_iter} self._checkpointer.save(f"{self._file_prefix}", **additional_state) self._logger.info( f"Saved first model at {self.best_metric:0.5f} @ {self.best_iter} steps" ) elif self._compare(latest_metric, self.best_metric): additional_state = {"iteration": metric_iter} self._checkpointer.save(f"{self._file_prefix}", **additional_state) self._logger.info( f"Saved best model as latest eval score for {self._val_metric} is " f"{latest_metric:0.5f}, better than last best score " f"{self.best_metric:0.5f} @ iteration {self.best_iter}." ) self._update_best(latest_metric, metric_iter) else: self._logger.info( f"Not saving as latest eval score for {self._val_metric} is {latest_metric:0.5f}, " f"not better than best score {self.best_metric:0.5f} @ iteration {self.best_iter}." ) def after_step(self): # same conditions as `EvalHook` next_iter = self.trainer.iter + 1 if ( self._period > 0 and next_iter % self._period == 0 and next_iter != self.trainer.max_iter ): self._best_checking() def after_train(self): # same conditions as `EvalHook` if self.trainer.iter + 1 >= self.trainer.max_iter: self._best_checking() class LRScheduler(HookBase): """ A hook which executes a torch builtin LR scheduler and summarizes the LR. It is executed after every iteration. """ def __init__(self, optimizer=None, scheduler=None): """ Args: optimizer (torch.optim.Optimizer): scheduler (torch.optim.LRScheduler or fvcore.common.param_scheduler.ParamScheduler): if a :class:`ParamScheduler` object, it defines the multiplier over the base LR in the optimizer. If any argument is not given, will try to obtain it from the trainer. """ self._optimizer = optimizer self._scheduler = scheduler def before_train(self): self._optimizer = self._optimizer or self.trainer.optimizer if isinstance(self.scheduler, ParamScheduler): self._scheduler = LRMultiplier( self._optimizer, self.scheduler, self.trainer.max_iter, last_iter=self.trainer.iter - 1, ) self._best_param_group_id = LRScheduler.get_best_param_group_id(self._optimizer) @staticmethod def get_best_param_group_id(optimizer): # NOTE: some heuristics on what LR to summarize # summarize the param group with most parameters largest_group = max(len(g["params"]) for g in optimizer.param_groups) if largest_group == 1: # If all groups have one parameter, # then find the most common initial LR, and use it for summary lr_count = Counter([g["lr"] for g in optimizer.param_groups]) lr = lr_count.most_common()[0][0] for i, g in enumerate(optimizer.param_groups): if g["lr"] == lr: return i else: for i, g in enumerate(optimizer.param_groups): if len(g["params"]) == largest_group: return i def after_step(self): lr = self._optimizer.param_groups[self._best_param_group_id]["lr"] self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False) self.scheduler.step() @property def scheduler(self): return self._scheduler or self.trainer.scheduler def state_dict(self): if isinstance(self.scheduler, _LRScheduler): return self.scheduler.state_dict() return {} def load_state_dict(self, state_dict): if isinstance(self.scheduler, _LRScheduler): logger = logging.getLogger(__name__) logger.info("Loading scheduler from state_dict ...") self.scheduler.load_state_dict(state_dict) class TorchProfiler(HookBase): """ A hook which runs `torch.profiler.profile`. Examples: :: hooks.TorchProfiler( lambda trainer: 10 < trainer.iter < 20, self.cfg.OUTPUT_DIR ) The above example will run the profiler for iteration 10~20 and dump results to ``OUTPUT_DIR``. We did not profile the first few iterations because they are typically slower than the rest. The result files can be loaded in the ``chrome://tracing`` page in chrome browser, and the tensorboard visualizations can be visualized using ``tensorboard --logdir OUTPUT_DIR/log`` """ def __init__(self, enable_predicate, output_dir, *, activities=None, save_tensorboard=True): """ Args: enable_predicate (callable[trainer -> bool]): a function which takes a trainer, and returns whether to enable the profiler. It will be called once every step, and can be used to select which steps to profile. output_dir (str): the output directory to dump tracing files. activities (iterable): same as in `torch.profiler.profile`. save_tensorboard (bool): whether to save tensorboard visualizations at (output_dir)/log/ """ self._enable_predicate = enable_predicate self._activities = activities self._output_dir = output_dir self._save_tensorboard = save_tensorboard def before_step(self): if self._enable_predicate(self.trainer): if self._save_tensorboard: on_trace_ready = torch.profiler.tensorboard_trace_handler( os.path.join( self._output_dir, "log", "profiler-tensorboard-iter{}".format(self.trainer.iter), ), f"worker{comm.get_rank()}", ) else: on_trace_ready = None self._profiler = torch.profiler.profile( activities=self._activities, on_trace_ready=on_trace_ready, record_shapes=True, profile_memory=True, with_stack=True, with_flops=True, ) self._profiler.__enter__() else: self._profiler = None def after_step(self): if self._profiler is None: return self._profiler.__exit__(None, None, None) if not self._save_tensorboard: PathManager.mkdirs(self._output_dir) out_file = os.path.join( self._output_dir, "profiler-trace-iter{}.json".format(self.trainer.iter) ) if "://" not in out_file: self._profiler.export_chrome_trace(out_file) else: # Support non-posix filesystems with tempfile.TemporaryDirectory(prefix="detectron2_profiler") as d: tmp_file = os.path.join(d, "tmp.json") self._profiler.export_chrome_trace(tmp_file) with open(tmp_file) as f: content = f.read() with PathManager.open(out_file, "w") as f: f.write(content) class AutogradProfiler(TorchProfiler): """ A hook which runs `torch.autograd.profiler.profile`. Examples: :: hooks.AutogradProfiler( lambda trainer: 10 < trainer.iter < 20, self.cfg.OUTPUT_DIR ) The above example will run the profiler for iteration 10~20 and dump results to ``OUTPUT_DIR``. We did not profile the first few iterations because they are typically slower than the rest. The result files can be loaded in the ``chrome://tracing`` page in chrome browser. Note: When used together with NCCL on older version of GPUs, autograd profiler may cause deadlock because it unnecessarily allocates memory on every device it sees. The memory management calls, if interleaved with NCCL calls, lead to deadlock on GPUs that do not support ``cudaLaunchCooperativeKernelMultiDevice``. """ def __init__(self, enable_predicate, output_dir, *, use_cuda=True): """ Args: enable_predicate (callable[trainer -> bool]): a function which takes a trainer, and returns whether to enable the profiler. It will be called once every step, and can be used to select which steps to profile. output_dir (str): the output directory to dump tracing files. use_cuda (bool): same as in `torch.autograd.profiler.profile`. """ warnings.warn("AutogradProfiler has been deprecated in favor of TorchProfiler.") self._enable_predicate = enable_predicate self._use_cuda = use_cuda self._output_dir = output_dir def before_step(self): if self._enable_predicate(self.trainer): self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda) self._profiler.__enter__() else: self._profiler = None class EvalHook(HookBase): """ Run an evaluation function periodically, and at the end of training. It is executed every ``eval_period`` iterations and after the last iteration. """ def __init__(self, eval_period, eval_function, eval_after_train=True): """ Args: eval_period (int): the period to run `eval_function`. Set to 0 to not evaluate periodically (but still evaluate after the last iteration if `eval_after_train` is True). eval_function (callable): a function which takes no arguments, and returns a nested dict of evaluation metrics. eval_after_train (bool): whether to evaluate after the last iteration Note: This hook must be enabled in all or none workers. If you would like only certain workers to perform evaluation, give other workers a no-op function (`eval_function=lambda: None`). """ self._period = eval_period self._func = eval_function self._eval_after_train = eval_after_train def _do_eval(self): results = self._func() if results: assert isinstance( results, dict ), "Eval function must return a dict. Got {} instead.".format(results) flattened_results = flatten_results_dict(results) for k, v in flattened_results.items(): try: v = float(v) except Exception as e: raise ValueError( "[EvalHook] eval_function should return a nested dict of float. " "Got '{}: {}' instead.".format(k, v) ) from e self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False) # Evaluation may take different time among workers. # A barrier make them start the next iteration together. comm.synchronize() def after_step(self): next_iter = self.trainer.iter + 1 if self._period > 0 and next_iter % self._period == 0: # do the last eval in after_train if next_iter != self.trainer.max_iter: self._do_eval() def after_train(self): # This condition is to prevent the eval from running after a failed training if self._eval_after_train and self.trainer.iter + 1 >= self.trainer.max_iter: self._do_eval() # func is likely a closure that holds reference to the trainer # therefore we clean it to avoid circular reference in the end del self._func class PreciseBN(HookBase): """ The standard implementation of BatchNorm uses EMA in inference, which is sometimes suboptimal. This class computes the true average of statistics rather than the moving average, and put true averages to every BN layer in the given model. It is executed every ``period`` iterations and after the last iteration. """ def __init__(self, period, model, data_loader, num_iter): """ Args: period (int): the period this hook is run, or 0 to not run during training. The hook will always run in the end of training. model (nn.Module): a module whose all BN layers in training mode will be updated by precise BN. Note that user is responsible for ensuring the BN layers to be updated are in training mode when this hook is triggered. data_loader (iterable): it will produce data to be run by `model(data)`. num_iter (int): number of iterations used to compute the precise statistics. """ self._logger = logging.getLogger(__name__) if len(get_bn_modules(model)) == 0: self._logger.info( "PreciseBN is disabled because model does not contain BN layers in training mode." ) self._disabled = True return self._model = model self._data_loader = data_loader self._num_iter = num_iter self._period = period self._disabled = False self._data_iter = None def after_step(self): next_iter = self.trainer.iter + 1 is_final = next_iter == self.trainer.max_iter if is_final or (self._period > 0 and next_iter % self._period == 0): self.update_stats() def update_stats(self): """ Update the model with precise statistics. Users can manually call this method. """ if self._disabled: return if self._data_iter is None: self._data_iter = iter(self._data_loader) def data_loader(): for num_iter in itertools.count(1): if num_iter % 100 == 0: self._logger.info( "Running precise-BN ... {}/{} iterations.".format(num_iter, self._num_iter) ) # This way we can reuse the same iterator yield next(self._data_iter) with EventStorage(): # capture events in a new storage to discard them self._logger.info( "Running precise-BN for {} iterations... ".format(self._num_iter) + "Note that this could produce different statistics every time." ) update_bn_stats(self._model, data_loader(), self._num_iter) class TorchMemoryStats(HookBase): """ Writes pytorch's cuda memory statistics periodically. """ def __init__(self, period=20, max_runs=10): """ Args: period (int): Output stats each 'period' iterations max_runs (int): Stop the logging after 'max_runs' """ self._logger = logging.getLogger(__name__) self._period = period self._max_runs = max_runs self._runs = 0 def after_step(self): if self._runs > self._max_runs: return if (self.trainer.iter + 1) % self._period == 0 or ( self.trainer.iter == self.trainer.max_iter - 1 ): if torch.cuda.is_available(): max_reserved_mb = torch.cuda.max_memory_reserved() / 1024.0 / 1024.0 reserved_mb = torch.cuda.memory_reserved() / 1024.0 / 1024.0 max_allocated_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0 allocated_mb = torch.cuda.memory_allocated() / 1024.0 / 1024.0 self._logger.info( ( " iter: {} " " max_reserved_mem: {:.0f}MB " " reserved_mem: {:.0f}MB " " max_allocated_mem: {:.0f}MB " " allocated_mem: {:.0f}MB " ).format( self.trainer.iter, max_reserved_mb, reserved_mb, max_allocated_mb, allocated_mb, ) ) self._runs += 1 if self._runs == self._max_runs: mem_summary = torch.cuda.memory_summary() self._logger.info("\n" + mem_summary) torch.cuda.reset_peak_memory_stats() ================================================ FILE: annotator/oneformer/detectron2/engine/launch.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging from datetime import timedelta import torch import torch.distributed as dist import torch.multiprocessing as mp from annotator.oneformer.detectron2.utils import comm __all__ = ["DEFAULT_TIMEOUT", "launch"] DEFAULT_TIMEOUT = timedelta(minutes=30) def _find_free_port(): import socket sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) # Binding to port 0 will cause the OS to find an available port for us sock.bind(("", 0)) port = sock.getsockname()[1] sock.close() # NOTE: there is still a chance the port could be taken by other processes. return port def launch( main_func, # Should be num_processes_per_machine, but kept for compatibility. num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=(), timeout=DEFAULT_TIMEOUT, ): """ Launch multi-process or distributed training. This function must be called on all machines involved in the training. It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine. Args: main_func: a function that will be called by `main_func(*args)` num_gpus_per_machine (int): number of processes per machine. When using GPUs, this should be the number of GPUs. num_machines (int): the total number of machines machine_rank (int): the rank of this machine dist_url (str): url to connect to for distributed jobs, including protocol e.g. "tcp://127.0.0.1:8686". Can be set to "auto" to automatically select a free port on localhost timeout (timedelta): timeout of the distributed workers args (tuple): arguments passed to main_func """ world_size = num_machines * num_gpus_per_machine if world_size > 1: # https://github.com/pytorch/pytorch/pull/14391 # TODO prctl in spawned processes if dist_url == "auto": assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs." port = _find_free_port() dist_url = f"tcp://127.0.0.1:{port}" if num_machines > 1 and dist_url.startswith("file://"): logger = logging.getLogger(__name__) logger.warning( "file:// is not a reliable init_method in multi-machine jobs. Prefer tcp://" ) mp.start_processes( _distributed_worker, nprocs=num_gpus_per_machine, args=( main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args, timeout, ), daemon=False, ) else: main_func(*args) def _distributed_worker( local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args, timeout=DEFAULT_TIMEOUT, ): has_gpu = torch.cuda.is_available() if has_gpu: assert num_gpus_per_machine <= torch.cuda.device_count() global_rank = machine_rank * num_gpus_per_machine + local_rank try: dist.init_process_group( backend="NCCL" if has_gpu else "GLOO", init_method=dist_url, world_size=world_size, rank=global_rank, timeout=timeout, ) except Exception as e: logger = logging.getLogger(__name__) logger.error("Process group URL: {}".format(dist_url)) raise e # Setup the local process group. comm.create_local_process_group(num_gpus_per_machine) if has_gpu: torch.cuda.set_device(local_rank) # synchronize is needed here to prevent a possible timeout after calling init_process_group # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 comm.synchronize() main_func(*args) ================================================ FILE: annotator/oneformer/detectron2/engine/train_loop.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np import time import weakref from typing import List, Mapping, Optional import torch from torch.nn.parallel import DataParallel, DistributedDataParallel import annotator.oneformer.detectron2.utils.comm as comm from annotator.oneformer.detectron2.utils.events import EventStorage, get_event_storage from annotator.oneformer.detectron2.utils.logger import _log_api_usage __all__ = ["HookBase", "TrainerBase", "SimpleTrainer", "AMPTrainer"] class HookBase: """ Base class for hooks that can be registered with :class:`TrainerBase`. Each hook can implement 4 methods. The way they are called is demonstrated in the following snippet: :: hook.before_train() for iter in range(start_iter, max_iter): hook.before_step() trainer.run_step() hook.after_step() iter += 1 hook.after_train() Notes: 1. In the hook method, users can access ``self.trainer`` to access more properties about the context (e.g., model, current iteration, or config if using :class:`DefaultTrainer`). 2. A hook that does something in :meth:`before_step` can often be implemented equivalently in :meth:`after_step`. If the hook takes non-trivial time, it is strongly recommended to implement the hook in :meth:`after_step` instead of :meth:`before_step`. The convention is that :meth:`before_step` should only take negligible time. Following this convention will allow hooks that do care about the difference between :meth:`before_step` and :meth:`after_step` (e.g., timer) to function properly. """ trainer: "TrainerBase" = None """ A weak reference to the trainer object. Set by the trainer when the hook is registered. """ def before_train(self): """ Called before the first iteration. """ pass def after_train(self): """ Called after the last iteration. """ pass def before_step(self): """ Called before each iteration. """ pass def after_backward(self): """ Called after the backward pass of each iteration. """ pass def after_step(self): """ Called after each iteration. """ pass def state_dict(self): """ Hooks are stateless by default, but can be made checkpointable by implementing `state_dict` and `load_state_dict`. """ return {} class TrainerBase: """ Base class for iterative trainer with hooks. The only assumption we made here is: the training runs in a loop. A subclass can implement what the loop is. We made no assumptions about the existence of dataloader, optimizer, model, etc. Attributes: iter(int): the current iteration. start_iter(int): The iteration to start with. By convention the minimum possible value is 0. max_iter(int): The iteration to end training. storage(EventStorage): An EventStorage that's opened during the course of training. """ def __init__(self) -> None: self._hooks: List[HookBase] = [] self.iter: int = 0 self.start_iter: int = 0 self.max_iter: int self.storage: EventStorage _log_api_usage("trainer." + self.__class__.__name__) def register_hooks(self, hooks: List[Optional[HookBase]]) -> None: """ Register hooks to the trainer. The hooks are executed in the order they are registered. Args: hooks (list[Optional[HookBase]]): list of hooks """ hooks = [h for h in hooks if h is not None] for h in hooks: assert isinstance(h, HookBase) # To avoid circular reference, hooks and trainer cannot own each other. # This normally does not matter, but will cause memory leak if the # involved objects contain __del__: # See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/ h.trainer = weakref.proxy(self) self._hooks.extend(hooks) def train(self, start_iter: int, max_iter: int): """ Args: start_iter, max_iter (int): See docs above """ logger = logging.getLogger(__name__) logger.info("Starting training from iteration {}".format(start_iter)) self.iter = self.start_iter = start_iter self.max_iter = max_iter with EventStorage(start_iter) as self.storage: try: self.before_train() for self.iter in range(start_iter, max_iter): self.before_step() self.run_step() self.after_step() # self.iter == max_iter can be used by `after_train` to # tell whether the training successfully finished or failed # due to exceptions. self.iter += 1 except Exception: logger.exception("Exception during training:") raise finally: self.after_train() def before_train(self): for h in self._hooks: h.before_train() def after_train(self): self.storage.iter = self.iter for h in self._hooks: h.after_train() def before_step(self): # Maintain the invariant that storage.iter == trainer.iter # for the entire execution of each step self.storage.iter = self.iter for h in self._hooks: h.before_step() def after_backward(self): for h in self._hooks: h.after_backward() def after_step(self): for h in self._hooks: h.after_step() def run_step(self): raise NotImplementedError def state_dict(self): ret = {"iteration": self.iter} hooks_state = {} for h in self._hooks: sd = h.state_dict() if sd: name = type(h).__qualname__ if name in hooks_state: # TODO handle repetitive stateful hooks continue hooks_state[name] = sd if hooks_state: ret["hooks"] = hooks_state return ret def load_state_dict(self, state_dict): logger = logging.getLogger(__name__) self.iter = state_dict["iteration"] for key, value in state_dict.get("hooks", {}).items(): for h in self._hooks: try: name = type(h).__qualname__ except AttributeError: continue if name == key: h.load_state_dict(value) break else: logger.warning(f"Cannot find the hook '{key}', its state_dict is ignored.") class SimpleTrainer(TrainerBase): """ A simple trainer for the most common type of task: single-cost single-optimizer single-data-source iterative optimization, optionally using data-parallelism. It assumes that every step, you: 1. Compute the loss with a data from the data_loader. 2. Compute the gradients with the above loss. 3. Update the model with the optimizer. All other tasks during training (checkpointing, logging, evaluation, LR schedule) are maintained by hooks, which can be registered by :meth:`TrainerBase.register_hooks`. If you want to do anything fancier than this, either subclass TrainerBase and implement your own `run_step`, or write your own training loop. """ def __init__(self, model, data_loader, optimizer, gather_metric_period=1): """ Args: model: a torch Module. Takes a data from data_loader and returns a dict of losses. data_loader: an iterable. Contains data to be used to call model. optimizer: a torch optimizer. gather_metric_period: an int. Every gather_metric_period iterations the metrics are gathered from all the ranks to rank 0 and logged. """ super().__init__() """ We set the model to training mode in the trainer. However it's valid to train a model that's in eval mode. If you want your model (or a submodule of it) to behave like evaluation during training, you can overwrite its train() method. """ model.train() self.model = model self.data_loader = data_loader # to access the data loader iterator, call `self._data_loader_iter` self._data_loader_iter_obj = None self.optimizer = optimizer self.gather_metric_period = gather_metric_period def run_step(self): """ Implement the standard training logic described above. """ assert self.model.training, "[SimpleTrainer] model was changed to eval mode!" start = time.perf_counter() """ If you want to do something with the data, you can wrap the dataloader. """ data = next(self._data_loader_iter) data_time = time.perf_counter() - start """ If you want to do something with the losses, you can wrap the model. """ loss_dict = self.model(data) if isinstance(loss_dict, torch.Tensor): losses = loss_dict loss_dict = {"total_loss": loss_dict} else: losses = sum(loss_dict.values()) """ If you need to accumulate gradients or do something similar, you can wrap the optimizer with your custom `zero_grad()` method. """ self.optimizer.zero_grad() losses.backward() self.after_backward() self._write_metrics(loss_dict, data_time) """ If you need gradient clipping/scaling or other processing, you can wrap the optimizer with your custom `step()` method. But it is suboptimal as explained in https://arxiv.org/abs/2006.15704 Sec 3.2.4 """ self.optimizer.step() @property def _data_loader_iter(self): # only create the data loader iterator when it is used if self._data_loader_iter_obj is None: self._data_loader_iter_obj = iter(self.data_loader) return self._data_loader_iter_obj def reset_data_loader(self, data_loader_builder): """ Delete and replace the current data loader with a new one, which will be created by calling `data_loader_builder` (without argument). """ del self.data_loader data_loader = data_loader_builder() self.data_loader = data_loader self._data_loader_iter_obj = None def _write_metrics( self, loss_dict: Mapping[str, torch.Tensor], data_time: float, prefix: str = "", ) -> None: if (self.iter + 1) % self.gather_metric_period == 0: SimpleTrainer.write_metrics(loss_dict, data_time, prefix) @staticmethod def write_metrics( loss_dict: Mapping[str, torch.Tensor], data_time: float, prefix: str = "", ) -> None: """ Args: loss_dict (dict): dict of scalar losses data_time (float): time taken by the dataloader iteration prefix (str): prefix for logging keys """ metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()} metrics_dict["data_time"] = data_time # Gather metrics among all workers for logging # This assumes we do DDP-style training, which is currently the only # supported method in detectron2. all_metrics_dict = comm.gather(metrics_dict) if comm.is_main_process(): storage = get_event_storage() # data_time among workers can have high variance. The actual latency # caused by data_time is the maximum among workers. data_time = np.max([x.pop("data_time") for x in all_metrics_dict]) storage.put_scalar("data_time", data_time) # average the rest metrics metrics_dict = { k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys() } total_losses_reduced = sum(metrics_dict.values()) if not np.isfinite(total_losses_reduced): raise FloatingPointError( f"Loss became infinite or NaN at iteration={storage.iter}!\n" f"loss_dict = {metrics_dict}" ) storage.put_scalar("{}total_loss".format(prefix), total_losses_reduced) if len(metrics_dict) > 1: storage.put_scalars(**metrics_dict) def state_dict(self): ret = super().state_dict() ret["optimizer"] = self.optimizer.state_dict() return ret def load_state_dict(self, state_dict): super().load_state_dict(state_dict) self.optimizer.load_state_dict(state_dict["optimizer"]) class AMPTrainer(SimpleTrainer): """ Like :class:`SimpleTrainer`, but uses PyTorch's native automatic mixed precision in the training loop. """ def __init__( self, model, data_loader, optimizer, gather_metric_period=1, grad_scaler=None, precision: torch.dtype = torch.float16, log_grad_scaler: bool = False, ): """ Args: model, data_loader, optimizer, gather_metric_period: same as in :class:`SimpleTrainer`. grad_scaler: torch GradScaler to automatically scale gradients. precision: torch.dtype as the target precision to cast to in computations """ unsupported = "AMPTrainer does not support single-process multi-device training!" if isinstance(model, DistributedDataParallel): assert not (model.device_ids and len(model.device_ids) > 1), unsupported assert not isinstance(model, DataParallel), unsupported super().__init__(model, data_loader, optimizer, gather_metric_period) if grad_scaler is None: from torch.cuda.amp import GradScaler grad_scaler = GradScaler() self.grad_scaler = grad_scaler self.precision = precision self.log_grad_scaler = log_grad_scaler def run_step(self): """ Implement the AMP training logic. """ assert self.model.training, "[AMPTrainer] model was changed to eval mode!" assert torch.cuda.is_available(), "[AMPTrainer] CUDA is required for AMP training!" from torch.cuda.amp import autocast start = time.perf_counter() data = next(self._data_loader_iter) data_time = time.perf_counter() - start with autocast(dtype=self.precision): loss_dict = self.model(data) if isinstance(loss_dict, torch.Tensor): losses = loss_dict loss_dict = {"total_loss": loss_dict} else: losses = sum(loss_dict.values()) self.optimizer.zero_grad() self.grad_scaler.scale(losses).backward() if self.log_grad_scaler: storage = get_event_storage() storage.put_scalar("[metric]grad_scaler", self.grad_scaler.get_scale()) self.after_backward() self._write_metrics(loss_dict, data_time) self.grad_scaler.step(self.optimizer) self.grad_scaler.update() def state_dict(self): ret = super().state_dict() ret["grad_scaler"] = self.grad_scaler.state_dict() return ret def load_state_dict(self, state_dict): super().load_state_dict(state_dict) self.grad_scaler.load_state_dict(state_dict["grad_scaler"]) ================================================ FILE: annotator/oneformer/detectron2/evaluation/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator from .coco_evaluation import COCOEvaluator from .rotated_coco_evaluation import RotatedCOCOEvaluator from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset from .lvis_evaluation import LVISEvaluator from .panoptic_evaluation import COCOPanopticEvaluator from .pascal_voc_evaluation import PascalVOCDetectionEvaluator from .sem_seg_evaluation import SemSegEvaluator from .testing import print_csv_format, verify_results __all__ = [k for k in globals().keys() if not k.startswith("_")] ================================================ FILE: annotator/oneformer/detectron2/evaluation/cityscapes_evaluation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import glob import logging import numpy as np import os import tempfile from collections import OrderedDict import torch from PIL import Image from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.utils import comm from annotator.oneformer.detectron2.utils.file_io import PathManager from .evaluator import DatasetEvaluator class CityscapesEvaluator(DatasetEvaluator): """ Base class for evaluation using cityscapes API. """ def __init__(self, dataset_name): """ Args: dataset_name (str): the name of the dataset. It must have the following metadata associated with it: "thing_classes", "gt_dir". """ self._metadata = MetadataCatalog.get(dataset_name) self._cpu_device = torch.device("cpu") self._logger = logging.getLogger(__name__) def reset(self): self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_") self._temp_dir = self._working_dir.name # All workers will write to the same results directory # TODO this does not work in distributed training assert ( comm.get_local_size() == comm.get_world_size() ), "CityscapesEvaluator currently do not work with multiple machines." self._temp_dir = comm.all_gather(self._temp_dir)[0] if self._temp_dir != self._working_dir.name: self._working_dir.cleanup() self._logger.info( "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir) ) class CityscapesInstanceEvaluator(CityscapesEvaluator): """ Evaluate instance segmentation results on cityscapes dataset using cityscapes API. Note: * It does not work in multi-machine distributed training. * It contains a synchronization, therefore has to be used on all ranks. * Only the main process runs evaluation. """ def process(self, inputs, outputs): from cityscapesscripts.helpers.labels import name2label for input, output in zip(inputs, outputs): file_name = input["file_name"] basename = os.path.splitext(os.path.basename(file_name))[0] pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt") if "instances" in output: output = output["instances"].to(self._cpu_device) num_instances = len(output) with open(pred_txt, "w") as fout: for i in range(num_instances): pred_class = output.pred_classes[i] classes = self._metadata.thing_classes[pred_class] class_id = name2label[classes].id score = output.scores[i] mask = output.pred_masks[i].numpy().astype("uint8") png_filename = os.path.join( self._temp_dir, basename + "_{}_{}.png".format(i, classes) ) Image.fromarray(mask * 255).save(png_filename) fout.write( "{} {} {}\n".format(os.path.basename(png_filename), class_id, score) ) else: # Cityscapes requires a prediction file for every ground truth image. with open(pred_txt, "w") as fout: pass def evaluate(self): """ Returns: dict: has a key "segm", whose value is a dict of "AP" and "AP50". """ comm.synchronize() if comm.get_rank() > 0: return import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) # set some global states in cityscapes evaluation API, before evaluating cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) cityscapes_eval.args.predictionWalk = None cityscapes_eval.args.JSONOutput = False cityscapes_eval.args.colorized = False cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json") # These lines are adopted from # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa gt_dir = PathManager.get_local_path(self._metadata.gt_dir) groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png")) assert len( groundTruthImgList ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( cityscapes_eval.args.groundTruthSearch ) predictionImgList = [] for gt in groundTruthImgList: predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args)) results = cityscapes_eval.evaluateImgLists( predictionImgList, groundTruthImgList, cityscapes_eval.args )["averages"] ret = OrderedDict() ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100} self._working_dir.cleanup() return ret class CityscapesSemSegEvaluator(CityscapesEvaluator): """ Evaluate semantic segmentation results on cityscapes dataset using cityscapes API. Note: * It does not work in multi-machine distributed training. * It contains a synchronization, therefore has to be used on all ranks. * Only the main process runs evaluation. """ def process(self, inputs, outputs): from cityscapesscripts.helpers.labels import trainId2label for input, output in zip(inputs, outputs): file_name = input["file_name"] basename = os.path.splitext(os.path.basename(file_name))[0] pred_filename = os.path.join(self._temp_dir, basename + "_pred.png") output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy() pred = 255 * np.ones(output.shape, dtype=np.uint8) for train_id, label in trainId2label.items(): if label.ignoreInEval: continue pred[output == train_id] = label.id Image.fromarray(pred).save(pred_filename) def evaluate(self): comm.synchronize() if comm.get_rank() > 0: return # Load the Cityscapes eval script *after* setting the required env var, # since the script reads CITYSCAPES_DATASET into global variables at load time. import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) # set some global states in cityscapes evaluation API, before evaluating cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) cityscapes_eval.args.predictionWalk = None cityscapes_eval.args.JSONOutput = False cityscapes_eval.args.colorized = False # These lines are adopted from # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa gt_dir = PathManager.get_local_path(self._metadata.gt_dir) groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png")) assert len( groundTruthImgList ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( cityscapes_eval.args.groundTruthSearch ) predictionImgList = [] for gt in groundTruthImgList: predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt)) results = cityscapes_eval.evaluateImgLists( predictionImgList, groundTruthImgList, cityscapes_eval.args ) ret = OrderedDict() ret["sem_seg"] = { "IoU": 100.0 * results["averageScoreClasses"], "iIoU": 100.0 * results["averageScoreInstClasses"], "IoU_sup": 100.0 * results["averageScoreCategories"], "iIoU_sup": 100.0 * results["averageScoreInstCategories"], } self._working_dir.cleanup() return ret ================================================ FILE: annotator/oneformer/detectron2/evaluation/coco_evaluation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import contextlib import copy import io import itertools import json import logging import numpy as np import os import pickle from collections import OrderedDict import annotator.oneformer.pycocotools.mask as mask_util import torch from annotator.oneformer.pycocotools.coco import COCO from annotator.oneformer.pycocotools.cocoeval import COCOeval from tabulate import tabulate import annotator.oneformer.detectron2.utils.comm as comm from annotator.oneformer.detectron2.config import CfgNode from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.data.datasets.coco import convert_to_coco_json from annotator.oneformer.detectron2.structures import Boxes, BoxMode, pairwise_iou from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.logger import create_small_table from .evaluator import DatasetEvaluator try: from annotator.oneformer.detectron2.evaluation.fast_eval_api import COCOeval_opt except ImportError: COCOeval_opt = COCOeval class COCOEvaluator(DatasetEvaluator): """ Evaluate AR for object proposals, AP for instance detection/segmentation, AP for keypoint detection outputs using COCO's metrics. See http://cocodataset.org/#detection-eval and http://cocodataset.org/#keypoints-eval to understand its metrics. The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed (e.g. due to no predictions made). In addition to COCO, this evaluator is able to support any bounding box detection, instance segmentation, or keypoint detection dataset. """ def __init__( self, dataset_name, tasks=None, distributed=True, output_dir=None, *, max_dets_per_image=None, use_fast_impl=True, kpt_oks_sigmas=(), allow_cached_coco=True, ): """ Args: dataset_name (str): name of the dataset to be evaluated. It must have either the following corresponding metadata: "json_file": the path to the COCO format annotation Or it must be in detectron2's standard dataset format so it can be converted to COCO format automatically. tasks (tuple[str]): tasks that can be evaluated under the given configuration. A task is one of "bbox", "segm", "keypoints". By default, will infer this automatically from predictions. distributed (True): if True, will collect results from all ranks and run evaluation in the main process. Otherwise, will only evaluate the results in the current process. output_dir (str): optional, an output directory to dump all results predicted on the dataset. The dump contains two files: 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and contains all the results in the format they are produced by the model. 2. "coco_instances_results.json" a json file in COCO's result format. max_dets_per_image (int): limit on the maximum number of detections per image. By default in COCO, this limit is to 100, but this can be customized to be greater, as is needed in evaluation metrics AP fixed and AP pool (see https://arxiv.org/pdf/2102.01066.pdf) This doesn't affect keypoint evaluation. use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP. Although the results should be very close to the official implementation in COCO API, it is still recommended to compute results with the official API for use in papers. The faster implementation also uses more RAM. kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval When empty, it will use the defaults in COCO. Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. allow_cached_coco (bool): Whether to use cached coco json from previous validation runs. You should set this to False if you need to use different validation data. Defaults to True. """ self._logger = logging.getLogger(__name__) self._distributed = distributed self._output_dir = output_dir if use_fast_impl and (COCOeval_opt is COCOeval): self._logger.info("Fast COCO eval is not built. Falling back to official COCO eval.") use_fast_impl = False self._use_fast_impl = use_fast_impl # COCOeval requires the limit on the number of detections per image (maxDets) to be a list # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the # 3rd element (100) is used as the limit on the number of detections per image when # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval, # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults. if max_dets_per_image is None: max_dets_per_image = [1, 10, 100] else: max_dets_per_image = [1, 10, max_dets_per_image] self._max_dets_per_image = max_dets_per_image if tasks is not None and isinstance(tasks, CfgNode): kpt_oks_sigmas = ( tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas ) self._logger.warn( "COCO Evaluator instantiated using config, this is deprecated behavior." " Please pass in explicit arguments instead." ) self._tasks = None # Infering it from predictions should be better else: self._tasks = tasks self._cpu_device = torch.device("cpu") self._metadata = MetadataCatalog.get(dataset_name) if not hasattr(self._metadata, "json_file"): if output_dir is None: raise ValueError( "output_dir must be provided to COCOEvaluator " "for datasets not in COCO format." ) self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...") cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json") self._metadata.json_file = cache_path convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco) json_file = PathManager.get_local_path(self._metadata.json_file) with contextlib.redirect_stdout(io.StringIO()): self._coco_api = COCO(json_file) # Test set json files do not contain annotations (evaluation must be # performed using the COCO evaluation server). self._do_evaluation = "annotations" in self._coco_api.dataset if self._do_evaluation: self._kpt_oks_sigmas = kpt_oks_sigmas def reset(self): self._predictions = [] def process(self, inputs, outputs): """ Args: inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). It is a list of dict. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a COCO model. It is a list of dicts with key "instances" that contains :class:`Instances`. """ for input, output in zip(inputs, outputs): prediction = {"image_id": input["image_id"]} if "instances" in output: instances = output["instances"].to(self._cpu_device) prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) if "proposals" in output: prediction["proposals"] = output["proposals"].to(self._cpu_device) if len(prediction) > 1: self._predictions.append(prediction) def evaluate(self, img_ids=None): """ Args: img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset """ if self._distributed: comm.synchronize() predictions = comm.gather(self._predictions, dst=0) predictions = list(itertools.chain(*predictions)) if not comm.is_main_process(): return {} else: predictions = self._predictions if len(predictions) == 0: self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") return {} if self._output_dir: PathManager.mkdirs(self._output_dir) file_path = os.path.join(self._output_dir, "instances_predictions.pth") with PathManager.open(file_path, "wb") as f: torch.save(predictions, f) self._results = OrderedDict() if "proposals" in predictions[0]: self._eval_box_proposals(predictions) if "instances" in predictions[0]: self._eval_predictions(predictions, img_ids=img_ids) # Copy so the caller can do whatever with results return copy.deepcopy(self._results) def _tasks_from_predictions(self, predictions): """ Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions. """ tasks = {"bbox"} for pred in predictions: if "segmentation" in pred: tasks.add("segm") if "keypoints" in pred: tasks.add("keypoints") return sorted(tasks) def _eval_predictions(self, predictions, img_ids=None): """ Evaluate predictions. Fill self._results with the metrics of the tasks. """ self._logger.info("Preparing results for COCO format ...") coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) tasks = self._tasks or self._tasks_from_predictions(coco_results) # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) num_classes = len(all_contiguous_ids) assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} for result in coco_results: category_id = result["category_id"] assert category_id < num_classes, ( f"A prediction has class={category_id}, " f"but the dataset only has {num_classes} classes and " f"predicted class id should be in [0, {num_classes - 1}]." ) result["category_id"] = reverse_id_mapping[category_id] if self._output_dir: file_path = os.path.join(self._output_dir, "coco_instances_results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(coco_results)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info( "Evaluating predictions with {} COCO API...".format( "unofficial" if self._use_fast_impl else "official" ) ) for task in sorted(tasks): assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" coco_eval = ( _evaluate_predictions_on_coco( self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas, cocoeval_fn=COCOeval_opt if self._use_fast_impl else COCOeval, img_ids=img_ids, max_dets_per_image=self._max_dets_per_image, ) if len(coco_results) > 0 else None # cocoapi does not handle empty results very well ) res = self._derive_coco_results( coco_eval, task, class_names=self._metadata.get("thing_classes") ) self._results[task] = res def _eval_box_proposals(self, predictions): """ Evaluate the box proposals in predictions. Fill self._results with the metrics for "box_proposals" task. """ if self._output_dir: # Saving generated box proposals to file. # Predicted box_proposals are in XYXY_ABS mode. bbox_mode = BoxMode.XYXY_ABS.value ids, boxes, objectness_logits = [], [], [] for prediction in predictions: ids.append(prediction["image_id"]) boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy()) objectness_logits.append(prediction["proposals"].objectness_logits.numpy()) proposal_data = { "boxes": boxes, "objectness_logits": objectness_logits, "ids": ids, "bbox_mode": bbox_mode, } with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f: pickle.dump(proposal_data, f) if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info("Evaluating bbox proposals ...") res = {} areas = {"all": "", "small": "s", "medium": "m", "large": "l"} for limit in [100, 1000]: for area, suffix in areas.items(): stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit) key = "AR{}@{:d}".format(suffix, limit) res[key] = float(stats["ar"].item() * 100) self._logger.info("Proposal metrics: \n" + create_small_table(res)) self._results["box_proposals"] = res def _derive_coco_results(self, coco_eval, iou_type, class_names=None): """ Derive the desired score numbers from summarized COCOeval. Args: coco_eval (None or COCOEval): None represents no predictions from model. iou_type (str): class_names (None or list[str]): if provided, will use it to predict per-category AP. Returns: a dict of {metric name: score} """ metrics = { "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], }[iou_type] if coco_eval is None: self._logger.warn("No predictions from the model!") return {metric: float("nan") for metric in metrics} # the standard metrics results = { metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") for idx, metric in enumerate(metrics) } self._logger.info( "Evaluation results for {}: \n".format(iou_type) + create_small_table(results) ) if not np.isfinite(sum(results.values())): self._logger.info("Some metrics cannot be computed and is shown as NaN.") if class_names is None or len(class_names) <= 1: return results # Compute per-category AP # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa precisions = coco_eval.eval["precision"] # precision has dims (iou, recall, cls, area range, max dets) assert len(class_names) == precisions.shape[2] results_per_category = [] for idx, name in enumerate(class_names): # area range index 0: all area ranges # max dets index -1: typically 100 per image precision = precisions[:, :, idx, 0, -1] precision = precision[precision > -1] ap = np.mean(precision) if precision.size else float("nan") results_per_category.append(("{}".format(name), float(ap * 100))) # tabulate it N_COLS = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) table = tabulate( results_2d, tablefmt="pipe", floatfmt=".3f", headers=["category", "AP"] * (N_COLS // 2), numalign="left", ) self._logger.info("Per-category {} AP: \n".format(iou_type) + table) results.update({"AP-" + name: ap for name, ap in results_per_category}) return results def instances_to_coco_json(instances, img_id): """ Dump an "Instances" object to a COCO-format json that's used for evaluation. Args: instances (Instances): img_id (int): the image id Returns: list[dict]: list of json annotations in COCO format. """ num_instance = len(instances) if num_instance == 0: return [] boxes = instances.pred_boxes.tensor.numpy() boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) boxes = boxes.tolist() scores = instances.scores.tolist() classes = instances.pred_classes.tolist() has_mask = instances.has("pred_masks") if has_mask: # use RLE to encode the masks, because they are too large and takes memory # since this evaluator stores outputs of the entire dataset rles = [ mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] for mask in instances.pred_masks ] for rle in rles: # "counts" is an array encoded by mask_util as a byte-stream. Python3's # json writer which always produces strings cannot serialize a bytestream # unless you decode it. Thankfully, utf-8 works out (which is also what # the annotator.oneformer.pycocotools/_mask.pyx does). rle["counts"] = rle["counts"].decode("utf-8") has_keypoints = instances.has("pred_keypoints") if has_keypoints: keypoints = instances.pred_keypoints results = [] for k in range(num_instance): result = { "image_id": img_id, "category_id": classes[k], "bbox": boxes[k], "score": scores[k], } if has_mask: result["segmentation"] = rles[k] if has_keypoints: # In COCO annotations, # keypoints coordinates are pixel indices. # However our predictions are floating point coordinates. # Therefore we subtract 0.5 to be consistent with the annotation format. # This is the inverse of data loading logic in `datasets/coco.py`. keypoints[k][:, :2] -= 0.5 result["keypoints"] = keypoints[k].flatten().tolist() results.append(result) return results # inspired from Detectron: # https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None): """ Evaluate detection proposal recall metrics. This function is a much faster alternative to the official COCO API recall evaluation code. However, it produces slightly different results. """ # Record max overlap value for each gt box # Return vector of overlap values areas = { "all": 0, "small": 1, "medium": 2, "large": 3, "96-128": 4, "128-256": 5, "256-512": 6, "512-inf": 7, } area_ranges = [ [0**2, 1e5**2], # all [0**2, 32**2], # small [32**2, 96**2], # medium [96**2, 1e5**2], # large [96**2, 128**2], # 96-128 [128**2, 256**2], # 128-256 [256**2, 512**2], # 256-512 [512**2, 1e5**2], ] # 512-inf assert area in areas, "Unknown area range: {}".format(area) area_range = area_ranges[areas[area]] gt_overlaps = [] num_pos = 0 for prediction_dict in dataset_predictions: predictions = prediction_dict["proposals"] # sort predictions in descending order # TODO maybe remove this and make it explicit in the documentation inds = predictions.objectness_logits.sort(descending=True)[1] predictions = predictions[inds] ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"]) anno = coco_api.loadAnns(ann_ids) gt_boxes = [ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno if obj["iscrowd"] == 0 ] gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes gt_boxes = Boxes(gt_boxes) gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) if len(gt_boxes) == 0 or len(predictions) == 0: continue valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) gt_boxes = gt_boxes[valid_gt_inds] num_pos += len(gt_boxes) if len(gt_boxes) == 0: continue if limit is not None and len(predictions) > limit: predictions = predictions[:limit] overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes) _gt_overlaps = torch.zeros(len(gt_boxes)) for j in range(min(len(predictions), len(gt_boxes))): # find which proposal box maximally covers each gt box # and get the iou amount of coverage for each gt box max_overlaps, argmax_overlaps = overlaps.max(dim=0) # find which gt box is 'best' covered (i.e. 'best' = most iou) gt_ovr, gt_ind = max_overlaps.max(dim=0) assert gt_ovr >= 0 # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert _gt_overlaps[j] == gt_ovr # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps.append(_gt_overlaps) gt_overlaps = ( torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32) ) gt_overlaps, _ = torch.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) recalls = torch.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return { "ar": ar, "recalls": recalls, "thresholds": thresholds, "gt_overlaps": gt_overlaps, "num_pos": num_pos, } def _evaluate_predictions_on_coco( coco_gt, coco_results, iou_type, kpt_oks_sigmas=None, cocoeval_fn=COCOeval_opt, img_ids=None, max_dets_per_image=None, ): """ Evaluate the coco results using COCOEval API. """ assert len(coco_results) > 0 if iou_type == "segm": coco_results = copy.deepcopy(coco_results) # When evaluating mask AP, if the results contain bbox, cocoapi will # use the box area as the area of the instance, instead of the mask area. # This leads to a different definition of small/medium/large. # We remove the bbox field to let mask AP use mask area. for c in coco_results: c.pop("bbox", None) coco_dt = coco_gt.loadRes(coco_results) coco_eval = cocoeval_fn(coco_gt, coco_dt, iou_type) # For COCO, the default max_dets_per_image is [1, 10, 100]. if max_dets_per_image is None: max_dets_per_image = [1, 10, 100] # Default from COCOEval else: assert ( len(max_dets_per_image) >= 3 ), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3" # In the case that user supplies a custom input for max_dets_per_image, # apply COCOevalMaxDets to evaluate AP with the custom input. if max_dets_per_image[2] != 100: coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type) if iou_type != "keypoints": coco_eval.params.maxDets = max_dets_per_image if img_ids is not None: coco_eval.params.imgIds = img_ids if iou_type == "keypoints": # Use the COCO default keypoint OKS sigmas unless overrides are specified if kpt_oks_sigmas: assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "annotator.oneformer.pycocotools is too old!" coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas) # COCOAPI requires every detection and every gt to have keypoints, so # we just take the first entry from both num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3 num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3 num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas) assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, ( f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. " f"Ground truth contains {num_keypoints_gt} keypoints. " f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. " "They have to agree with each other. For meaning of OKS, please refer to " "http://cocodataset.org/#keypoints-eval." ) coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return coco_eval class COCOevalMaxDets(COCOeval): """ Modified version of COCOeval for evaluating AP with a custom maxDets (by default for COCO, maxDets is 100) """ def summarize(self): """ Compute and display summary metrics for evaluation results given a custom value for max_dets_per_image """ def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): p = self.params iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}" titleStr = "Average Precision" if ap == 1 else "Average Recall" typeStr = "(AP)" if ap == 1 else "(AR)" iouStr = ( "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) if iouThr is None else "{:0.2f}".format(iouThr) ) aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] if ap == 1: # dimension of precision: [TxRxKxAxM] s = self.eval["precision"] # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:, :, :, aind, mind] else: # dimension of recall: [TxKxAxM] s = self.eval["recall"] if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:, :, aind, mind] if len(s[s > -1]) == 0: mean_s = -1 else: mean_s = np.mean(s[s > -1]) print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) return mean_s def _summarizeDets(): stats = np.zeros((12,)) # Evaluate AP using the custom limit on maximum detections per image stats[0] = _summarize(1, maxDets=self.params.maxDets[2]) stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2]) stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2]) stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2]) stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2]) stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2]) stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) return stats def _summarizeKps(): stats = np.zeros((10,)) stats[0] = _summarize(1, maxDets=20) stats[1] = _summarize(1, maxDets=20, iouThr=0.5) stats[2] = _summarize(1, maxDets=20, iouThr=0.75) stats[3] = _summarize(1, maxDets=20, areaRng="medium") stats[4] = _summarize(1, maxDets=20, areaRng="large") stats[5] = _summarize(0, maxDets=20) stats[6] = _summarize(0, maxDets=20, iouThr=0.5) stats[7] = _summarize(0, maxDets=20, iouThr=0.75) stats[8] = _summarize(0, maxDets=20, areaRng="medium") stats[9] = _summarize(0, maxDets=20, areaRng="large") return stats if not self.eval: raise Exception("Please run accumulate() first") iouType = self.params.iouType if iouType == "segm" or iouType == "bbox": summarize = _summarizeDets elif iouType == "keypoints": summarize = _summarizeKps self.stats = summarize() def __str__(self): self.summarize() ================================================ FILE: annotator/oneformer/detectron2/evaluation/evaluator.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import datetime import logging import time from collections import OrderedDict, abc from contextlib import ExitStack, contextmanager from typing import List, Union import torch from torch import nn from annotator.oneformer.detectron2.utils.comm import get_world_size, is_main_process from annotator.oneformer.detectron2.utils.logger import log_every_n_seconds class DatasetEvaluator: """ Base class for a dataset evaluator. The function :func:`inference_on_dataset` runs the model over all samples in the dataset, and have a DatasetEvaluator to process the inputs/outputs. This class will accumulate information of the inputs/outputs (by :meth:`process`), and produce evaluation results in the end (by :meth:`evaluate`). """ def reset(self): """ Preparation for a new round of evaluation. Should be called before starting a round of evaluation. """ pass def process(self, inputs, outputs): """ Process the pair of inputs and outputs. If they contain batches, the pairs can be consumed one-by-one using `zip`: .. code-block:: python for input_, output in zip(inputs, outputs): # do evaluation on single input/output pair ... Args: inputs (list): the inputs that's used to call the model. outputs (list): the return value of `model(inputs)` """ pass def evaluate(self): """ Evaluate/summarize the performance, after processing all input/output pairs. Returns: dict: A new evaluator class can return a dict of arbitrary format as long as the user can process the results. In our train_net.py, we expect the following format: * key: the name of the task (e.g., bbox) * value: a dict of {metric name: score}, e.g.: {"AP50": 80} """ pass class DatasetEvaluators(DatasetEvaluator): """ Wrapper class to combine multiple :class:`DatasetEvaluator` instances. This class dispatches every evaluation call to all of its :class:`DatasetEvaluator`. """ def __init__(self, evaluators): """ Args: evaluators (list): the evaluators to combine. """ super().__init__() self._evaluators = evaluators def reset(self): for evaluator in self._evaluators: evaluator.reset() def process(self, inputs, outputs): for evaluator in self._evaluators: evaluator.process(inputs, outputs) def evaluate(self): results = OrderedDict() for evaluator in self._evaluators: result = evaluator.evaluate() if is_main_process() and result is not None: for k, v in result.items(): assert ( k not in results ), "Different evaluators produce results with the same key {}".format(k) results[k] = v return results def inference_on_dataset( model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None] ): """ Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.__call__` accurately. The model will be used in eval mode. Args: model (callable): a callable which takes an object from `data_loader` and returns some outputs. If it's an nn.Module, it will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start inference on {} batches".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) if isinstance(evaluator, abc.MutableSequence): evaluator = DatasetEvaluators(evaluator) evaluator.reset() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_data_time = 0 total_compute_time = 0 total_eval_time = 0 with ExitStack() as stack: if isinstance(model, nn.Module): stack.enter_context(inference_context(model)) stack.enter_context(torch.no_grad()) start_data_time = time.perf_counter() for idx, inputs in enumerate(data_loader): total_data_time += time.perf_counter() - start_data_time if idx == num_warmup: start_time = time.perf_counter() total_data_time = 0 total_compute_time = 0 total_eval_time = 0 start_compute_time = time.perf_counter() outputs = model(inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time start_eval_time = time.perf_counter() evaluator.process(inputs, outputs) total_eval_time += time.perf_counter() - start_eval_time iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) data_seconds_per_iter = total_data_time / iters_after_start compute_seconds_per_iter = total_compute_time / iters_after_start eval_seconds_per_iter = total_eval_time / iters_after_start total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start if idx >= num_warmup * 2 or compute_seconds_per_iter > 5: eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1))) log_every_n_seconds( logging.INFO, ( f"Inference done {idx + 1}/{total}. " f"Dataloading: {data_seconds_per_iter:.4f} s/iter. " f"Inference: {compute_seconds_per_iter:.4f} s/iter. " f"Eval: {eval_seconds_per_iter:.4f} s/iter. " f"Total: {total_seconds_per_iter:.4f} s/iter. " f"ETA={eta}" ), n=5, ) start_data_time = time.perf_counter() # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format( total_time_str, total_time / (total - num_warmup), num_devices ) ) total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format( total_compute_time_str, total_compute_time / (total - num_warmup), num_devices ) ) results = evaluator.evaluate() # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results @contextmanager def inference_context(model): """ A context where the model is temporarily changed to eval mode, and restored to previous mode afterwards. Args: model: a torch Module """ training_mode = model.training model.eval() yield model.train(training_mode) ================================================ FILE: annotator/oneformer/detectron2/evaluation/fast_eval_api.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import logging import numpy as np import time from annotator.oneformer.pycocotools.cocoeval import COCOeval from annotator.oneformer.detectron2 import _C logger = logging.getLogger(__name__) class COCOeval_opt(COCOeval): """ This is a slightly modified version of the original COCO API, where the functions evaluateImg() and accumulate() are implemented in C++ to speedup evaluation """ def evaluate(self): """ Run per image evaluation on given images and store results in self.evalImgs_cpp, a datastructure that isn't readable from Python but is used by a c++ implementation of accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure self.evalImgs because this datastructure is a computational bottleneck. :return: None """ tic = time.time() p = self.params # add backward compatibility if useSegm is specified in params if p.useSegm is not None: p.iouType = "segm" if p.useSegm == 1 else "bbox" logger.info("Evaluate annotation type *{}*".format(p.iouType)) p.imgIds = list(np.unique(p.imgIds)) if p.useCats: p.catIds = list(np.unique(p.catIds)) p.maxDets = sorted(p.maxDets) self.params = p self._prepare() # bottleneck # loop through images, area range, max detection number catIds = p.catIds if p.useCats else [-1] if p.iouType == "segm" or p.iouType == "bbox": computeIoU = self.computeIoU elif p.iouType == "keypoints": computeIoU = self.computeOks self.ious = { (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds } # bottleneck maxDet = p.maxDets[-1] # <<<< Beginning of code differences with original COCO API def convert_instances_to_cpp(instances, is_det=False): # Convert annotations for a list of instances in an image to a format that's fast # to access in C++ instances_cpp = [] for instance in instances: instance_cpp = _C.InstanceAnnotation( int(instance["id"]), instance["score"] if is_det else instance.get("score", 0.0), instance["area"], bool(instance.get("iscrowd", 0)), bool(instance.get("ignore", 0)), ) instances_cpp.append(instance_cpp) return instances_cpp # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++ ground_truth_instances = [ [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds] for imgId in p.imgIds ] detected_instances = [ [convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) for catId in p.catIds] for imgId in p.imgIds ] ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds] if not p.useCats: # For each image, flatten per-category lists into a single list ground_truth_instances = [[[o for c in i for o in c]] for i in ground_truth_instances] detected_instances = [[[o for c in i for o in c]] for i in detected_instances] # Call C++ implementation of self.evaluateImgs() self._evalImgs_cpp = _C.COCOevalEvaluateImages( p.areaRng, maxDet, p.iouThrs, ious, ground_truth_instances, detected_instances ) self._evalImgs = None self._paramsEval = copy.deepcopy(self.params) toc = time.time() logger.info("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic)) # >>>> End of code differences with original COCO API def accumulate(self): """ Accumulate per image evaluation results and store the result in self.eval. Does not support changing parameter settings from those used by self.evaluate() """ logger.info("Accumulating evaluation results...") tic = time.time() assert hasattr( self, "_evalImgs_cpp" ), "evaluate() must be called before accmulate() is called." self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp) # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections self.eval["recall"] = np.array(self.eval["recall"]).reshape( self.eval["counts"][:1] + self.eval["counts"][2:] ) # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X # num_area_ranges X num_max_detections self.eval["precision"] = np.array(self.eval["precision"]).reshape(self.eval["counts"]) self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"]) toc = time.time() logger.info("COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic)) ================================================ FILE: annotator/oneformer/detectron2/evaluation/lvis_evaluation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import itertools import json import logging import os import pickle from collections import OrderedDict import torch import annotator.oneformer.detectron2.utils.comm as comm from annotator.oneformer.detectron2.config import CfgNode from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.structures import Boxes, BoxMode, pairwise_iou from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.logger import create_small_table from .coco_evaluation import instances_to_coco_json from .evaluator import DatasetEvaluator class LVISEvaluator(DatasetEvaluator): """ Evaluate object proposal and instance detection/segmentation outputs using LVIS's metrics and evaluation API. """ def __init__( self, dataset_name, tasks=None, distributed=True, output_dir=None, *, max_dets_per_image=None, ): """ Args: dataset_name (str): name of the dataset to be evaluated. It must have the following corresponding metadata: "json_file": the path to the LVIS format annotation tasks (tuple[str]): tasks that can be evaluated under the given configuration. A task is one of "bbox", "segm". By default, will infer this automatically from predictions. distributed (True): if True, will collect results from all ranks for evaluation. Otherwise, will evaluate the results in the current process. output_dir (str): optional, an output directory to dump results. max_dets_per_image (None or int): limit on maximum detections per image in evaluating AP This limit, by default of the LVIS dataset, is 300. """ from lvis import LVIS self._logger = logging.getLogger(__name__) if tasks is not None and isinstance(tasks, CfgNode): self._logger.warn( "COCO Evaluator instantiated using config, this is deprecated behavior." " Please pass in explicit arguments instead." ) self._tasks = None # Infering it from predictions should be better else: self._tasks = tasks self._distributed = distributed self._output_dir = output_dir self._max_dets_per_image = max_dets_per_image self._cpu_device = torch.device("cpu") self._metadata = MetadataCatalog.get(dataset_name) json_file = PathManager.get_local_path(self._metadata.json_file) self._lvis_api = LVIS(json_file) # Test set json files do not contain annotations (evaluation must be # performed using the LVIS evaluation server). self._do_evaluation = len(self._lvis_api.get_ann_ids()) > 0 def reset(self): self._predictions = [] def process(self, inputs, outputs): """ Args: inputs: the inputs to a LVIS model (e.g., GeneralizedRCNN). It is a list of dict. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a LVIS model. It is a list of dicts with key "instances" that contains :class:`Instances`. """ for input, output in zip(inputs, outputs): prediction = {"image_id": input["image_id"]} if "instances" in output: instances = output["instances"].to(self._cpu_device) prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) if "proposals" in output: prediction["proposals"] = output["proposals"].to(self._cpu_device) self._predictions.append(prediction) def evaluate(self): if self._distributed: comm.synchronize() predictions = comm.gather(self._predictions, dst=0) predictions = list(itertools.chain(*predictions)) if not comm.is_main_process(): return else: predictions = self._predictions if len(predictions) == 0: self._logger.warning("[LVISEvaluator] Did not receive valid predictions.") return {} if self._output_dir: PathManager.mkdirs(self._output_dir) file_path = os.path.join(self._output_dir, "instances_predictions.pth") with PathManager.open(file_path, "wb") as f: torch.save(predictions, f) self._results = OrderedDict() if "proposals" in predictions[0]: self._eval_box_proposals(predictions) if "instances" in predictions[0]: self._eval_predictions(predictions) # Copy so the caller can do whatever with results return copy.deepcopy(self._results) def _tasks_from_predictions(self, predictions): for pred in predictions: if "segmentation" in pred: return ("bbox", "segm") return ("bbox",) def _eval_predictions(self, predictions): """ Evaluate predictions. Fill self._results with the metrics of the tasks. Args: predictions (list[dict]): list of outputs from the model """ self._logger.info("Preparing results in the LVIS format ...") lvis_results = list(itertools.chain(*[x["instances"] for x in predictions])) tasks = self._tasks or self._tasks_from_predictions(lvis_results) # LVIS evaluator can be used to evaluate results for COCO dataset categories. # In this case `_metadata` variable will have a field with COCO-specific category mapping. if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): reverse_id_mapping = { v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() } for result in lvis_results: result["category_id"] = reverse_id_mapping[result["category_id"]] else: # unmap the category ids for LVIS (from 0-indexed to 1-indexed) for result in lvis_results: result["category_id"] += 1 if self._output_dir: file_path = os.path.join(self._output_dir, "lvis_instances_results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(lvis_results)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info("Evaluating predictions ...") for task in sorted(tasks): res = _evaluate_predictions_on_lvis( self._lvis_api, lvis_results, task, max_dets_per_image=self._max_dets_per_image, class_names=self._metadata.get("thing_classes"), ) self._results[task] = res def _eval_box_proposals(self, predictions): """ Evaluate the box proposals in predictions. Fill self._results with the metrics for "box_proposals" task. """ if self._output_dir: # Saving generated box proposals to file. # Predicted box_proposals are in XYXY_ABS mode. bbox_mode = BoxMode.XYXY_ABS.value ids, boxes, objectness_logits = [], [], [] for prediction in predictions: ids.append(prediction["image_id"]) boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy()) objectness_logits.append(prediction["proposals"].objectness_logits.numpy()) proposal_data = { "boxes": boxes, "objectness_logits": objectness_logits, "ids": ids, "bbox_mode": bbox_mode, } with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f: pickle.dump(proposal_data, f) if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info("Evaluating bbox proposals ...") res = {} areas = {"all": "", "small": "s", "medium": "m", "large": "l"} for limit in [100, 1000]: for area, suffix in areas.items(): stats = _evaluate_box_proposals(predictions, self._lvis_api, area=area, limit=limit) key = "AR{}@{:d}".format(suffix, limit) res[key] = float(stats["ar"].item() * 100) self._logger.info("Proposal metrics: \n" + create_small_table(res)) self._results["box_proposals"] = res # inspired from Detectron: # https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None): """ Evaluate detection proposal recall metrics. This function is a much faster alternative to the official LVIS API recall evaluation code. However, it produces slightly different results. """ # Record max overlap value for each gt box # Return vector of overlap values areas = { "all": 0, "small": 1, "medium": 2, "large": 3, "96-128": 4, "128-256": 5, "256-512": 6, "512-inf": 7, } area_ranges = [ [0**2, 1e5**2], # all [0**2, 32**2], # small [32**2, 96**2], # medium [96**2, 1e5**2], # large [96**2, 128**2], # 96-128 [128**2, 256**2], # 128-256 [256**2, 512**2], # 256-512 [512**2, 1e5**2], ] # 512-inf assert area in areas, "Unknown area range: {}".format(area) area_range = area_ranges[areas[area]] gt_overlaps = [] num_pos = 0 for prediction_dict in dataset_predictions: predictions = prediction_dict["proposals"] # sort predictions in descending order # TODO maybe remove this and make it explicit in the documentation inds = predictions.objectness_logits.sort(descending=True)[1] predictions = predictions[inds] ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]]) anno = lvis_api.load_anns(ann_ids) gt_boxes = [ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno ] gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes gt_boxes = Boxes(gt_boxes) gt_areas = torch.as_tensor([obj["area"] for obj in anno]) if len(gt_boxes) == 0 or len(predictions) == 0: continue valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) gt_boxes = gt_boxes[valid_gt_inds] num_pos += len(gt_boxes) if len(gt_boxes) == 0: continue if limit is not None and len(predictions) > limit: predictions = predictions[:limit] overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes) _gt_overlaps = torch.zeros(len(gt_boxes)) for j in range(min(len(predictions), len(gt_boxes))): # find which proposal box maximally covers each gt box # and get the iou amount of coverage for each gt box max_overlaps, argmax_overlaps = overlaps.max(dim=0) # find which gt box is 'best' covered (i.e. 'best' = most iou) gt_ovr, gt_ind = max_overlaps.max(dim=0) assert gt_ovr >= 0 # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert _gt_overlaps[j] == gt_ovr # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps.append(_gt_overlaps) gt_overlaps = ( torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32) ) gt_overlaps, _ = torch.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) recalls = torch.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return { "ar": ar, "recalls": recalls, "thresholds": thresholds, "gt_overlaps": gt_overlaps, "num_pos": num_pos, } def _evaluate_predictions_on_lvis( lvis_gt, lvis_results, iou_type, max_dets_per_image=None, class_names=None ): """ Args: iou_type (str): max_dets_per_image (None or int): limit on maximum detections per image in evaluating AP This limit, by default of the LVIS dataset, is 300. class_names (None or list[str]): if provided, will use it to predict per-category AP. Returns: a dict of {metric name: score} """ metrics = { "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"], "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"], }[iou_type] logger = logging.getLogger(__name__) if len(lvis_results) == 0: # TODO: check if needed logger.warn("No predictions from the model!") return {metric: float("nan") for metric in metrics} if iou_type == "segm": lvis_results = copy.deepcopy(lvis_results) # When evaluating mask AP, if the results contain bbox, LVIS API will # use the box area as the area of the instance, instead of the mask area. # This leads to a different definition of small/medium/large. # We remove the bbox field to let mask AP use mask area. for c in lvis_results: c.pop("bbox", None) if max_dets_per_image is None: max_dets_per_image = 300 # Default for LVIS dataset from lvis import LVISEval, LVISResults logger.info(f"Evaluating with max detections per image = {max_dets_per_image}") lvis_results = LVISResults(lvis_gt, lvis_results, max_dets=max_dets_per_image) lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type) lvis_eval.run() lvis_eval.print_results() # Pull the standard metrics from the LVIS results results = lvis_eval.get_results() results = {metric: float(results[metric] * 100) for metric in metrics} logger.info("Evaluation results for {}: \n".format(iou_type) + create_small_table(results)) return results ================================================ FILE: annotator/oneformer/detectron2/evaluation/panoptic_evaluation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import contextlib import io import itertools import json import logging import numpy as np import os import tempfile from collections import OrderedDict from typing import Optional from PIL import Image from tabulate import tabulate from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.utils import comm from annotator.oneformer.detectron2.utils.file_io import PathManager from .evaluator import DatasetEvaluator logger = logging.getLogger(__name__) class COCOPanopticEvaluator(DatasetEvaluator): """ Evaluate Panoptic Quality metrics on COCO using PanopticAPI. It saves panoptic segmentation prediction in `output_dir` It contains a synchronize call and has to be called from all workers. """ def __init__(self, dataset_name: str, output_dir: Optional[str] = None): """ Args: dataset_name: name of the dataset output_dir: output directory to save results for evaluation. """ self._metadata = MetadataCatalog.get(dataset_name) self._thing_contiguous_id_to_dataset_id = { v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() } self._stuff_contiguous_id_to_dataset_id = { v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items() } self._output_dir = output_dir if self._output_dir is not None: PathManager.mkdirs(self._output_dir) def reset(self): self._predictions = [] def _convert_category_id(self, segment_info): isthing = segment_info.pop("isthing", None) if isthing is None: # the model produces panoptic category id directly. No more conversion needed return segment_info if isthing is True: segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[ segment_info["category_id"] ] else: segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[ segment_info["category_id"] ] return segment_info def process(self, inputs, outputs): from panopticapi.utils import id2rgb for input, output in zip(inputs, outputs): panoptic_img, segments_info = output["panoptic_seg"] panoptic_img = panoptic_img.cpu().numpy() if segments_info is None: # If "segments_info" is None, we assume "panoptic_img" is a # H*W int32 image storing the panoptic_id in the format of # category_id * label_divisor + instance_id. We reserve -1 for # VOID label, and add 1 to panoptic_img since the official # evaluation script uses 0 for VOID label. label_divisor = self._metadata.label_divisor segments_info = [] for panoptic_label in np.unique(panoptic_img): if panoptic_label == -1: # VOID region. continue pred_class = panoptic_label // label_divisor isthing = ( pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values() ) segments_info.append( { "id": int(panoptic_label) + 1, "category_id": int(pred_class), "isthing": bool(isthing), } ) # Official evaluation script uses 0 for VOID label. panoptic_img += 1 file_name = os.path.basename(input["file_name"]) file_name_png = os.path.splitext(file_name)[0] + ".png" with io.BytesIO() as out: Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG") segments_info = [self._convert_category_id(x) for x in segments_info] self._predictions.append( { "image_id": input["image_id"], "file_name": file_name_png, "png_string": out.getvalue(), "segments_info": segments_info, } ) def evaluate(self): comm.synchronize() self._predictions = comm.gather(self._predictions) self._predictions = list(itertools.chain(*self._predictions)) if not comm.is_main_process(): return # PanopticApi requires local files gt_json = PathManager.get_local_path(self._metadata.panoptic_json) gt_folder = PathManager.get_local_path(self._metadata.panoptic_root) with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir: logger.info("Writing all panoptic predictions to {} ...".format(pred_dir)) for p in self._predictions: with open(os.path.join(pred_dir, p["file_name"]), "wb") as f: f.write(p.pop("png_string")) with open(gt_json, "r") as f: json_data = json.load(f) json_data["annotations"] = self._predictions output_dir = self._output_dir or pred_dir predictions_json = os.path.join(output_dir, "predictions.json") with PathManager.open(predictions_json, "w") as f: f.write(json.dumps(json_data)) from panopticapi.evaluation import pq_compute with contextlib.redirect_stdout(io.StringIO()): pq_res = pq_compute( gt_json, PathManager.get_local_path(predictions_json), gt_folder=gt_folder, pred_folder=pred_dir, ) res = {} res["PQ"] = 100 * pq_res["All"]["pq"] res["SQ"] = 100 * pq_res["All"]["sq"] res["RQ"] = 100 * pq_res["All"]["rq"] res["PQ_th"] = 100 * pq_res["Things"]["pq"] res["SQ_th"] = 100 * pq_res["Things"]["sq"] res["RQ_th"] = 100 * pq_res["Things"]["rq"] res["PQ_st"] = 100 * pq_res["Stuff"]["pq"] res["SQ_st"] = 100 * pq_res["Stuff"]["sq"] res["RQ_st"] = 100 * pq_res["Stuff"]["rq"] results = OrderedDict({"panoptic_seg": res}) _print_panoptic_results(pq_res) return results def _print_panoptic_results(pq_res): headers = ["", "PQ", "SQ", "RQ", "#categories"] data = [] for name in ["All", "Things", "Stuff"]: row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]] data.append(row) table = tabulate( data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center" ) logger.info("Panoptic Evaluation Results:\n" + table) if __name__ == "__main__": from annotator.oneformer.detectron2.utils.logger import setup_logger logger = setup_logger() import argparse parser = argparse.ArgumentParser() parser.add_argument("--gt-json") parser.add_argument("--gt-dir") parser.add_argument("--pred-json") parser.add_argument("--pred-dir") args = parser.parse_args() from panopticapi.evaluation import pq_compute with contextlib.redirect_stdout(io.StringIO()): pq_res = pq_compute( args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir ) _print_panoptic_results(pq_res) ================================================ FILE: annotator/oneformer/detectron2/evaluation/pascal_voc_evaluation.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np import os import tempfile import xml.etree.ElementTree as ET from collections import OrderedDict, defaultdict from functools import lru_cache import torch from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.utils import comm from annotator.oneformer.detectron2.utils.file_io import PathManager from .evaluator import DatasetEvaluator class PascalVOCDetectionEvaluator(DatasetEvaluator): """ Evaluate Pascal VOC style AP for Pascal VOC dataset. It contains a synchronization, therefore has to be called from all ranks. Note that the concept of AP can be implemented in different ways and may not produce identical results. This class mimics the implementation of the official Pascal VOC Matlab API, and should produce similar but not identical results to the official API. """ def __init__(self, dataset_name): """ Args: dataset_name (str): name of the dataset, e.g., "voc_2007_test" """ self._dataset_name = dataset_name meta = MetadataCatalog.get(dataset_name) # Too many tiny files, download all to local for speed. annotation_dir_local = PathManager.get_local_path( os.path.join(meta.dirname, "Annotations/") ) self._anno_file_template = os.path.join(annotation_dir_local, "{}.xml") self._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt") self._class_names = meta.thing_classes assert meta.year in [2007, 2012], meta.year self._is_2007 = meta.year == 2007 self._cpu_device = torch.device("cpu") self._logger = logging.getLogger(__name__) def reset(self): self._predictions = defaultdict(list) # class name -> list of prediction strings def process(self, inputs, outputs): for input, output in zip(inputs, outputs): image_id = input["image_id"] instances = output["instances"].to(self._cpu_device) boxes = instances.pred_boxes.tensor.numpy() scores = instances.scores.tolist() classes = instances.pred_classes.tolist() for box, score, cls in zip(boxes, scores, classes): xmin, ymin, xmax, ymax = box # The inverse of data loading logic in `datasets/pascal_voc.py` xmin += 1 ymin += 1 self._predictions[cls].append( f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}" ) def evaluate(self): """ Returns: dict: has a key "segm", whose value is a dict of "AP", "AP50", and "AP75". """ all_predictions = comm.gather(self._predictions, dst=0) if not comm.is_main_process(): return predictions = defaultdict(list) for predictions_per_rank in all_predictions: for clsid, lines in predictions_per_rank.items(): predictions[clsid].extend(lines) del all_predictions self._logger.info( "Evaluating {} using {} metric. " "Note that results do not use the official Matlab API.".format( self._dataset_name, 2007 if self._is_2007 else 2012 ) ) with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname: res_file_template = os.path.join(dirname, "{}.txt") aps = defaultdict(list) # iou -> ap per class for cls_id, cls_name in enumerate(self._class_names): lines = predictions.get(cls_id, [""]) with open(res_file_template.format(cls_name), "w") as f: f.write("\n".join(lines)) for thresh in range(50, 100, 5): rec, prec, ap = voc_eval( res_file_template, self._anno_file_template, self._image_set_path, cls_name, ovthresh=thresh / 100.0, use_07_metric=self._is_2007, ) aps[thresh].append(ap * 100) ret = OrderedDict() mAP = {iou: np.mean(x) for iou, x in aps.items()} ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]} return ret ############################################################################## # # Below code is modified from # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py # -------------------------------------------------------- # Fast/er R-CNN # Licensed under The MIT License [see LICENSE for details] # Written by Bharath Hariharan # -------------------------------------------------------- """Python implementation of the PASCAL VOC devkit's AP evaluation code.""" @lru_cache(maxsize=None) def parse_rec(filename): """Parse a PASCAL VOC xml file.""" with PathManager.open(filename) as f: tree = ET.parse(f) objects = [] for obj in tree.findall("object"): obj_struct = {} obj_struct["name"] = obj.find("name").text obj_struct["pose"] = obj.find("pose").text obj_struct["truncated"] = int(obj.find("truncated").text) obj_struct["difficult"] = int(obj.find("difficult").text) bbox = obj.find("bndbox") obj_struct["bbox"] = [ int(bbox.find("xmin").text), int(bbox.find("ymin").text), int(bbox.find("xmax").text), int(bbox.find("ymax").text), ] objects.append(obj_struct) return objects def voc_ap(rec, prec, use_07_metric=False): """Compute VOC AP given precision and recall. If use_07_metric is true, uses the VOC 07 11-point method (default:False). """ if use_07_metric: # 11 point metric ap = 0.0 for t in np.arange(0.0, 1.1, 0.1): if np.sum(rec >= t) == 0: p = 0 else: p = np.max(prec[rec >= t]) ap = ap + p / 11.0 else: # correct AP calculation # first append sentinel values at the end mrec = np.concatenate(([0.0], rec, [1.0])) mpre = np.concatenate(([0.0], prec, [0.0])) # compute the precision envelope for i in range(mpre.size - 1, 0, -1): mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) return ap def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False): """rec, prec, ap = voc_eval(detpath, annopath, imagesetfile, classname, [ovthresh], [use_07_metric]) Top level function that does the PASCAL VOC evaluation. detpath: Path to detections detpath.format(classname) should produce the detection results file. annopath: Path to annotations annopath.format(imagename) should be the xml annotations file. imagesetfile: Text file containing the list of images, one image per line. classname: Category name (duh) [ovthresh]: Overlap threshold (default = 0.5) [use_07_metric]: Whether to use VOC07's 11 point AP computation (default False) """ # assumes detections are in detpath.format(classname) # assumes annotations are in annopath.format(imagename) # assumes imagesetfile is a text file with each line an image name # first load gt # read list of images with PathManager.open(imagesetfile, "r") as f: lines = f.readlines() imagenames = [x.strip() for x in lines] # load annots recs = {} for imagename in imagenames: recs[imagename] = parse_rec(annopath.format(imagename)) # extract gt objects for this class class_recs = {} npos = 0 for imagename in imagenames: R = [obj for obj in recs[imagename] if obj["name"] == classname] bbox = np.array([x["bbox"] for x in R]) difficult = np.array([x["difficult"] for x in R]).astype(bool) # difficult = np.array([False for x in R]).astype(bool) # treat all "difficult" as GT det = [False] * len(R) npos = npos + sum(~difficult) class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} # read dets detfile = detpath.format(classname) with open(detfile, "r") as f: lines = f.readlines() splitlines = [x.strip().split(" ") for x in lines] image_ids = [x[0] for x in splitlines] confidence = np.array([float(x[1]) for x in splitlines]) BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4) # sort by confidence sorted_ind = np.argsort(-confidence) BB = BB[sorted_ind, :] image_ids = [image_ids[x] for x in sorted_ind] # go down dets and mark TPs and FPs nd = len(image_ids) tp = np.zeros(nd) fp = np.zeros(nd) for d in range(nd): R = class_recs[image_ids[d]] bb = BB[d, :].astype(float) ovmax = -np.inf BBGT = R["bbox"].astype(float) if BBGT.size > 0: # compute overlaps # intersection ixmin = np.maximum(BBGT[:, 0], bb[0]) iymin = np.maximum(BBGT[:, 1], bb[1]) ixmax = np.minimum(BBGT[:, 2], bb[2]) iymax = np.minimum(BBGT[:, 3], bb[3]) iw = np.maximum(ixmax - ixmin + 1.0, 0.0) ih = np.maximum(iymax - iymin + 1.0, 0.0) inters = iw * ih # union uni = ( (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0) + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) - inters ) overlaps = inters / uni ovmax = np.max(overlaps) jmax = np.argmax(overlaps) if ovmax > ovthresh: if not R["difficult"][jmax]: if not R["det"][jmax]: tp[d] = 1.0 R["det"][jmax] = 1 else: fp[d] = 1.0 else: fp[d] = 1.0 # compute precision recall fp = np.cumsum(fp) tp = np.cumsum(tp) rec = tp / float(npos) # avoid divide by zero in case the first detection matches a difficult # ground truth prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) ap = voc_ap(rec, prec, use_07_metric) return rec, prec, ap ================================================ FILE: annotator/oneformer/detectron2/evaluation/rotated_coco_evaluation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import itertools import json import numpy as np import os import torch from annotator.oneformer.pycocotools.cocoeval import COCOeval, maskUtils from annotator.oneformer.detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated from annotator.oneformer.detectron2.utils.file_io import PathManager from .coco_evaluation import COCOEvaluator class RotatedCOCOeval(COCOeval): @staticmethod def is_rotated(box_list): if type(box_list) == np.ndarray: return box_list.shape[1] == 5 elif type(box_list) == list: if box_list == []: # cannot decide the box_dim return False return np.all( np.array( [ (len(obj) == 5) and ((type(obj) == list) or (type(obj) == np.ndarray)) for obj in box_list ] ) ) return False @staticmethod def boxlist_to_tensor(boxlist, output_box_dim): if type(boxlist) == np.ndarray: box_tensor = torch.from_numpy(boxlist) elif type(boxlist) == list: if boxlist == []: return torch.zeros((0, output_box_dim), dtype=torch.float32) else: box_tensor = torch.FloatTensor(boxlist) else: raise Exception("Unrecognized boxlist type") input_box_dim = box_tensor.shape[1] if input_box_dim != output_box_dim: if input_box_dim == 4 and output_box_dim == 5: box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS) else: raise Exception( "Unable to convert from {}-dim box to {}-dim box".format( input_box_dim, output_box_dim ) ) return box_tensor def compute_iou_dt_gt(self, dt, gt, is_crowd): if self.is_rotated(dt) or self.is_rotated(gt): # TODO: take is_crowd into consideration assert all(c == 0 for c in is_crowd) dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5)) gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5)) return pairwise_iou_rotated(dt, gt) else: # This is the same as the classical COCO evaluation return maskUtils.iou(dt, gt, is_crowd) def computeIoU(self, imgId, catId): p = self.params if p.useCats: gt = self._gts[imgId, catId] dt = self._dts[imgId, catId] else: gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] if len(gt) == 0 and len(dt) == 0: return [] inds = np.argsort([-d["score"] for d in dt], kind="mergesort") dt = [dt[i] for i in inds] if len(dt) > p.maxDets[-1]: dt = dt[0 : p.maxDets[-1]] assert p.iouType == "bbox", "unsupported iouType for iou computation" g = [g["bbox"] for g in gt] d = [d["bbox"] for d in dt] # compute iou between each dt and gt region iscrowd = [int(o["iscrowd"]) for o in gt] # Note: this function is copied from cocoeval.py in cocoapi # and the major difference is here. ious = self.compute_iou_dt_gt(d, g, iscrowd) return ious class RotatedCOCOEvaluator(COCOEvaluator): """ Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs, with rotated boxes support. Note: this uses IOU only and does not consider angle differences. """ def process(self, inputs, outputs): """ Args: inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). It is a list of dict. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a COCO model. It is a list of dicts with key "instances" that contains :class:`Instances`. """ for input, output in zip(inputs, outputs): prediction = {"image_id": input["image_id"]} if "instances" in output: instances = output["instances"].to(self._cpu_device) prediction["instances"] = self.instances_to_json(instances, input["image_id"]) if "proposals" in output: prediction["proposals"] = output["proposals"].to(self._cpu_device) self._predictions.append(prediction) def instances_to_json(self, instances, img_id): num_instance = len(instances) if num_instance == 0: return [] boxes = instances.pred_boxes.tensor.numpy() if boxes.shape[1] == 4: boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) boxes = boxes.tolist() scores = instances.scores.tolist() classes = instances.pred_classes.tolist() results = [] for k in range(num_instance): result = { "image_id": img_id, "category_id": classes[k], "bbox": boxes[k], "score": scores[k], } results.append(result) return results def _eval_predictions(self, predictions, img_ids=None): # img_ids: unused """ Evaluate predictions on the given tasks. Fill self._results with the metrics of the tasks. """ self._logger.info("Preparing results for COCO format ...") coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): reverse_id_mapping = { v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() } for result in coco_results: result["category_id"] = reverse_id_mapping[result["category_id"]] if self._output_dir: file_path = os.path.join(self._output_dir, "coco_instances_results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(coco_results)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info("Evaluating predictions ...") assert self._tasks is None or set(self._tasks) == { "bbox" }, "[RotatedCOCOEvaluator] Only bbox evaluation is supported" coco_eval = ( self._evaluate_predictions_on_coco(self._coco_api, coco_results) if len(coco_results) > 0 else None # cocoapi does not handle empty results very well ) task = "bbox" res = self._derive_coco_results( coco_eval, task, class_names=self._metadata.get("thing_classes") ) self._results[task] = res def _evaluate_predictions_on_coco(self, coco_gt, coco_results): """ Evaluate the coco results using COCOEval API. """ assert len(coco_results) > 0 coco_dt = coco_gt.loadRes(coco_results) # Only bbox is supported for now coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox") coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return coco_eval ================================================ FILE: annotator/oneformer/detectron2/evaluation/sem_seg_evaluation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import itertools import json import logging import numpy as np import os from collections import OrderedDict from typing import Optional, Union import annotator.oneformer.pycocotools.mask as mask_util import torch from PIL import Image from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.utils.comm import all_gather, is_main_process, synchronize from annotator.oneformer.detectron2.utils.file_io import PathManager from .evaluator import DatasetEvaluator _CV2_IMPORTED = True try: import cv2 # noqa except ImportError: # OpenCV is an optional dependency at the moment _CV2_IMPORTED = False def load_image_into_numpy_array( filename: str, copy: bool = False, dtype: Optional[Union[np.dtype, str]] = None, ) -> np.ndarray: with PathManager.open(filename, "rb") as f: array = np.array(Image.open(f), copy=copy, dtype=dtype) return array class SemSegEvaluator(DatasetEvaluator): """ Evaluate semantic segmentation metrics. """ def __init__( self, dataset_name, distributed=True, output_dir=None, *, sem_seg_loading_fn=load_image_into_numpy_array, num_classes=None, ignore_label=None, ): """ Args: dataset_name (str): name of the dataset to be evaluated. distributed (bool): if True, will collect results from all ranks for evaluation. Otherwise, will evaluate the results in the current process. output_dir (str): an output directory to dump results. sem_seg_loading_fn: function to read sem seg file and load into numpy array. Default provided, but projects can customize. num_classes, ignore_label: deprecated argument """ self._logger = logging.getLogger(__name__) if num_classes is not None: self._logger.warn( "SemSegEvaluator(num_classes) is deprecated! It should be obtained from metadata." ) if ignore_label is not None: self._logger.warn( "SemSegEvaluator(ignore_label) is deprecated! It should be obtained from metadata." ) self._dataset_name = dataset_name self._distributed = distributed self._output_dir = output_dir self._cpu_device = torch.device("cpu") self.input_file_to_gt_file = { dataset_record["file_name"]: dataset_record["sem_seg_file_name"] for dataset_record in DatasetCatalog.get(dataset_name) } meta = MetadataCatalog.get(dataset_name) # Dict that maps contiguous training ids to COCO category ids try: c2d = meta.stuff_dataset_id_to_contiguous_id self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()} except AttributeError: self._contiguous_id_to_dataset_id = None self._class_names = meta.stuff_classes self.sem_seg_loading_fn = sem_seg_loading_fn self._num_classes = len(meta.stuff_classes) if num_classes is not None: assert self._num_classes == num_classes, f"{self._num_classes} != {num_classes}" self._ignore_label = ignore_label if ignore_label is not None else meta.ignore_label # This is because cv2.erode did not work for int datatype. Only works for uint8. self._compute_boundary_iou = True if not _CV2_IMPORTED: self._compute_boundary_iou = False self._logger.warn( """Boundary IoU calculation requires OpenCV. B-IoU metrics are not going to be computed because OpenCV is not available to import.""" ) if self._num_classes >= np.iinfo(np.uint8).max: self._compute_boundary_iou = False self._logger.warn( f"""SemSegEvaluator(num_classes) is more than supported value for Boundary IoU calculation! B-IoU metrics are not going to be computed. Max allowed value (exclusive) for num_classes for calculating Boundary IoU is {np.iinfo(np.uint8).max}. The number of classes of dataset {self._dataset_name} is {self._num_classes}""" ) def reset(self): self._conf_matrix = np.zeros((self._num_classes + 1, self._num_classes + 1), dtype=np.int64) self._b_conf_matrix = np.zeros( (self._num_classes + 1, self._num_classes + 1), dtype=np.int64 ) self._predictions = [] def process(self, inputs, outputs): """ Args: inputs: the inputs to a model. It is a list of dicts. Each dict corresponds to an image and contains keys like "height", "width", "file_name". outputs: the outputs of a model. It is either list of semantic segmentation predictions (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic segmentation prediction in the same format. """ for input, output in zip(inputs, outputs): output = output["sem_seg"].argmax(dim=0).to(self._cpu_device) pred = np.array(output, dtype=np.int) gt_filename = self.input_file_to_gt_file[input["file_name"]] gt = self.sem_seg_loading_fn(gt_filename, dtype=np.int) gt[gt == self._ignore_label] = self._num_classes self._conf_matrix += np.bincount( (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1), minlength=self._conf_matrix.size, ).reshape(self._conf_matrix.shape) if self._compute_boundary_iou: b_gt = self._mask_to_boundary(gt.astype(np.uint8)) b_pred = self._mask_to_boundary(pred.astype(np.uint8)) self._b_conf_matrix += np.bincount( (self._num_classes + 1) * b_pred.reshape(-1) + b_gt.reshape(-1), minlength=self._conf_matrix.size, ).reshape(self._conf_matrix.shape) self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"])) def evaluate(self): """ Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval): * Mean intersection-over-union averaged across classes (mIoU) * Frequency Weighted IoU (fwIoU) * Mean pixel accuracy averaged across classes (mACC) * Pixel Accuracy (pACC) """ if self._distributed: synchronize() conf_matrix_list = all_gather(self._conf_matrix) b_conf_matrix_list = all_gather(self._b_conf_matrix) self._predictions = all_gather(self._predictions) self._predictions = list(itertools.chain(*self._predictions)) if not is_main_process(): return self._conf_matrix = np.zeros_like(self._conf_matrix) for conf_matrix in conf_matrix_list: self._conf_matrix += conf_matrix self._b_conf_matrix = np.zeros_like(self._b_conf_matrix) for b_conf_matrix in b_conf_matrix_list: self._b_conf_matrix += b_conf_matrix if self._output_dir: PathManager.mkdirs(self._output_dir) file_path = os.path.join(self._output_dir, "sem_seg_predictions.json") with PathManager.open(file_path, "w") as f: f.write(json.dumps(self._predictions)) acc = np.full(self._num_classes, np.nan, dtype=np.float) iou = np.full(self._num_classes, np.nan, dtype=np.float) tp = self._conf_matrix.diagonal()[:-1].astype(np.float) pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float) class_weights = pos_gt / np.sum(pos_gt) pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float) acc_valid = pos_gt > 0 acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid] union = pos_gt + pos_pred - tp iou_valid = np.logical_and(acc_valid, union > 0) iou[iou_valid] = tp[iou_valid] / union[iou_valid] macc = np.sum(acc[acc_valid]) / np.sum(acc_valid) miou = np.sum(iou[iou_valid]) / np.sum(iou_valid) fiou = np.sum(iou[iou_valid] * class_weights[iou_valid]) pacc = np.sum(tp) / np.sum(pos_gt) if self._compute_boundary_iou: b_iou = np.full(self._num_classes, np.nan, dtype=np.float) b_tp = self._b_conf_matrix.diagonal()[:-1].astype(np.float) b_pos_gt = np.sum(self._b_conf_matrix[:-1, :-1], axis=0).astype(np.float) b_pos_pred = np.sum(self._b_conf_matrix[:-1, :-1], axis=1).astype(np.float) b_union = b_pos_gt + b_pos_pred - b_tp b_iou_valid = b_union > 0 b_iou[b_iou_valid] = b_tp[b_iou_valid] / b_union[b_iou_valid] res = {} res["mIoU"] = 100 * miou res["fwIoU"] = 100 * fiou for i, name in enumerate(self._class_names): res[f"IoU-{name}"] = 100 * iou[i] if self._compute_boundary_iou: res[f"BoundaryIoU-{name}"] = 100 * b_iou[i] res[f"min(IoU, B-Iou)-{name}"] = 100 * min(iou[i], b_iou[i]) res["mACC"] = 100 * macc res["pACC"] = 100 * pacc for i, name in enumerate(self._class_names): res[f"ACC-{name}"] = 100 * acc[i] if self._output_dir: file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth") with PathManager.open(file_path, "wb") as f: torch.save(res, f) results = OrderedDict({"sem_seg": res}) self._logger.info(results) return results def encode_json_sem_seg(self, sem_seg, input_file_name): """ Convert semantic segmentation to COCO stuff format with segments encoded as RLEs. See http://cocodataset.org/#format-results """ json_list = [] for label in np.unique(sem_seg): if self._contiguous_id_to_dataset_id is not None: assert ( label in self._contiguous_id_to_dataset_id ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name) dataset_id = self._contiguous_id_to_dataset_id[label] else: dataset_id = int(label) mask = (sem_seg == label).astype(np.uint8) mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0] mask_rle["counts"] = mask_rle["counts"].decode("utf-8") json_list.append( {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle} ) return json_list def _mask_to_boundary(self, mask: np.ndarray, dilation_ratio=0.02): assert mask.ndim == 2, "mask_to_boundary expects a 2-dimensional image" h, w = mask.shape diag_len = np.sqrt(h**2 + w**2) dilation = max(1, int(round(dilation_ratio * diag_len))) kernel = np.ones((3, 3), dtype=np.uint8) padded_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0) eroded_mask_with_padding = cv2.erode(padded_mask, kernel, iterations=dilation) eroded_mask = eroded_mask_with_padding[1:-1, 1:-1] boundary = mask - eroded_mask return boundary ================================================ FILE: annotator/oneformer/detectron2/evaluation/testing.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np import pprint import sys from collections.abc import Mapping def print_csv_format(results): """ Print main metrics in a format similar to Detectron, so that they are easy to copypaste into a spreadsheet. Args: results (OrderedDict[dict]): task_name -> {metric -> score} unordered dict can also be printed, but in arbitrary order """ assert isinstance(results, Mapping) or not len(results), results logger = logging.getLogger(__name__) for task, res in results.items(): if isinstance(res, Mapping): # Don't print "AP-category" metrics since they are usually not tracked. important_res = [(k, v) for k, v in res.items() if "-" not in k] logger.info("copypaste: Task: {}".format(task)) logger.info("copypaste: " + ",".join([k[0] for k in important_res])) logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res])) else: logger.info(f"copypaste: {task}={res}") def verify_results(cfg, results): """ Args: results (OrderedDict[dict]): task_name -> {metric -> score} Returns: bool: whether the verification succeeds or not """ expected_results = cfg.TEST.EXPECTED_RESULTS if not len(expected_results): return True ok = True for task, metric, expected, tolerance in expected_results: actual = results[task].get(metric, None) if actual is None: ok = False continue if not np.isfinite(actual): ok = False continue diff = abs(actual - expected) if diff > tolerance: ok = False logger = logging.getLogger(__name__) if not ok: logger.error("Result verification failed!") logger.error("Expected Results: " + str(expected_results)) logger.error("Actual Results: " + pprint.pformat(results)) sys.exit(1) else: logger.info("Results verification passed.") return ok def flatten_results_dict(results): """ Expand a hierarchical dict of scalars into a flat dict of scalars. If results[k1][k2][k3] = v, the returned dict will have the entry {"k1/k2/k3": v}. Args: results (dict): """ r = {} for k, v in results.items(): if isinstance(v, Mapping): v = flatten_results_dict(v) for kk, vv in v.items(): r[k + "/" + kk] = vv else: r[k] = v return r ================================================ FILE: annotator/oneformer/detectron2/export/README.md ================================================ This directory contains code to prepare a detectron2 model for deployment. Currently it supports exporting a detectron2 model to TorchScript, ONNX, or (deprecated) Caffe2 format. Please see [documentation](https://detectron2.readthedocs.io/tutorials/deployment.html) for its usage. ### Acknowledgements Thanks to Mobile Vision team at Facebook for developing the Caffe2 conversion tools. Thanks to Computing Platform Department - PAI team at Alibaba Group (@bddpqq, @chenbohua3) who help export Detectron2 models to TorchScript. Thanks to ONNX Converter team at Microsoft who help export Detectron2 models to ONNX. ================================================ FILE: annotator/oneformer/detectron2/export/__init__.py ================================================ # -*- coding: utf-8 -*- import warnings from .flatten import TracingAdapter from .torchscript import dump_torchscript_IR, scripting_with_instances try: from caffe2.proto import caffe2_pb2 as _tmp from caffe2.python import core # caffe2 is optional except ImportError: pass else: from .api import * # TODO: Update ONNX Opset version and run tests when a newer PyTorch is supported STABLE_ONNX_OPSET_VERSION = 11 def add_export_config(cfg): warnings.warn( "add_export_config has been deprecated and behaves as no-op function.", DeprecationWarning ) return cfg __all__ = [k for k in globals().keys() if not k.startswith("_")] ================================================ FILE: annotator/oneformer/detectron2/export/api.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import logging import os import torch from caffe2.proto import caffe2_pb2 from torch import nn from annotator.oneformer.detectron2.config import CfgNode from annotator.oneformer.detectron2.utils.file_io import PathManager from .caffe2_inference import ProtobufDetectionModel from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format from .shared import get_pb_arg_vali, get_pb_arg_vals, save_graph __all__ = [ "Caffe2Model", "Caffe2Tracer", ] class Caffe2Tracer: """ Make a detectron2 model traceable with Caffe2 operators. This class creates a traceable version of a detectron2 model which: 1. Rewrite parts of the model using ops in Caffe2. Note that some ops do not have GPU implementation in Caffe2. 2. Remove post-processing and only produce raw layer outputs After making a traceable model, the class provide methods to export such a model to different deployment formats. Exported graph produced by this class take two input tensors: 1. (1, C, H, W) float "data" which is an image (usually in [0, 255]). (H, W) often has to be padded to multiple of 32 (depend on the model architecture). 2. 1x3 float "im_info", each row of which is (height, width, 1.0). Height and width are true image shapes before padding. The class currently only supports models using builtin meta architectures. Batch inference is not supported, and contributions are welcome. """ def __init__(self, cfg: CfgNode, model: nn.Module, inputs): """ Args: cfg (CfgNode): a detectron2 config used to construct caffe2-compatible model. model (nn.Module): An original pytorch model. Must be among a few official models in detectron2 that can be converted to become caffe2-compatible automatically. Weights have to be already loaded to this model. inputs: sample inputs that the given model takes for inference. Will be used to trace the model. For most models, random inputs with no detected objects will not work as they lead to wrong traces. """ assert isinstance(cfg, CfgNode), cfg assert isinstance(model, torch.nn.Module), type(model) # TODO make it support custom models, by passing in c2 model directly C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[cfg.MODEL.META_ARCHITECTURE] self.traceable_model = C2MetaArch(cfg, copy.deepcopy(model)) self.inputs = inputs self.traceable_inputs = self.traceable_model.get_caffe2_inputs(inputs) def export_caffe2(self): """ Export the model to Caffe2's protobuf format. The returned object can be saved with its :meth:`.save_protobuf()` method. The result can be loaded and executed using Caffe2 runtime. Returns: :class:`Caffe2Model` """ from .caffe2_export import export_caffe2_detection_model predict_net, init_net = export_caffe2_detection_model( self.traceable_model, self.traceable_inputs ) return Caffe2Model(predict_net, init_net) def export_onnx(self): """ Export the model to ONNX format. Note that the exported model contains custom ops only available in caffe2, therefore it cannot be directly executed by other runtime (such as onnxruntime or TensorRT). Post-processing or transformation passes may be applied on the model to accommodate different runtimes, but we currently do not provide support for them. Returns: onnx.ModelProto: an onnx model. """ from .caffe2_export import export_onnx_model as export_onnx_model_impl return export_onnx_model_impl(self.traceable_model, (self.traceable_inputs,)) def export_torchscript(self): """ Export the model to a ``torch.jit.TracedModule`` by tracing. The returned object can be saved to a file by ``.save()``. Returns: torch.jit.TracedModule: a torch TracedModule """ logger = logging.getLogger(__name__) logger.info("Tracing the model with torch.jit.trace ...") with torch.no_grad(): return torch.jit.trace(self.traceable_model, (self.traceable_inputs,)) class Caffe2Model(nn.Module): """ A wrapper around the traced model in Caffe2's protobuf format. The exported graph has different inputs/outputs from the original Pytorch model, as explained in :class:`Caffe2Tracer`. This class wraps around the exported graph to simulate the same interface as the original Pytorch model. It also provides functions to save/load models in Caffe2's format.' Examples: :: c2_model = Caffe2Tracer(cfg, torch_model, inputs).export_caffe2() inputs = [{"image": img_tensor_CHW}] outputs = c2_model(inputs) orig_outputs = torch_model(inputs) """ def __init__(self, predict_net, init_net): super().__init__() self.eval() # always in eval mode self._predict_net = predict_net self._init_net = init_net self._predictor = None __init__.__HIDE_SPHINX_DOC__ = True @property def predict_net(self): """ caffe2.core.Net: the underlying caffe2 predict net """ return self._predict_net @property def init_net(self): """ caffe2.core.Net: the underlying caffe2 init net """ return self._init_net def save_protobuf(self, output_dir): """ Save the model as caffe2's protobuf format. It saves the following files: * "model.pb": definition of the graph. Can be visualized with tools like `netron `_. * "model_init.pb": model parameters * "model.pbtxt": human-readable definition of the graph. Not needed for deployment. Args: output_dir (str): the output directory to save protobuf files. """ logger = logging.getLogger(__name__) logger.info("Saving model to {} ...".format(output_dir)) if not PathManager.exists(output_dir): PathManager.mkdirs(output_dir) with PathManager.open(os.path.join(output_dir, "model.pb"), "wb") as f: f.write(self._predict_net.SerializeToString()) with PathManager.open(os.path.join(output_dir, "model.pbtxt"), "w") as f: f.write(str(self._predict_net)) with PathManager.open(os.path.join(output_dir, "model_init.pb"), "wb") as f: f.write(self._init_net.SerializeToString()) def save_graph(self, output_file, inputs=None): """ Save the graph as SVG format. Args: output_file (str): a SVG file inputs: optional inputs given to the model. If given, the inputs will be used to run the graph to record shape of every tensor. The shape information will be saved together with the graph. """ from .caffe2_export import run_and_save_graph if inputs is None: save_graph(self._predict_net, output_file, op_only=False) else: size_divisibility = get_pb_arg_vali(self._predict_net, "size_divisibility", 0) device = get_pb_arg_vals(self._predict_net, "device", b"cpu").decode("ascii") inputs = convert_batched_inputs_to_c2_format(inputs, size_divisibility, device) inputs = [x.cpu().numpy() for x in inputs] run_and_save_graph(self._predict_net, self._init_net, inputs, output_file) @staticmethod def load_protobuf(dir): """ Args: dir (str): a directory used to save Caffe2Model with :meth:`save_protobuf`. The files "model.pb" and "model_init.pb" are needed. Returns: Caffe2Model: the caffe2 model loaded from this directory. """ predict_net = caffe2_pb2.NetDef() with PathManager.open(os.path.join(dir, "model.pb"), "rb") as f: predict_net.ParseFromString(f.read()) init_net = caffe2_pb2.NetDef() with PathManager.open(os.path.join(dir, "model_init.pb"), "rb") as f: init_net.ParseFromString(f.read()) return Caffe2Model(predict_net, init_net) def __call__(self, inputs): """ An interface that wraps around a Caffe2 model and mimics detectron2's models' input/output format. See details about the format at :doc:`/tutorials/models`. This is used to compare the outputs of caffe2 model with its original torch model. Due to the extra conversion between Pytorch/Caffe2, this method is not meant for benchmark. Because of the conversion, this method also has dependency on detectron2 in order to convert to detectron2's output format. """ if self._predictor is None: self._predictor = ProtobufDetectionModel(self._predict_net, self._init_net) return self._predictor(inputs) ================================================ FILE: annotator/oneformer/detectron2/export/c10.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import math from typing import Dict import torch import torch.nn.functional as F from annotator.oneformer.detectron2.layers import ShapeSpec, cat from annotator.oneformer.detectron2.layers.roi_align_rotated import ROIAlignRotated from annotator.oneformer.detectron2.modeling import poolers from annotator.oneformer.detectron2.modeling.proposal_generator import rpn from annotator.oneformer.detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, Keypoints, RotatedBoxes from .shared import alias, to_device """ This file contains caffe2-compatible implementation of several detectron2 components. """ class Caffe2Boxes(Boxes): """ Representing a list of detectron2.structures.Boxes from minibatch, each box is represented by a 5d vector (batch index + 4 coordinates), or a 6d vector (batch index + 5 coordinates) for RotatedBoxes. """ def __init__(self, tensor): assert isinstance(tensor, torch.Tensor) assert tensor.dim() == 2 and tensor.size(-1) in [4, 5, 6], tensor.size() # TODO: make tensor immutable when dim is Nx5 for Boxes, # and Nx6 for RotatedBoxes? self.tensor = tensor # TODO clean up this class, maybe just extend Instances class InstancesList(object): """ Tensor representation of a list of Instances object for a batch of images. When dealing with a batch of images with Caffe2 ops, a list of bboxes (instances) are usually represented by single Tensor with size (sigma(Ni), 5) or (sigma(Ni), 4) plus a batch split Tensor. This class is for providing common functions to convert between these two representations. """ def __init__(self, im_info, indices, extra_fields=None): # [N, 3] -> (H, W, Scale) self.im_info = im_info # [N,] -> indice of batch to which the instance belongs self.indices = indices # [N, ...] self.batch_extra_fields = extra_fields or {} self.image_size = self.im_info def get_fields(self): """like `get_fields` in the Instances object, but return each field in tensor representations""" ret = {} for k, v in self.batch_extra_fields.items(): # if isinstance(v, torch.Tensor): # tensor_rep = v # elif isinstance(v, (Boxes, Keypoints)): # tensor_rep = v.tensor # else: # raise ValueError("Can't find tensor representation for: {}".format()) ret[k] = v return ret def has(self, name): return name in self.batch_extra_fields def set(self, name, value): # len(tensor) is a bad practice that generates ONNX constants during tracing. # Although not a problem for the `assert` statement below, torch ONNX exporter # still raises a misleading warning as it does not this call comes from `assert` if isinstance(value, Boxes): data_len = value.tensor.shape[0] elif isinstance(value, torch.Tensor): data_len = value.shape[0] else: data_len = len(value) if len(self.batch_extra_fields): assert ( len(self) == data_len ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self)) self.batch_extra_fields[name] = value def __getattr__(self, name): if name not in self.batch_extra_fields: raise AttributeError("Cannot find field '{}' in the given Instances!".format(name)) return self.batch_extra_fields[name] def __len__(self): return len(self.indices) def flatten(self): ret = [] for _, v in self.batch_extra_fields.items(): if isinstance(v, (Boxes, Keypoints)): ret.append(v.tensor) else: ret.append(v) return ret @staticmethod def to_d2_instances_list(instances_list): """ Convert InstancesList to List[Instances]. The input `instances_list` can also be a List[Instances], in this case this method is a non-op. """ if not isinstance(instances_list, InstancesList): assert all(isinstance(x, Instances) for x in instances_list) return instances_list ret = [] for i, info in enumerate(instances_list.im_info): instances = Instances(torch.Size([int(info[0].item()), int(info[1].item())])) ids = instances_list.indices == i for k, v in instances_list.batch_extra_fields.items(): if isinstance(v, torch.Tensor): instances.set(k, v[ids]) continue elif isinstance(v, Boxes): instances.set(k, v[ids, -4:]) continue target_type, tensor_source = v assert isinstance(tensor_source, torch.Tensor) assert tensor_source.shape[0] == instances_list.indices.shape[0] tensor_source = tensor_source[ids] if issubclass(target_type, Boxes): instances.set(k, Boxes(tensor_source[:, -4:])) elif issubclass(target_type, Keypoints): instances.set(k, Keypoints(tensor_source)) elif issubclass(target_type, torch.Tensor): instances.set(k, tensor_source) else: raise ValueError("Can't handle targe type: {}".format(target_type)) ret.append(instances) return ret class Caffe2Compatible(object): """ A model can inherit this class to indicate that it can be traced and deployed with caffe2. """ def _get_tensor_mode(self): return self._tensor_mode def _set_tensor_mode(self, v): self._tensor_mode = v tensor_mode = property(_get_tensor_mode, _set_tensor_mode) """ If true, the model expects C2-style tensor only inputs/outputs format. """ class Caffe2RPN(Caffe2Compatible, rpn.RPN): @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = super(Caffe2Compatible, cls).from_config(cfg, input_shape) assert tuple(cfg.MODEL.RPN.BBOX_REG_WEIGHTS) == (1.0, 1.0, 1.0, 1.0) or tuple( cfg.MODEL.RPN.BBOX_REG_WEIGHTS ) == (1.0, 1.0, 1.0, 1.0, 1.0) return ret def _generate_proposals( self, images, objectness_logits_pred, anchor_deltas_pred, gt_instances=None ): assert isinstance(images, ImageList) if self.tensor_mode: im_info = images.image_sizes else: im_info = torch.tensor([[im_sz[0], im_sz[1], 1.0] for im_sz in images.image_sizes]).to( images.tensor.device ) assert isinstance(im_info, torch.Tensor) rpn_rois_list = [] rpn_roi_probs_list = [] for scores, bbox_deltas, cell_anchors_tensor, feat_stride in zip( objectness_logits_pred, anchor_deltas_pred, [b for (n, b) in self.anchor_generator.cell_anchors.named_buffers()], self.anchor_generator.strides, ): scores = scores.detach() bbox_deltas = bbox_deltas.detach() rpn_rois, rpn_roi_probs = torch.ops._caffe2.GenerateProposals( scores, bbox_deltas, im_info, cell_anchors_tensor, spatial_scale=1.0 / feat_stride, pre_nms_topN=self.pre_nms_topk[self.training], post_nms_topN=self.post_nms_topk[self.training], nms_thresh=self.nms_thresh, min_size=self.min_box_size, # correct_transform_coords=True, # deprecated argument angle_bound_on=True, # Default angle_bound_lo=-180, angle_bound_hi=180, clip_angle_thresh=1.0, # Default legacy_plus_one=False, ) rpn_rois_list.append(rpn_rois) rpn_roi_probs_list.append(rpn_roi_probs) # For FPN in D2, in RPN all proposals from different levels are concated # together, ranked and picked by top post_nms_topk. Then in ROIPooler # it calculates level_assignments and calls the RoIAlign from # the corresponding level. if len(objectness_logits_pred) == 1: rpn_rois = rpn_rois_list[0] rpn_roi_probs = rpn_roi_probs_list[0] else: assert len(rpn_rois_list) == len(rpn_roi_probs_list) rpn_post_nms_topN = self.post_nms_topk[self.training] device = rpn_rois_list[0].device input_list = [to_device(x, "cpu") for x in (rpn_rois_list + rpn_roi_probs_list)] # TODO remove this after confirming rpn_max_level/rpn_min_level # is not needed in CollectRpnProposals. feature_strides = list(self.anchor_generator.strides) rpn_min_level = int(math.log2(feature_strides[0])) rpn_max_level = int(math.log2(feature_strides[-1])) assert (rpn_max_level - rpn_min_level + 1) == len( rpn_rois_list ), "CollectRpnProposals requires continuous levels" rpn_rois = torch.ops._caffe2.CollectRpnProposals( input_list, # NOTE: in current implementation, rpn_max_level and rpn_min_level # are not needed, only the subtraction of two matters and it # can be infer from the number of inputs. Keep them now for # consistency. rpn_max_level=2 + len(rpn_rois_list) - 1, rpn_min_level=2, rpn_post_nms_topN=rpn_post_nms_topN, ) rpn_rois = to_device(rpn_rois, device) rpn_roi_probs = [] proposals = self.c2_postprocess(im_info, rpn_rois, rpn_roi_probs, self.tensor_mode) return proposals, {} def forward(self, images, features, gt_instances=None): assert not self.training features = [features[f] for f in self.in_features] objectness_logits_pred, anchor_deltas_pred = self.rpn_head(features) return self._generate_proposals( images, objectness_logits_pred, anchor_deltas_pred, gt_instances, ) @staticmethod def c2_postprocess(im_info, rpn_rois, rpn_roi_probs, tensor_mode): proposals = InstancesList( im_info=im_info, indices=rpn_rois[:, 0], extra_fields={ "proposal_boxes": Caffe2Boxes(rpn_rois), "objectness_logits": (torch.Tensor, rpn_roi_probs), }, ) if not tensor_mode: proposals = InstancesList.to_d2_instances_list(proposals) else: proposals = [proposals] return proposals class Caffe2ROIPooler(Caffe2Compatible, poolers.ROIPooler): @staticmethod def c2_preprocess(box_lists): assert all(isinstance(x, Boxes) for x in box_lists) if all(isinstance(x, Caffe2Boxes) for x in box_lists): # input is pure-tensor based assert len(box_lists) == 1 pooler_fmt_boxes = box_lists[0].tensor else: pooler_fmt_boxes = poolers.convert_boxes_to_pooler_format(box_lists) return pooler_fmt_boxes def forward(self, x, box_lists): assert not self.training pooler_fmt_boxes = self.c2_preprocess(box_lists) num_level_assignments = len(self.level_poolers) if num_level_assignments == 1: if isinstance(self.level_poolers[0], ROIAlignRotated): c2_roi_align = torch.ops._caffe2.RoIAlignRotated aligned = True else: c2_roi_align = torch.ops._caffe2.RoIAlign aligned = self.level_poolers[0].aligned x0 = x[0] if x0.is_quantized: x0 = x0.dequantize() out = c2_roi_align( x0, pooler_fmt_boxes, order="NCHW", spatial_scale=float(self.level_poolers[0].spatial_scale), pooled_h=int(self.output_size[0]), pooled_w=int(self.output_size[1]), sampling_ratio=int(self.level_poolers[0].sampling_ratio), aligned=aligned, ) return out device = pooler_fmt_boxes.device assert ( self.max_level - self.min_level + 1 == 4 ), "Currently DistributeFpnProposals only support 4 levels" fpn_outputs = torch.ops._caffe2.DistributeFpnProposals( to_device(pooler_fmt_boxes, "cpu"), roi_canonical_scale=self.canonical_box_size, roi_canonical_level=self.canonical_level, roi_max_level=self.max_level, roi_min_level=self.min_level, legacy_plus_one=False, ) fpn_outputs = [to_device(x, device) for x in fpn_outputs] rois_fpn_list = fpn_outputs[:-1] rois_idx_restore_int32 = fpn_outputs[-1] roi_feat_fpn_list = [] for roi_fpn, x_level, pooler in zip(rois_fpn_list, x, self.level_poolers): if isinstance(pooler, ROIAlignRotated): c2_roi_align = torch.ops._caffe2.RoIAlignRotated aligned = True else: c2_roi_align = torch.ops._caffe2.RoIAlign aligned = bool(pooler.aligned) if x_level.is_quantized: x_level = x_level.dequantize() roi_feat_fpn = c2_roi_align( x_level, roi_fpn, order="NCHW", spatial_scale=float(pooler.spatial_scale), pooled_h=int(self.output_size[0]), pooled_w=int(self.output_size[1]), sampling_ratio=int(pooler.sampling_ratio), aligned=aligned, ) roi_feat_fpn_list.append(roi_feat_fpn) roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0) assert roi_feat_shuffled.numel() > 0 and rois_idx_restore_int32.numel() > 0, ( "Caffe2 export requires tracing with a model checkpoint + input that can produce valid" " detections. But no detections were obtained with the given checkpoint and input!" ) roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled, rois_idx_restore_int32) return roi_feat class Caffe2FastRCNNOutputsInference: def __init__(self, tensor_mode): self.tensor_mode = tensor_mode # whether the output is caffe2 tensor mode def __call__(self, box_predictor, predictions, proposals): """equivalent to FastRCNNOutputLayers.inference""" num_classes = box_predictor.num_classes score_thresh = box_predictor.test_score_thresh nms_thresh = box_predictor.test_nms_thresh topk_per_image = box_predictor.test_topk_per_image is_rotated = len(box_predictor.box2box_transform.weights) == 5 if is_rotated: box_dim = 5 assert box_predictor.box2box_transform.weights[4] == 1, ( "The weights for Rotated BBoxTransform in C2 have only 4 dimensions," + " thus enforcing the angle weight to be 1 for now" ) box2box_transform_weights = box_predictor.box2box_transform.weights[:4] else: box_dim = 4 box2box_transform_weights = box_predictor.box2box_transform.weights class_logits, box_regression = predictions if num_classes + 1 == class_logits.shape[1]: class_prob = F.softmax(class_logits, -1) else: assert num_classes == class_logits.shape[1] class_prob = F.sigmoid(class_logits) # BoxWithNMSLimit will infer num_classes from the shape of the class_prob # So append a zero column as placeholder for the background class class_prob = torch.cat((class_prob, torch.zeros(class_prob.shape[0], 1)), dim=1) assert box_regression.shape[1] % box_dim == 0 cls_agnostic_bbox_reg = box_regression.shape[1] // box_dim == 1 input_tensor_mode = proposals[0].proposal_boxes.tensor.shape[1] == box_dim + 1 proposal_boxes = proposals[0].proposal_boxes if isinstance(proposal_boxes, Caffe2Boxes): rois = Caffe2Boxes.cat([p.proposal_boxes for p in proposals]) elif isinstance(proposal_boxes, RotatedBoxes): rois = RotatedBoxes.cat([p.proposal_boxes for p in proposals]) elif isinstance(proposal_boxes, Boxes): rois = Boxes.cat([p.proposal_boxes for p in proposals]) else: raise NotImplementedError( 'Expected proposals[0].proposal_boxes to be type "Boxes", ' f"instead got {type(proposal_boxes)}" ) device, dtype = rois.tensor.device, rois.tensor.dtype if input_tensor_mode: im_info = proposals[0].image_size rois = rois.tensor else: im_info = torch.tensor( [[sz[0], sz[1], 1.0] for sz in [x.image_size for x in proposals]] ) batch_ids = cat( [ torch.full((b, 1), i, dtype=dtype, device=device) for i, b in enumerate(len(p) for p in proposals) ], dim=0, ) rois = torch.cat([batch_ids, rois.tensor], dim=1) roi_pred_bbox, roi_batch_splits = torch.ops._caffe2.BBoxTransform( to_device(rois, "cpu"), to_device(box_regression, "cpu"), to_device(im_info, "cpu"), weights=box2box_transform_weights, apply_scale=True, rotated=is_rotated, angle_bound_on=True, angle_bound_lo=-180, angle_bound_hi=180, clip_angle_thresh=1.0, legacy_plus_one=False, ) roi_pred_bbox = to_device(roi_pred_bbox, device) roi_batch_splits = to_device(roi_batch_splits, device) nms_outputs = torch.ops._caffe2.BoxWithNMSLimit( to_device(class_prob, "cpu"), to_device(roi_pred_bbox, "cpu"), to_device(roi_batch_splits, "cpu"), score_thresh=float(score_thresh), nms=float(nms_thresh), detections_per_im=int(topk_per_image), soft_nms_enabled=False, soft_nms_method="linear", soft_nms_sigma=0.5, soft_nms_min_score_thres=0.001, rotated=is_rotated, cls_agnostic_bbox_reg=cls_agnostic_bbox_reg, input_boxes_include_bg_cls=False, output_classes_include_bg_cls=False, legacy_plus_one=False, ) roi_score_nms = to_device(nms_outputs[0], device) roi_bbox_nms = to_device(nms_outputs[1], device) roi_class_nms = to_device(nms_outputs[2], device) roi_batch_splits_nms = to_device(nms_outputs[3], device) roi_keeps_nms = to_device(nms_outputs[4], device) roi_keeps_size_nms = to_device(nms_outputs[5], device) if not self.tensor_mode: roi_class_nms = roi_class_nms.to(torch.int64) roi_batch_ids = cat( [ torch.full((b, 1), i, dtype=dtype, device=device) for i, b in enumerate(int(x.item()) for x in roi_batch_splits_nms) ], dim=0, ) roi_class_nms = alias(roi_class_nms, "class_nms") roi_score_nms = alias(roi_score_nms, "score_nms") roi_bbox_nms = alias(roi_bbox_nms, "bbox_nms") roi_batch_splits_nms = alias(roi_batch_splits_nms, "batch_splits_nms") roi_keeps_nms = alias(roi_keeps_nms, "keeps_nms") roi_keeps_size_nms = alias(roi_keeps_size_nms, "keeps_size_nms") results = InstancesList( im_info=im_info, indices=roi_batch_ids[:, 0], extra_fields={ "pred_boxes": Caffe2Boxes(roi_bbox_nms), "scores": roi_score_nms, "pred_classes": roi_class_nms, }, ) if not self.tensor_mode: results = InstancesList.to_d2_instances_list(results) batch_splits = roi_batch_splits_nms.int().tolist() kept_indices = list(roi_keeps_nms.to(torch.int64).split(batch_splits)) else: results = [results] kept_indices = [roi_keeps_nms] return results, kept_indices class Caffe2MaskRCNNInference: def __call__(self, pred_mask_logits, pred_instances): """equivalent to mask_head.mask_rcnn_inference""" if all(isinstance(x, InstancesList) for x in pred_instances): assert len(pred_instances) == 1 mask_probs_pred = pred_mask_logits.sigmoid() mask_probs_pred = alias(mask_probs_pred, "mask_fcn_probs") pred_instances[0].set("pred_masks", mask_probs_pred) else: mask_rcnn_inference(pred_mask_logits, pred_instances) class Caffe2KeypointRCNNInference: def __init__(self, use_heatmap_max_keypoint): self.use_heatmap_max_keypoint = use_heatmap_max_keypoint def __call__(self, pred_keypoint_logits, pred_instances): # just return the keypoint heatmap for now, # there will be option to call HeatmapMaxKeypointOp output = alias(pred_keypoint_logits, "kps_score") if all(isinstance(x, InstancesList) for x in pred_instances): assert len(pred_instances) == 1 if self.use_heatmap_max_keypoint: device = output.device output = torch.ops._caffe2.HeatmapMaxKeypoint( to_device(output, "cpu"), pred_instances[0].pred_boxes.tensor, should_output_softmax=True, # worth make it configerable? ) output = to_device(output, device) output = alias(output, "keypoints_out") pred_instances[0].set("pred_keypoints", output) return pred_keypoint_logits ================================================ FILE: annotator/oneformer/detectron2/export/caffe2_export.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import io import logging import numpy as np from typing import List import onnx import onnx.optimizer import torch from caffe2.proto import caffe2_pb2 from caffe2.python import core from caffe2.python.onnx.backend import Caffe2Backend from tabulate import tabulate from termcolor import colored from torch.onnx import OperatorExportTypes from .shared import ( ScopedWS, construct_init_net_from_params, fuse_alias_placeholder, fuse_copy_between_cpu_and_gpu, get_params_from_init_net, group_norm_replace_aten_with_caffe2, infer_device_type, remove_dead_end_ops, remove_reshape_for_fc, save_graph, ) logger = logging.getLogger(__name__) def export_onnx_model(model, inputs): """ Trace and export a model to onnx format. Args: model (nn.Module): inputs (tuple[args]): the model will be called by `model(*inputs)` Returns: an onnx model """ assert isinstance(model, torch.nn.Module) # make sure all modules are in eval mode, onnx may change the training state # of the module if the states are not consistent def _check_eval(module): assert not module.training model.apply(_check_eval) # Export the model to ONNX with torch.no_grad(): with io.BytesIO() as f: torch.onnx.export( model, inputs, f, operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, # verbose=True, # NOTE: uncomment this for debugging # export_params=True, ) onnx_model = onnx.load_from_string(f.getvalue()) return onnx_model def _op_stats(net_def): type_count = {} for t in [op.type for op in net_def.op]: type_count[t] = type_count.get(t, 0) + 1 type_count_list = sorted(type_count.items(), key=lambda kv: kv[0]) # alphabet type_count_list = sorted(type_count_list, key=lambda kv: -kv[1]) # count return "\n".join("{:>4}x {}".format(count, name) for name, count in type_count_list) def _assign_device_option( predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, tensor_inputs: List[torch.Tensor] ): """ ONNX exported network doesn't have concept of device, assign necessary device option for each op in order to make it runable on GPU runtime. """ def _get_device_type(torch_tensor): assert torch_tensor.device.type in ["cpu", "cuda"] assert torch_tensor.device.index == 0 return torch_tensor.device.type def _assign_op_device_option(net_proto, net_ssa, blob_device_types): for op, ssa_i in zip(net_proto.op, net_ssa): if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]: op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0)) else: devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]] assert all(d == devices[0] for d in devices) if devices[0] == "cuda": op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0)) # update ops in predict_net predict_net_input_device_types = { (name, 0): _get_device_type(tensor) for name, tensor in zip(predict_net.external_input, tensor_inputs) } predict_net_device_types = infer_device_type( predict_net, known_status=predict_net_input_device_types, device_name_style="pytorch" ) predict_net_ssa, _ = core.get_ssa(predict_net) _assign_op_device_option(predict_net, predict_net_ssa, predict_net_device_types) # update ops in init_net init_net_ssa, versions = core.get_ssa(init_net) init_net_output_device_types = { (name, versions[name]): predict_net_device_types[(name, 0)] for name in init_net.external_output } init_net_device_types = infer_device_type( init_net, known_status=init_net_output_device_types, device_name_style="pytorch" ) _assign_op_device_option(init_net, init_net_ssa, init_net_device_types) def export_caffe2_detection_model(model: torch.nn.Module, tensor_inputs: List[torch.Tensor]): """ Export a caffe2-compatible Detectron2 model to caffe2 format via ONNX. Arg: model: a caffe2-compatible version of detectron2 model, defined in caffe2_modeling.py tensor_inputs: a list of tensors that caffe2 model takes as input. """ model = copy.deepcopy(model) assert isinstance(model, torch.nn.Module) assert hasattr(model, "encode_additional_info") # Export via ONNX logger.info( "Exporting a {} model via ONNX ...".format(type(model).__name__) + " Some warnings from ONNX are expected and are usually not to worry about." ) onnx_model = export_onnx_model(model, (tensor_inputs,)) # Convert ONNX model to Caffe2 protobuf init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model) ops_table = [[op.type, op.input, op.output] for op in predict_net.op] table = tabulate(ops_table, headers=["type", "input", "output"], tablefmt="pipe") logger.info( "ONNX export Done. Exported predict_net (before optimizations):\n" + colored(table, "cyan") ) # Apply protobuf optimization fuse_alias_placeholder(predict_net, init_net) if any(t.device.type != "cpu" for t in tensor_inputs): fuse_copy_between_cpu_and_gpu(predict_net) remove_dead_end_ops(init_net) _assign_device_option(predict_net, init_net, tensor_inputs) params, device_options = get_params_from_init_net(init_net) predict_net, params = remove_reshape_for_fc(predict_net, params) init_net = construct_init_net_from_params(params, device_options) group_norm_replace_aten_with_caffe2(predict_net) # Record necessary information for running the pb model in Detectron2 system. model.encode_additional_info(predict_net, init_net) logger.info("Operators used in predict_net: \n{}".format(_op_stats(predict_net))) logger.info("Operators used in init_net: \n{}".format(_op_stats(init_net))) return predict_net, init_net def run_and_save_graph(predict_net, init_net, tensor_inputs, graph_save_path): """ Run the caffe2 model on given inputs, recording the shape and draw the graph. predict_net/init_net: caffe2 model. tensor_inputs: a list of tensors that caffe2 model takes as input. graph_save_path: path for saving graph of exported model. """ logger.info("Saving graph of ONNX exported model to {} ...".format(graph_save_path)) save_graph(predict_net, graph_save_path, op_only=False) # Run the exported Caffe2 net logger.info("Running ONNX exported model ...") with ScopedWS("__ws_tmp__", True) as ws: ws.RunNetOnce(init_net) initialized_blobs = set(ws.Blobs()) uninitialized = [inp for inp in predict_net.external_input if inp not in initialized_blobs] for name, blob in zip(uninitialized, tensor_inputs): ws.FeedBlob(name, blob) try: ws.RunNetOnce(predict_net) except RuntimeError as e: logger.warning("Encountered RuntimeError: \n{}".format(str(e))) ws_blobs = {b: ws.FetchBlob(b) for b in ws.Blobs()} blob_sizes = {b: ws_blobs[b].shape for b in ws_blobs if isinstance(ws_blobs[b], np.ndarray)} logger.info("Saving graph with blob shapes to {} ...".format(graph_save_path)) save_graph(predict_net, graph_save_path, op_only=False, blob_sizes=blob_sizes) return ws_blobs ================================================ FILE: annotator/oneformer/detectron2/export/caffe2_inference.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np from itertools import count import torch from caffe2.proto import caffe2_pb2 from caffe2.python import core from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format from .shared import ScopedWS, get_pb_arg_vali, get_pb_arg_vals, infer_device_type logger = logging.getLogger(__name__) # ===== ref: mobile-vision predictor's 'Caffe2Wrapper' class ====== class ProtobufModel(torch.nn.Module): """ Wrapper of a caffe2's protobuf model. It works just like nn.Module, but running caffe2 under the hood. Input/Output are tuple[tensor] that match the caffe2 net's external_input/output. """ _ids = count(0) def __init__(self, predict_net, init_net): logger.info(f"Initializing ProtobufModel for: {predict_net.name} ...") super().__init__() assert isinstance(predict_net, caffe2_pb2.NetDef) assert isinstance(init_net, caffe2_pb2.NetDef) # create unique temporary workspace for each instance self.ws_name = "__tmp_ProtobufModel_{}__".format(next(self._ids)) self.net = core.Net(predict_net) logger.info("Running init_net once to fill the parameters ...") with ScopedWS(self.ws_name, is_reset=True, is_cleanup=False) as ws: ws.RunNetOnce(init_net) uninitialized_external_input = [] for blob in self.net.Proto().external_input: if blob not in ws.Blobs(): uninitialized_external_input.append(blob) ws.CreateBlob(blob) ws.CreateNet(self.net) self._error_msgs = set() self._input_blobs = uninitialized_external_input def _infer_output_devices(self, inputs): """ Returns: list[str]: list of device for each external output """ def _get_device_type(torch_tensor): assert torch_tensor.device.type in ["cpu", "cuda"] assert torch_tensor.device.index == 0 return torch_tensor.device.type predict_net = self.net.Proto() input_device_types = { (name, 0): _get_device_type(tensor) for name, tensor in zip(self._input_blobs, inputs) } device_type_map = infer_device_type( predict_net, known_status=input_device_types, device_name_style="pytorch" ) ssa, versions = core.get_ssa(predict_net) versioned_outputs = [(name, versions[name]) for name in predict_net.external_output] output_devices = [device_type_map[outp] for outp in versioned_outputs] return output_devices def forward(self, inputs): """ Args: inputs (tuple[torch.Tensor]) Returns: tuple[torch.Tensor] """ assert len(inputs) == len(self._input_blobs), ( f"Length of inputs ({len(inputs)}) " f"doesn't match the required input blobs: {self._input_blobs}" ) with ScopedWS(self.ws_name, is_reset=False, is_cleanup=False) as ws: for b, tensor in zip(self._input_blobs, inputs): ws.FeedBlob(b, tensor) try: ws.RunNet(self.net.Proto().name) except RuntimeError as e: if not str(e) in self._error_msgs: self._error_msgs.add(str(e)) logger.warning("Encountered new RuntimeError: \n{}".format(str(e))) logger.warning("Catch the error and use partial results.") c2_outputs = [ws.FetchBlob(b) for b in self.net.Proto().external_output] # Remove outputs of current run, this is necessary in order to # prevent fetching the result from previous run if the model fails # in the middle. for b in self.net.Proto().external_output: # Needs to create uninitialized blob to make the net runable. # This is "equivalent" to: ws.RemoveBlob(b) then ws.CreateBlob(b), # but there'no such API. ws.FeedBlob(b, f"{b}, a C++ native class of type nullptr (uninitialized).") # Cast output to torch.Tensor on the desired device output_devices = ( self._infer_output_devices(inputs) if any(t.device.type != "cpu" for t in inputs) else ["cpu" for _ in self.net.Proto().external_output] ) outputs = [] for name, c2_output, device in zip( self.net.Proto().external_output, c2_outputs, output_devices ): if not isinstance(c2_output, np.ndarray): raise RuntimeError( "Invalid output for blob {}, received: {}".format(name, c2_output) ) outputs.append(torch.tensor(c2_output).to(device=device)) return tuple(outputs) class ProtobufDetectionModel(torch.nn.Module): """ A class works just like a pytorch meta arch in terms of inference, but running caffe2 model under the hood. """ def __init__(self, predict_net, init_net, *, convert_outputs=None): """ Args: predict_net, init_net (core.Net): caffe2 nets convert_outptus (callable): a function that converts caffe2 outputs to the same format of the original pytorch model. By default, use the one defined in the caffe2 meta_arch. """ super().__init__() self.protobuf_model = ProtobufModel(predict_net, init_net) self.size_divisibility = get_pb_arg_vali(predict_net, "size_divisibility", 0) self.device = get_pb_arg_vals(predict_net, "device", b"cpu").decode("ascii") if convert_outputs is None: meta_arch = get_pb_arg_vals(predict_net, "meta_architecture", b"GeneralizedRCNN") meta_arch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_arch.decode("ascii")] self._convert_outputs = meta_arch.get_outputs_converter(predict_net, init_net) else: self._convert_outputs = convert_outputs def _convert_inputs(self, batched_inputs): # currently all models convert inputs in the same way return convert_batched_inputs_to_c2_format( batched_inputs, self.size_divisibility, self.device ) def forward(self, batched_inputs): c2_inputs = self._convert_inputs(batched_inputs) c2_results = self.protobuf_model(c2_inputs) c2_results = dict(zip(self.protobuf_model.net.Proto().external_output, c2_results)) return self._convert_outputs(batched_inputs, c2_inputs, c2_results) ================================================ FILE: annotator/oneformer/detectron2/export/caffe2_modeling.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import functools import io import struct import types import torch from annotator.oneformer.detectron2.modeling import meta_arch from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransform from annotator.oneformer.detectron2.modeling.roi_heads import keypoint_head from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes from .c10 import Caffe2Compatible from .caffe2_patch import ROIHeadsPatcher, patch_generalized_rcnn from .shared import ( alias, check_set_pb_arg, get_pb_arg_floats, get_pb_arg_valf, get_pb_arg_vali, get_pb_arg_vals, mock_torch_nn_functional_interpolate, ) def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False): """ A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor]) to detectron2's format (i.e. list of Instances instance). This only works when the model follows the Caffe2 detectron's naming convention. Args: image_sizes (List[List[int, int]]): [H, W] of every image. tensor_outputs (Dict[str, Tensor]): external_output to its tensor. force_mask_on (Bool): if true, the it make sure there'll be pred_masks even if the mask is not found from tensor_outputs (usually due to model crash) """ results = [Instances(image_size) for image_size in image_sizes] batch_splits = tensor_outputs.get("batch_splits", None) if batch_splits: raise NotImplementedError() assert len(image_sizes) == 1 result = results[0] bbox_nms = tensor_outputs["bbox_nms"] score_nms = tensor_outputs["score_nms"] class_nms = tensor_outputs["class_nms"] # Detection will always success because Conv support 0-batch assert bbox_nms is not None assert score_nms is not None assert class_nms is not None if bbox_nms.shape[1] == 5: result.pred_boxes = RotatedBoxes(bbox_nms) else: result.pred_boxes = Boxes(bbox_nms) result.scores = score_nms result.pred_classes = class_nms.to(torch.int64) mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None) if mask_fcn_probs is not None: # finish the mask pred mask_probs_pred = mask_fcn_probs num_masks = mask_probs_pred.shape[0] class_pred = result.pred_classes indices = torch.arange(num_masks, device=class_pred.device) mask_probs_pred = mask_probs_pred[indices, class_pred][:, None] result.pred_masks = mask_probs_pred elif force_mask_on: # NOTE: there's no way to know the height/width of mask here, it won't be # used anyway when batch size is 0, so just set them to 0. result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8) keypoints_out = tensor_outputs.get("keypoints_out", None) kps_score = tensor_outputs.get("kps_score", None) if keypoints_out is not None: # keypoints_out: [N, 4, #kypoints], where 4 is in order of (x, y, score, prob) keypoints_tensor = keypoints_out # NOTE: it's possible that prob is not calculated if "should_output_softmax" # is set to False in HeatmapMaxKeypoint, so just using raw score, seems # it doesn't affect mAP. TODO: check more carefully. keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]] result.pred_keypoints = keypoint_xyp elif kps_score is not None: # keypoint heatmap to sparse data structure pred_keypoint_logits = kps_score keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result]) return results def _cast_to_f32(f64): return struct.unpack("f", struct.pack("f", f64))[0] def set_caffe2_compatible_tensor_mode(model, enable=True): def _fn(m): if isinstance(m, Caffe2Compatible): m.tensor_mode = enable model.apply(_fn) def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device): """ See get_caffe2_inputs() below. """ assert all(isinstance(x, dict) for x in batched_inputs) assert all(x["image"].dim() == 3 for x in batched_inputs) images = [x["image"] for x in batched_inputs] images = ImageList.from_tensors(images, size_divisibility) im_info = [] for input_per_image, image_size in zip(batched_inputs, images.image_sizes): target_height = input_per_image.get("height", image_size[0]) target_width = input_per_image.get("width", image_size[1]) # noqa # NOTE: The scale inside im_info is kept as convention and for providing # post-processing information if further processing is needed. For # current Caffe2 model definitions that don't include post-processing inside # the model, this number is not used. # NOTE: There can be a slight difference between width and height # scales, using a single number can results in numerical difference # compared with D2's post-processing. scale = target_height / image_size[0] im_info.append([image_size[0], image_size[1], scale]) im_info = torch.Tensor(im_info) return images.tensor.to(device), im_info.to(device) class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module): """ Base class for caffe2-compatible implementation of a meta architecture. The forward is traceable and its traced graph can be converted to caffe2 graph through ONNX. """ def __init__(self, cfg, torch_model): """ Args: cfg (CfgNode): torch_model (nn.Module): the detectron2 model (meta_arch) to be converted. """ super().__init__() self._wrapped_model = torch_model self.eval() set_caffe2_compatible_tensor_mode(self, True) def get_caffe2_inputs(self, batched_inputs): """ Convert pytorch-style structured inputs to caffe2-style inputs that are tuples of tensors. Args: batched_inputs (list[dict]): inputs to a detectron2 model in its standard format. Each dict has "image" (CHW tensor), and optionally "height" and "width". Returns: tuple[Tensor]: tuple of tensors that will be the inputs to the :meth:`forward` method. For existing models, the first is an NCHW tensor (padded and batched); the second is a im_info Nx3 tensor, where the rows are (height, width, unused legacy parameter) """ return convert_batched_inputs_to_c2_format( batched_inputs, self._wrapped_model.backbone.size_divisibility, self._wrapped_model.device, ) def encode_additional_info(self, predict_net, init_net): """ Save extra metadata that will be used by inference in the output protobuf. """ pass def forward(self, inputs): """ Run the forward in caffe2-style. It has to use caffe2-compatible ops and the method will be used for tracing. Args: inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_input`. They will be the inputs of the converted caffe2 graph. Returns: tuple[Tensor]: output tensors. They will be the outputs of the converted caffe2 graph. """ raise NotImplementedError def _caffe2_preprocess_image(self, inputs): """ Caffe2 implementation of preprocess_image, which is called inside each MetaArch's forward. It normalizes the input images, and the final caffe2 graph assumes the inputs have been batched already. """ data, im_info = inputs data = alias(data, "data") im_info = alias(im_info, "im_info") mean, std = self._wrapped_model.pixel_mean, self._wrapped_model.pixel_std normalized_data = (data - mean) / std normalized_data = alias(normalized_data, "normalized_data") # Pack (data, im_info) into ImageList which is recognized by self.inference. images = ImageList(tensor=normalized_data, image_sizes=im_info) return images @staticmethod def get_outputs_converter(predict_net, init_net): """ Creates a function that converts outputs of the caffe2 model to detectron2's standard format. The function uses information in `predict_net` and `init_net` that are available at inferene time. Therefore the function logic can be used in inference. The returned function has the following signature: def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs Where * batched_inputs (list[dict]): the original input format of the meta arch * c2_inputs (tuple[Tensor]): the caffe2 inputs. * c2_results (dict[str, Tensor]): the caffe2 output format, corresponding to the outputs of the :meth:`forward` function. * detectron2_outputs: the original output format of the meta arch. This function can be used to compare the outputs of the original meta arch and the converted caffe2 graph. Returns: callable: a callable of the above signature. """ raise NotImplementedError class Caffe2GeneralizedRCNN(Caffe2MetaArch): def __init__(self, cfg, torch_model): assert isinstance(torch_model, meta_arch.GeneralizedRCNN) torch_model = patch_generalized_rcnn(torch_model) super().__init__(cfg, torch_model) try: use_heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT except AttributeError: use_heatmap_max_keypoint = False self.roi_heads_patcher = ROIHeadsPatcher( self._wrapped_model.roi_heads, use_heatmap_max_keypoint ) def encode_additional_info(self, predict_net, init_net): size_divisibility = self._wrapped_model.backbone.size_divisibility check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility) check_set_pb_arg( predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii") ) check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN") @mock_torch_nn_functional_interpolate() def forward(self, inputs): if not self.tensor_mode: return self._wrapped_model.inference(inputs) images = self._caffe2_preprocess_image(inputs) features = self._wrapped_model.backbone(images.tensor) proposals, _ = self._wrapped_model.proposal_generator(images, features) with self.roi_heads_patcher.mock_roi_heads(): detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals) return tuple(detector_results[0].flatten()) @staticmethod def get_outputs_converter(predict_net, init_net): def f(batched_inputs, c2_inputs, c2_results): _, im_info = c2_inputs image_sizes = [[int(im[0]), int(im[1])] for im in im_info] results = assemble_rcnn_outputs_by_name(image_sizes, c2_results) return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes) return f class Caffe2RetinaNet(Caffe2MetaArch): def __init__(self, cfg, torch_model): assert isinstance(torch_model, meta_arch.RetinaNet) super().__init__(cfg, torch_model) @mock_torch_nn_functional_interpolate() def forward(self, inputs): assert self.tensor_mode images = self._caffe2_preprocess_image(inputs) # explicitly return the images sizes to avoid removing "im_info" by ONNX # since it's not used in the forward path return_tensors = [images.image_sizes] features = self._wrapped_model.backbone(images.tensor) features = [features[f] for f in self._wrapped_model.head_in_features] for i, feature_i in enumerate(features): features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True) return_tensors.append(features[i]) pred_logits, pred_anchor_deltas = self._wrapped_model.head(features) for i, (box_cls_i, box_delta_i) in enumerate(zip(pred_logits, pred_anchor_deltas)): return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i))) return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i))) return tuple(return_tensors) def encode_additional_info(self, predict_net, init_net): size_divisibility = self._wrapped_model.backbone.size_divisibility check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility) check_set_pb_arg( predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii") ) check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet") # Inference parameters: check_set_pb_arg( predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.test_score_thresh) ) check_set_pb_arg( predict_net, "topk_candidates", "i", self._wrapped_model.test_topk_candidates ) check_set_pb_arg( predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.test_nms_thresh) ) check_set_pb_arg( predict_net, "max_detections_per_image", "i", self._wrapped_model.max_detections_per_image, ) check_set_pb_arg( predict_net, "bbox_reg_weights", "floats", [_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights], ) self._encode_anchor_generator_cfg(predict_net) def _encode_anchor_generator_cfg(self, predict_net): # serialize anchor_generator for future use serialized_anchor_generator = io.BytesIO() torch.save(self._wrapped_model.anchor_generator, serialized_anchor_generator) # Ideally we can put anchor generating inside the model, then we don't # need to store this information. bytes = serialized_anchor_generator.getvalue() check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", bytes) @staticmethod def get_outputs_converter(predict_net, init_net): self = types.SimpleNamespace() serialized_anchor_generator = io.BytesIO( get_pb_arg_vals(predict_net, "serialized_anchor_generator", None) ) self.anchor_generator = torch.load(serialized_anchor_generator) bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None) self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights)) self.test_score_thresh = get_pb_arg_valf(predict_net, "score_threshold", None) self.test_topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None) self.test_nms_thresh = get_pb_arg_valf(predict_net, "nms_threshold", None) self.max_detections_per_image = get_pb_arg_vali( predict_net, "max_detections_per_image", None ) # hack to reuse inference code from RetinaNet for meth in [ "forward_inference", "inference_single_image", "_transpose_dense_predictions", "_decode_multi_level_predictions", "_decode_per_level_predictions", ]: setattr(self, meth, functools.partial(getattr(meta_arch.RetinaNet, meth), self)) def f(batched_inputs, c2_inputs, c2_results): _, im_info = c2_inputs image_sizes = [[int(im[0]), int(im[1])] for im in im_info] dummy_images = ImageList( torch.randn( ( len(im_info), 3, ) + tuple(image_sizes[0]) ), image_sizes, ) num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")]) pred_logits = [c2_results["box_cls_{}".format(i)] for i in range(num_features)] pred_anchor_deltas = [c2_results["box_delta_{}".format(i)] for i in range(num_features)] # For each feature level, feature should have the same batch size and # spatial dimension as the box_cls and box_delta. dummy_features = [x.clone()[:, 0:0, :, :] for x in pred_logits] # self.num_classess can be inferred self.num_classes = pred_logits[0].shape[1] // (pred_anchor_deltas[0].shape[1] // 4) results = self.forward_inference( dummy_images, dummy_features, [pred_logits, pred_anchor_deltas] ) return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes) return f META_ARCH_CAFFE2_EXPORT_TYPE_MAP = { "GeneralizedRCNN": Caffe2GeneralizedRCNN, "RetinaNet": Caffe2RetinaNet, } ================================================ FILE: annotator/oneformer/detectron2/export/caffe2_patch.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import contextlib from unittest import mock import torch from annotator.oneformer.detectron2.modeling import poolers from annotator.oneformer.detectron2.modeling.proposal_generator import rpn from annotator.oneformer.detectron2.modeling.roi_heads import keypoint_head, mask_head from annotator.oneformer.detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers from .c10 import ( Caffe2Compatible, Caffe2FastRCNNOutputsInference, Caffe2KeypointRCNNInference, Caffe2MaskRCNNInference, Caffe2ROIPooler, Caffe2RPN, ) class GenericMixin(object): pass class Caffe2CompatibleConverter(object): """ A GenericUpdater which implements the `create_from` interface, by modifying module object and assign it with another class replaceCls. """ def __init__(self, replaceCls): self.replaceCls = replaceCls def create_from(self, module): # update module's class to the new class assert isinstance(module, torch.nn.Module) if issubclass(self.replaceCls, GenericMixin): # replaceCls should act as mixin, create a new class on-the-fly new_class = type( "{}MixedWith{}".format(self.replaceCls.__name__, module.__class__.__name__), (self.replaceCls, module.__class__), {}, # {"new_method": lambda self: ...}, ) module.__class__ = new_class else: # replaceCls is complete class, this allow arbitrary class swap module.__class__ = self.replaceCls # initialize Caffe2Compatible if isinstance(module, Caffe2Compatible): module.tensor_mode = False return module def patch(model, target, updater, *args, **kwargs): """ recursively (post-order) update all modules with the target type and its subclasses, make a initialization/composition/inheritance/... via the updater.create_from. """ for name, module in model.named_children(): model._modules[name] = patch(module, target, updater, *args, **kwargs) if isinstance(model, target): return updater.create_from(model, *args, **kwargs) return model def patch_generalized_rcnn(model): ccc = Caffe2CompatibleConverter model = patch(model, rpn.RPN, ccc(Caffe2RPN)) model = patch(model, poolers.ROIPooler, ccc(Caffe2ROIPooler)) return model @contextlib.contextmanager def mock_fastrcnn_outputs_inference( tensor_mode, check=True, box_predictor_type=FastRCNNOutputLayers ): with mock.patch.object( box_predictor_type, "inference", autospec=True, side_effect=Caffe2FastRCNNOutputsInference(tensor_mode), ) as mocked_func: yield if check: assert mocked_func.call_count > 0 @contextlib.contextmanager def mock_mask_rcnn_inference(tensor_mode, patched_module, check=True): with mock.patch( "{}.mask_rcnn_inference".format(patched_module), side_effect=Caffe2MaskRCNNInference() ) as mocked_func: yield if check: assert mocked_func.call_count > 0 @contextlib.contextmanager def mock_keypoint_rcnn_inference(tensor_mode, patched_module, use_heatmap_max_keypoint, check=True): with mock.patch( "{}.keypoint_rcnn_inference".format(patched_module), side_effect=Caffe2KeypointRCNNInference(use_heatmap_max_keypoint), ) as mocked_func: yield if check: assert mocked_func.call_count > 0 class ROIHeadsPatcher: def __init__(self, heads, use_heatmap_max_keypoint): self.heads = heads self.use_heatmap_max_keypoint = use_heatmap_max_keypoint @contextlib.contextmanager def mock_roi_heads(self, tensor_mode=True): """ Patching several inference functions inside ROIHeads and its subclasses Args: tensor_mode (bool): whether the inputs/outputs are caffe2's tensor format or not. Default to True. """ # NOTE: this requries the `keypoint_rcnn_inference` and `mask_rcnn_inference` # are called inside the same file as BaseXxxHead due to using mock.patch. kpt_heads_mod = keypoint_head.BaseKeypointRCNNHead.__module__ mask_head_mod = mask_head.BaseMaskRCNNHead.__module__ mock_ctx_managers = [ mock_fastrcnn_outputs_inference( tensor_mode=tensor_mode, check=True, box_predictor_type=type(self.heads.box_predictor), ) ] if getattr(self.heads, "keypoint_on", False): mock_ctx_managers += [ mock_keypoint_rcnn_inference( tensor_mode, kpt_heads_mod, self.use_heatmap_max_keypoint ) ] if getattr(self.heads, "mask_on", False): mock_ctx_managers += [mock_mask_rcnn_inference(tensor_mode, mask_head_mod)] with contextlib.ExitStack() as stack: # python 3.3+ for mgr in mock_ctx_managers: stack.enter_context(mgr) yield ================================================ FILE: annotator/oneformer/detectron2/export/flatten.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import collections from dataclasses import dataclass from typing import Callable, List, Optional, Tuple import torch from torch import nn from annotator.oneformer.detectron2.structures import Boxes, Instances, ROIMasks from annotator.oneformer.detectron2.utils.registry import _convert_target_to_string, locate from .torchscript_patch import patch_builtin_len @dataclass class Schema: """ A Schema defines how to flatten a possibly hierarchical object into tuple of primitive objects, so it can be used as inputs/outputs of PyTorch's tracing. PyTorch does not support tracing a function that produces rich output structures (e.g. dict, Instances, Boxes). To trace such a function, we flatten the rich object into tuple of tensors, and return this tuple of tensors instead. Meanwhile, we also need to know how to "rebuild" the original object from the flattened results, so we can evaluate the flattened results. A Schema defines how to flatten an object, and while flattening it, it records necessary schemas so that the object can be rebuilt using the flattened outputs. The flattened object and the schema object is returned by ``.flatten`` classmethod. Then the original object can be rebuilt with the ``__call__`` method of schema. A Schema is a dataclass that can be serialized easily. """ # inspired by FetchMapper in tensorflow/python/client/session.py @classmethod def flatten(cls, obj): raise NotImplementedError def __call__(self, values): raise NotImplementedError @staticmethod def _concat(values): ret = () sizes = [] for v in values: assert isinstance(v, tuple), "Flattened results must be a tuple" ret = ret + v sizes.append(len(v)) return ret, sizes @staticmethod def _split(values, sizes): if len(sizes): expected_len = sum(sizes) assert ( len(values) == expected_len ), f"Values has length {len(values)} but expect length {expected_len}." ret = [] for k in range(len(sizes)): begin, end = sum(sizes[:k]), sum(sizes[: k + 1]) ret.append(values[begin:end]) return ret @dataclass class ListSchema(Schema): schemas: List[Schema] # the schemas that define how to flatten each element in the list sizes: List[int] # the flattened length of each element def __call__(self, values): values = self._split(values, self.sizes) if len(values) != len(self.schemas): raise ValueError( f"Values has length {len(values)} but schemas " f"has length {len(self.schemas)}!" ) values = [m(v) for m, v in zip(self.schemas, values)] return list(values) @classmethod def flatten(cls, obj): res = [flatten_to_tuple(k) for k in obj] values, sizes = cls._concat([k[0] for k in res]) return values, cls([k[1] for k in res], sizes) @dataclass class TupleSchema(ListSchema): def __call__(self, values): return tuple(super().__call__(values)) @dataclass class IdentitySchema(Schema): def __call__(self, values): return values[0] @classmethod def flatten(cls, obj): return (obj,), cls() @dataclass class DictSchema(ListSchema): keys: List[str] def __call__(self, values): values = super().__call__(values) return dict(zip(self.keys, values)) @classmethod def flatten(cls, obj): for k in obj.keys(): if not isinstance(k, str): raise KeyError("Only support flattening dictionaries if keys are str.") keys = sorted(obj.keys()) values = [obj[k] for k in keys] ret, schema = ListSchema.flatten(values) return ret, cls(schema.schemas, schema.sizes, keys) @dataclass class InstancesSchema(DictSchema): def __call__(self, values): image_size, fields = values[-1], values[:-1] fields = super().__call__(fields) return Instances(image_size, **fields) @classmethod def flatten(cls, obj): ret, schema = super().flatten(obj.get_fields()) size = obj.image_size if not isinstance(size, torch.Tensor): size = torch.tensor(size) return ret + (size,), schema @dataclass class TensorWrapSchema(Schema): """ For classes that are simple wrapper of tensors, e.g. Boxes, RotatedBoxes, BitMasks """ class_name: str def __call__(self, values): return locate(self.class_name)(values[0]) @classmethod def flatten(cls, obj): return (obj.tensor,), cls(_convert_target_to_string(type(obj))) # if more custom structures needed in the future, can allow # passing in extra schemas for custom types def flatten_to_tuple(obj): """ Flatten an object so it can be used for PyTorch tracing. Also returns how to rebuild the original object from the flattened outputs. Returns: res (tuple): the flattened results that can be used as tracing outputs schema: an object with a ``__call__`` method such that ``schema(res) == obj``. It is a pure dataclass that can be serialized. """ schemas = [ ((str, bytes), IdentitySchema), (list, ListSchema), (tuple, TupleSchema), (collections.abc.Mapping, DictSchema), (Instances, InstancesSchema), ((Boxes, ROIMasks), TensorWrapSchema), ] for klass, schema in schemas: if isinstance(obj, klass): F = schema break else: F = IdentitySchema return F.flatten(obj) class TracingAdapter(nn.Module): """ A model may take rich input/output format (e.g. dict or custom classes), but `torch.jit.trace` requires tuple of tensors as input/output. This adapter flattens input/output format of a model so it becomes traceable. It also records the necessary schema to rebuild model's inputs/outputs from flattened inputs/outputs. Example: :: outputs = model(inputs) # inputs/outputs may be rich structure adapter = TracingAdapter(model, inputs) # can now trace the model, with adapter.flattened_inputs, or another # tuple of tensors with the same length and meaning traced = torch.jit.trace(adapter, adapter.flattened_inputs) # traced model can only produce flattened outputs (tuple of tensors) flattened_outputs = traced(*adapter.flattened_inputs) # adapter knows the schema to convert it back (new_outputs == outputs) new_outputs = adapter.outputs_schema(flattened_outputs) """ flattened_inputs: Tuple[torch.Tensor] = None """ Flattened version of inputs given to this class's constructor. """ inputs_schema: Schema = None """ Schema of the inputs given to this class's constructor. """ outputs_schema: Schema = None """ Schema of the output produced by calling the given model with inputs. """ def __init__( self, model: nn.Module, inputs, inference_func: Optional[Callable] = None, allow_non_tensor: bool = False, ): """ Args: model: an nn.Module inputs: An input argument or a tuple of input arguments used to call model. After flattening, it has to only consist of tensors. inference_func: a callable that takes (model, *inputs), calls the model with inputs, and return outputs. By default it is ``lambda model, *inputs: model(*inputs)``. Can be override if you need to call the model differently. allow_non_tensor: allow inputs/outputs to contain non-tensor objects. This option will filter out non-tensor objects to make the model traceable, but ``inputs_schema``/``outputs_schema`` cannot be used anymore because inputs/outputs cannot be rebuilt from pure tensors. This is useful when you're only interested in the single trace of execution (e.g. for flop count), but not interested in generalizing the traced graph to new inputs. """ super().__init__() if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)): model = model.module self.model = model if not isinstance(inputs, tuple): inputs = (inputs,) self.inputs = inputs self.allow_non_tensor = allow_non_tensor if inference_func is None: inference_func = lambda model, *inputs: model(*inputs) # noqa self.inference_func = inference_func self.flattened_inputs, self.inputs_schema = flatten_to_tuple(inputs) if all(isinstance(x, torch.Tensor) for x in self.flattened_inputs): return if self.allow_non_tensor: self.flattened_inputs = tuple( [x for x in self.flattened_inputs if isinstance(x, torch.Tensor)] ) self.inputs_schema = None else: for input in self.flattened_inputs: if not isinstance(input, torch.Tensor): raise ValueError( "Inputs for tracing must only contain tensors. " f"Got a {type(input)} instead." ) def forward(self, *args: torch.Tensor): with torch.no_grad(), patch_builtin_len(): if self.inputs_schema is not None: inputs_orig_format = self.inputs_schema(args) else: if len(args) != len(self.flattened_inputs) or any( x is not y for x, y in zip(args, self.flattened_inputs) ): raise ValueError( "TracingAdapter does not contain valid inputs_schema." " So it cannot generalize to other inputs and must be" " traced with `.flattened_inputs`." ) inputs_orig_format = self.inputs outputs = self.inference_func(self.model, *inputs_orig_format) flattened_outputs, schema = flatten_to_tuple(outputs) flattened_output_tensors = tuple( [x for x in flattened_outputs if isinstance(x, torch.Tensor)] ) if len(flattened_output_tensors) < len(flattened_outputs): if self.allow_non_tensor: flattened_outputs = flattened_output_tensors self.outputs_schema = None else: raise ValueError( "Model cannot be traced because some model outputs " "cannot flatten to tensors." ) else: # schema is valid if self.outputs_schema is None: self.outputs_schema = schema else: assert self.outputs_schema == schema, ( "Model should always return outputs with the same " "structure so it can be traced!" ) return flattened_outputs def _create_wrapper(self, traced_model): """ Return a function that has an input/output interface the same as the original model, but it calls the given traced model under the hood. """ def forward(*args): flattened_inputs, _ = flatten_to_tuple(args) flattened_outputs = traced_model(*flattened_inputs) return self.outputs_schema(flattened_outputs) return forward ================================================ FILE: annotator/oneformer/detectron2/export/shared.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import collections import copy import functools import logging import numpy as np import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union from unittest import mock import caffe2.python.utils as putils import torch import torch.nn.functional as F from caffe2.proto import caffe2_pb2 from caffe2.python import core, net_drawer, workspace from torch.nn.functional import interpolate as interp logger = logging.getLogger(__name__) # ==== torch/utils_toffee/cast.py ======================================= def to_device(t, device_str): """ This function is a replacement of .to(another_device) such that it allows the casting to be traced properly by explicitly calling the underlying copy ops. It also avoids introducing unncessary op when casting to the same device. """ src = t.device dst = torch.device(device_str) if src == dst: return t elif src.type == "cuda" and dst.type == "cpu": return torch.ops._caffe2.CopyGPUToCPU(t) elif src.type == "cpu" and dst.type == "cuda": return torch.ops._caffe2.CopyCPUToGPU(t) else: raise RuntimeError("Can't cast tensor from device {} to device {}".format(src, dst)) # ==== torch/utils_toffee/interpolate.py ======================================= # Note: borrowed from vision/detection/fair/detectron/detectron/modeling/detector.py def BilinearInterpolation(tensor_in, up_scale): assert up_scale % 2 == 0, "Scale should be even" def upsample_filt(size): factor = (size + 1) // 2 if size % 2 == 1: center = factor - 1 else: center = factor - 0.5 og = np.ogrid[:size, :size] return (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor) kernel_size = int(up_scale) * 2 bil_filt = upsample_filt(kernel_size) dim = int(tensor_in.shape[1]) kernel = np.zeros((dim, dim, kernel_size, kernel_size), dtype=np.float32) kernel[range(dim), range(dim), :, :] = bil_filt tensor_out = F.conv_transpose2d( tensor_in, weight=to_device(torch.Tensor(kernel), tensor_in.device), bias=None, stride=int(up_scale), padding=int(up_scale / 2), ) return tensor_out # NOTE: ONNX is incompatible with traced torch.nn.functional.interpolate if # using dynamic `scale_factor` rather than static `size`. (T43166860) # NOTE: Caffe2 Int8 conversion might not be able to quantize `size` properly. def onnx_compatibale_interpolate( input, size=None, scale_factor=None, mode="nearest", align_corners=None ): # NOTE: The input dimensions are interpreted in the form: # `mini-batch x channels x [optional depth] x [optional height] x width`. if size is None and scale_factor is not None: if input.dim() == 4: if isinstance(scale_factor, (int, float)): height_scale, width_scale = (scale_factor, scale_factor) else: assert isinstance(scale_factor, (tuple, list)) assert len(scale_factor) == 2 height_scale, width_scale = scale_factor assert not align_corners, "No matching C2 op for align_corners == True" if mode == "nearest": return torch.ops._caffe2.ResizeNearest( input, order="NCHW", width_scale=width_scale, height_scale=height_scale ) elif mode == "bilinear": logger.warning( "Use F.conv_transpose2d for bilinear interpolate" " because there's no such C2 op, this may cause significant" " slowdown and the boundary pixels won't be as same as" " using F.interpolate due to padding." ) assert height_scale == width_scale return BilinearInterpolation(input, up_scale=height_scale) logger.warning("Output size is not static, it might cause ONNX conversion issue") return interp(input, size, scale_factor, mode, align_corners) def mock_torch_nn_functional_interpolate(): def decorator(func): @functools.wraps(func) def _mock_torch_nn_functional_interpolate(*args, **kwargs): if torch.onnx.is_in_onnx_export(): with mock.patch( "torch.nn.functional.interpolate", side_effect=onnx_compatibale_interpolate ): return func(*args, **kwargs) else: return func(*args, **kwargs) return _mock_torch_nn_functional_interpolate return decorator # ==== torch/utils_caffe2/ws_utils.py ========================================== class ScopedWS(object): def __init__(self, ws_name, is_reset, is_cleanup=False): self.ws_name = ws_name self.is_reset = is_reset self.is_cleanup = is_cleanup self.org_ws = "" def __enter__(self): self.org_ws = workspace.CurrentWorkspace() if self.ws_name is not None: workspace.SwitchWorkspace(self.ws_name, True) if self.is_reset: workspace.ResetWorkspace() return workspace def __exit__(self, *args): if self.is_cleanup: workspace.ResetWorkspace() if self.ws_name is not None: workspace.SwitchWorkspace(self.org_ws) def fetch_any_blob(name): bb = None try: bb = workspace.FetchBlob(name) except TypeError: bb = workspace.FetchInt8Blob(name) except Exception as e: logger.error("Get blob {} error: {}".format(name, e)) return bb # ==== torch/utils_caffe2/protobuf.py ========================================== def get_pb_arg(pb, arg_name): for x in pb.arg: if x.name == arg_name: return x return None def get_pb_arg_valf(pb, arg_name, default_val): arg = get_pb_arg(pb, arg_name) return arg.f if arg is not None else default_val def get_pb_arg_floats(pb, arg_name, default_val): arg = get_pb_arg(pb, arg_name) return list(map(float, arg.floats)) if arg is not None else default_val def get_pb_arg_ints(pb, arg_name, default_val): arg = get_pb_arg(pb, arg_name) return list(map(int, arg.ints)) if arg is not None else default_val def get_pb_arg_vali(pb, arg_name, default_val): arg = get_pb_arg(pb, arg_name) return arg.i if arg is not None else default_val def get_pb_arg_vals(pb, arg_name, default_val): arg = get_pb_arg(pb, arg_name) return arg.s if arg is not None else default_val def get_pb_arg_valstrings(pb, arg_name, default_val): arg = get_pb_arg(pb, arg_name) return list(arg.strings) if arg is not None else default_val def check_set_pb_arg(pb, arg_name, arg_attr, arg_value, allow_override=False): arg = get_pb_arg(pb, arg_name) if arg is None: arg = putils.MakeArgument(arg_name, arg_value) assert hasattr(arg, arg_attr) pb.arg.extend([arg]) if allow_override and getattr(arg, arg_attr) != arg_value: logger.warning( "Override argument {}: {} -> {}".format(arg_name, getattr(arg, arg_attr), arg_value) ) setattr(arg, arg_attr, arg_value) else: assert arg is not None assert getattr(arg, arg_attr) == arg_value, "Existing value {}, new value {}".format( getattr(arg, arg_attr), arg_value ) def _create_const_fill_op_from_numpy(name, tensor, device_option=None): assert type(tensor) == np.ndarray kTypeNameMapper = { np.dtype("float32"): "GivenTensorFill", np.dtype("int32"): "GivenTensorIntFill", np.dtype("int64"): "GivenTensorInt64Fill", np.dtype("uint8"): "GivenTensorStringFill", } args_dict = {} if tensor.dtype == np.dtype("uint8"): args_dict.update({"values": [str(tensor.data)], "shape": [1]}) else: args_dict.update({"values": tensor, "shape": tensor.shape}) if device_option is not None: args_dict["device_option"] = device_option return core.CreateOperator(kTypeNameMapper[tensor.dtype], [], [name], **args_dict) def _create_const_fill_op_from_c2_int8_tensor(name, int8_tensor): assert type(int8_tensor) == workspace.Int8Tensor kTypeNameMapper = { np.dtype("int32"): "Int8GivenIntTensorFill", np.dtype("uint8"): "Int8GivenTensorFill", } tensor = int8_tensor.data assert tensor.dtype in [np.dtype("uint8"), np.dtype("int32")] values = tensor.tobytes() if tensor.dtype == np.dtype("uint8") else tensor return core.CreateOperator( kTypeNameMapper[tensor.dtype], [], [name], values=values, shape=tensor.shape, Y_scale=int8_tensor.scale, Y_zero_point=int8_tensor.zero_point, ) def create_const_fill_op( name: str, blob: Union[np.ndarray, workspace.Int8Tensor], device_option: Optional[caffe2_pb2.DeviceOption] = None, ) -> caffe2_pb2.OperatorDef: """ Given a blob object, return the Caffe2 operator that creates this blob as constant. Currently support NumPy tensor and Caffe2 Int8Tensor. """ tensor_type = type(blob) assert tensor_type in [ np.ndarray, workspace.Int8Tensor, ], 'Error when creating const fill op for "{}", unsupported blob type: {}'.format( name, type(blob) ) if tensor_type == np.ndarray: return _create_const_fill_op_from_numpy(name, blob, device_option) elif tensor_type == workspace.Int8Tensor: assert device_option is None return _create_const_fill_op_from_c2_int8_tensor(name, blob) def construct_init_net_from_params( params: Dict[str, Any], device_options: Optional[Dict[str, caffe2_pb2.DeviceOption]] = None ) -> caffe2_pb2.NetDef: """ Construct the init_net from params dictionary """ init_net = caffe2_pb2.NetDef() device_options = device_options or {} for name, blob in params.items(): if isinstance(blob, str): logger.warning( ( "Blob {} with type {} is not supported in generating init net," " skipped.".format(name, type(blob)) ) ) continue init_net.op.extend( [create_const_fill_op(name, blob, device_option=device_options.get(name, None))] ) init_net.external_output.append(name) return init_net def get_producer_map(ssa): """ Return dict from versioned blob to (i, j), where i is index of producer op, j is the index of output of that op. """ producer_map = {} for i in range(len(ssa)): outputs = ssa[i][1] for j, outp in enumerate(outputs): producer_map[outp] = (i, j) return producer_map def get_consumer_map(ssa): """ Return dict from versioned blob to list of (i, j), where i is index of consumer op, j is the index of input of that op. """ consumer_map = collections.defaultdict(list) for i in range(len(ssa)): inputs = ssa[i][0] for j, inp in enumerate(inputs): consumer_map[inp].append((i, j)) return consumer_map def get_params_from_init_net( init_net: caffe2_pb2.NetDef, ) -> [Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]: """ Take the output blobs from init_net by running it. Outputs: params: dict from blob name to numpy array device_options: dict from blob name to the device option of its creating op """ # NOTE: this assumes that the params is determined by producer op with the # only exception be CopyGPUToCPU which is CUDA op but returns CPU tensor. def _get_device_option(producer_op): if producer_op.type == "CopyGPUToCPU": return caffe2_pb2.DeviceOption() else: return producer_op.device_option with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws: ws.RunNetOnce(init_net) params = {b: fetch_any_blob(b) for b in init_net.external_output} ssa, versions = core.get_ssa(init_net) producer_map = get_producer_map(ssa) device_options = { b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]]) for b in init_net.external_output } return params, device_options def _updater_raise(op, input_types, output_types): raise RuntimeError( "Failed to apply updater for op {} given input_types {} and" " output_types {}".format(op, input_types, output_types) ) def _generic_status_identifier( predict_net: caffe2_pb2.NetDef, status_updater: Callable, known_status: Dict[Tuple[str, int], Any], ) -> Dict[Tuple[str, int], Any]: """ Statically infer the status of each blob, the status can be such as device type (CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here is versioned blob (Tuple[str, int]) in the format compatible with ssa. Inputs: predict_net: the caffe2 network status_updater: a callable, given an op and the status of its input/output, it returns the updated status of input/output. `None` is used for representing unknown status. known_status: a dict containing known status, used as initialization. Outputs: A dict mapping from versioned blob to its status """ ssa, versions = core.get_ssa(predict_net) versioned_ext_input = [(b, 0) for b in predict_net.external_input] versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output] all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa]) allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output) assert all(k in allowed_vbs for k in known_status) assert all(v is not None for v in known_status.values()) _known_status = copy.deepcopy(known_status) def _check_and_update(key, value): assert value is not None if key in _known_status: if not _known_status[key] == value: raise RuntimeError( "Confilict status for {}, existing status {}, new status {}".format( key, _known_status[key], value ) ) _known_status[key] = value def _update_i(op, ssa_i): versioned_inputs = ssa_i[0] versioned_outputs = ssa_i[1] inputs_status = [_known_status.get(b, None) for b in versioned_inputs] outputs_status = [_known_status.get(b, None) for b in versioned_outputs] new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status) for versioned_blob, status in zip( versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status ): if status is not None: _check_and_update(versioned_blob, status) for op, ssa_i in zip(predict_net.op, ssa): _update_i(op, ssa_i) for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)): _update_i(op, ssa_i) # NOTE: This strictly checks all the blob from predict_net must be assgined # a known status. However sometimes it's impossible (eg. having deadend op), # we may relax this constraint if for k in all_versioned_blobs: if k not in _known_status: raise NotImplementedError( "Can not infer the status for {}. Currently only support the case where" " a single forward and backward pass can identify status for all blobs.".format(k) ) return _known_status def infer_device_type( predict_net: caffe2_pb2.NetDef, known_status: Dict[Tuple[str, int], Any], device_name_style: str = "caffe2", ) -> Dict[Tuple[str, int], str]: """Return the device type ("cpu" or "gpu"/"cuda") of each (versioned) blob""" assert device_name_style in ["caffe2", "pytorch"] _CPU_STR = "cpu" _GPU_STR = "gpu" if device_name_style == "caffe2" else "cuda" def _copy_cpu_to_gpu_updater(op, input_types, output_types): if input_types[0] == _GPU_STR or output_types[0] == _CPU_STR: _updater_raise(op, input_types, output_types) return ([_CPU_STR], [_GPU_STR]) def _copy_gpu_to_cpu_updater(op, input_types, output_types): if input_types[0] == _CPU_STR or output_types[0] == _GPU_STR: _updater_raise(op, input_types, output_types) return ([_GPU_STR], [_CPU_STR]) def _other_ops_updater(op, input_types, output_types): non_none_types = [x for x in input_types + output_types if x is not None] if len(non_none_types) > 0: the_type = non_none_types[0] if not all(x == the_type for x in non_none_types): _updater_raise(op, input_types, output_types) else: the_type = None return ([the_type for _ in op.input], [the_type for _ in op.output]) def _device_updater(op, *args, **kwargs): return { "CopyCPUToGPU": _copy_cpu_to_gpu_updater, "CopyGPUToCPU": _copy_gpu_to_cpu_updater, }.get(op.type, _other_ops_updater)(op, *args, **kwargs) return _generic_status_identifier(predict_net, _device_updater, known_status) # ==== torch/utils_caffe2/vis.py =============================================== def _modify_blob_names(ops, blob_rename_f): ret = [] def _replace_list(blob_list, replaced_list): del blob_list[:] blob_list.extend(replaced_list) for x in ops: cur = copy.deepcopy(x) _replace_list(cur.input, list(map(blob_rename_f, cur.input))) _replace_list(cur.output, list(map(blob_rename_f, cur.output))) ret.append(cur) return ret def _rename_blob(name, blob_sizes, blob_ranges): def _list_to_str(bsize): ret = ", ".join([str(x) for x in bsize]) ret = "[" + ret + "]" return ret ret = name if blob_sizes is not None and name in blob_sizes: ret += "\n" + _list_to_str(blob_sizes[name]) if blob_ranges is not None and name in blob_ranges: ret += "\n" + _list_to_str(blob_ranges[name]) return ret # graph_name could not contain word 'graph' def save_graph(net, file_name, graph_name="net", op_only=True, blob_sizes=None, blob_ranges=None): blob_rename_f = functools.partial(_rename_blob, blob_sizes=blob_sizes, blob_ranges=blob_ranges) return save_graph_base(net, file_name, graph_name, op_only, blob_rename_f) def save_graph_base(net, file_name, graph_name="net", op_only=True, blob_rename_func=None): graph = None ops = net.op if blob_rename_func is not None: ops = _modify_blob_names(ops, blob_rename_func) if not op_only: graph = net_drawer.GetPydotGraph(ops, graph_name, rankdir="TB") else: graph = net_drawer.GetPydotGraphMinimal( ops, graph_name, rankdir="TB", minimal_dependency=True ) try: par_dir = os.path.dirname(file_name) if not os.path.exists(par_dir): os.makedirs(par_dir) format = os.path.splitext(os.path.basename(file_name))[-1] if format == ".png": graph.write_png(file_name) elif format == ".pdf": graph.write_pdf(file_name) elif format == ".svg": graph.write_svg(file_name) else: print("Incorrect format {}".format(format)) except Exception as e: print("Error when writing graph to image {}".format(e)) return graph # ==== torch/utils_toffee/aten_to_caffe2.py ==================================== def group_norm_replace_aten_with_caffe2(predict_net: caffe2_pb2.NetDef): """ For ONNX exported model, GroupNorm will be represented as ATen op, this can be a drop in replacement from ATen to GroupNorm """ count = 0 for op in predict_net.op: if op.type == "ATen": op_name = get_pb_arg_vals(op, "operator", None) # return byte in py3 if op_name and op_name.decode() == "group_norm": op.arg.remove(get_pb_arg(op, "operator")) if get_pb_arg_vali(op, "cudnn_enabled", None): op.arg.remove(get_pb_arg(op, "cudnn_enabled")) num_groups = get_pb_arg_vali(op, "num_groups", None) if num_groups is not None: op.arg.remove(get_pb_arg(op, "num_groups")) check_set_pb_arg(op, "group", "i", num_groups) op.type = "GroupNorm" count += 1 if count > 1: logger.info("Replaced {} ATen operator to GroupNormOp".format(count)) # ==== torch/utils_toffee/alias.py ============================================= def alias(x, name, is_backward=False): if not torch.onnx.is_in_onnx_export(): return x assert isinstance(x, torch.Tensor) return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward) def fuse_alias_placeholder(predict_net, init_net): """Remove AliasWithName placeholder and rename the input/output of it""" # First we finish all the re-naming for i, op in enumerate(predict_net.op): if op.type == "AliasWithName": assert len(op.input) == 1 assert len(op.output) == 1 name = get_pb_arg_vals(op, "name", None).decode() is_backward = bool(get_pb_arg_vali(op, "is_backward", 0)) rename_op_input(predict_net, init_net, i, 0, name, from_producer=is_backward) rename_op_output(predict_net, i, 0, name) # Remove AliasWithName, should be very safe since it's a non-op new_ops = [] for op in predict_net.op: if op.type != "AliasWithName": new_ops.append(op) else: # safety check assert op.input == op.output assert op.input[0] == op.arg[0].s.decode() del predict_net.op[:] predict_net.op.extend(new_ops) # ==== torch/utils_caffe2/graph_transform.py =================================== class IllegalGraphTransformError(ValueError): """When a graph transform function call can't be executed.""" def _rename_versioned_blob_in_proto( proto: caffe2_pb2.NetDef, old_name: str, new_name: str, version: int, ssa: List[Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]], start_versions: Dict[str, int], end_versions: Dict[str, int], ): """In given proto, rename all blobs with matched version""" # Operater list for op, i_th_ssa in zip(proto.op, ssa): versioned_inputs, versioned_outputs = i_th_ssa for i in range(len(op.input)): if versioned_inputs[i] == (old_name, version): op.input[i] = new_name for i in range(len(op.output)): if versioned_outputs[i] == (old_name, version): op.output[i] = new_name # external_input if start_versions.get(old_name, 0) == version: for i in range(len(proto.external_input)): if proto.external_input[i] == old_name: proto.external_input[i] = new_name # external_output if end_versions.get(old_name, 0) == version: for i in range(len(proto.external_output)): if proto.external_output[i] == old_name: proto.external_output[i] = new_name def rename_op_input( predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, op_id: int, input_id: int, new_name: str, from_producer: bool = False, ): """ Rename the op_id-th operator in predict_net, change it's input_id-th input's name to the new_name. It also does automatic re-route and change external_input and init_net if necessary. - It requires the input is only consumed by this op. - This function modifies predict_net and init_net in-place. - When from_producer is enable, this also updates other operators that consumes the same input. Be cautious because may trigger unintended behavior. """ assert isinstance(predict_net, caffe2_pb2.NetDef) assert isinstance(init_net, caffe2_pb2.NetDef) init_net_ssa, init_net_versions = core.get_ssa(init_net) predict_net_ssa, predict_net_versions = core.get_ssa( predict_net, copy.deepcopy(init_net_versions) ) versioned_inputs, versioned_outputs = predict_net_ssa[op_id] old_name, version = versioned_inputs[input_id] if from_producer: producer_map = get_producer_map(predict_net_ssa) if not (old_name, version) in producer_map: raise NotImplementedError( "Can't find producer, the input {} is probably from" " init_net, this is not supported yet.".format(old_name) ) producer = producer_map[(old_name, version)] rename_op_output(predict_net, producer[0], producer[1], new_name) return def contain_targets(op_ssa): return (old_name, version) in op_ssa[0] is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa] if sum(is_consumer) > 1: raise IllegalGraphTransformError( ( "Input '{}' of operator(#{}) are consumed by other ops, please use" + " rename_op_output on the producer instead. Offending op: \n{}" ).format(old_name, op_id, predict_net.op[op_id]) ) # update init_net _rename_versioned_blob_in_proto( init_net, old_name, new_name, version, init_net_ssa, {}, init_net_versions ) # update predict_net _rename_versioned_blob_in_proto( predict_net, old_name, new_name, version, predict_net_ssa, init_net_versions, predict_net_versions, ) def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str): """ Rename the op_id-th operator in predict_net, change it's output_id-th input's name to the new_name. It also does automatic re-route and change external_output and if necessary. - It allows multiple consumers of its output. - This function modifies predict_net in-place, doesn't need init_net. """ assert isinstance(predict_net, caffe2_pb2.NetDef) ssa, blob_versions = core.get_ssa(predict_net) versioned_inputs, versioned_outputs = ssa[op_id] old_name, version = versioned_outputs[output_id] # update predict_net _rename_versioned_blob_in_proto( predict_net, old_name, new_name, version, ssa, {}, blob_versions ) def get_sub_graph_external_input_output( predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int] ) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]: """ Return the list of external input/output of sub-graph, each element is tuple of the name and corresponding version in predict_net. external input/output is defined the same way as caffe2 NetDef. """ ssa, versions = core.get_ssa(predict_net) all_inputs = [] all_outputs = [] for op_id in sub_graph_op_indices: all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs] all_outputs += list(ssa[op_id][1]) # ssa output won't repeat # for versioned blobs, external inputs are just those blob in all_inputs # but not in all_outputs ext_inputs = [inp for inp in all_inputs if inp not in all_outputs] # external outputs are essentially outputs of this subgraph that are used # outside of this sub-graph (including predict_net.external_output) all_other_inputs = sum( (ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices), [(outp, versions[outp]) for outp in predict_net.external_output], ) ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)] return ext_inputs, ext_outputs class DiGraph: """A DAG representation of caffe2 graph, each vertice is a versioned blob.""" def __init__(self): self.vertices = set() self.graph = collections.defaultdict(list) def add_edge(self, u, v): self.graph[u].append(v) self.vertices.add(u) self.vertices.add(v) # grab from https://www.geeksforgeeks.org/find-paths-given-source-destination/ def get_all_paths(self, s, d): visited = {k: False for k in self.vertices} path = [] all_paths = [] def _get_all_paths_util(graph, u, d, visited, path): visited[u] = True path.append(u) if u == d: all_paths.append(copy.deepcopy(path)) else: for i in graph[u]: if not visited[i]: _get_all_paths_util(graph, i, d, visited, path) path.pop() visited[u] = False _get_all_paths_util(self.graph, s, d, visited, path) return all_paths @staticmethod def from_ssa(ssa): graph = DiGraph() for op_id in range(len(ssa)): for inp in ssa[op_id][0]: for outp in ssa[op_id][1]: graph.add_edge(inp, outp) return graph def _get_dependency_chain(ssa, versioned_target, versioned_source): """ Return the index list of relevant operator to produce target blob from source blob, if there's no dependency, return empty list. """ # finding all paths between nodes can be O(N!), thus we can only search # in the subgraph using the op starting from the first consumer of source blob # to the producer of the target blob. consumer_map = get_consumer_map(ssa) producer_map = get_producer_map(ssa) start_op = min(x[0] for x in consumer_map[versioned_source]) - 15 end_op = ( producer_map[versioned_target][0] + 15 if versioned_target in producer_map else start_op ) sub_graph_ssa = ssa[start_op : end_op + 1] if len(sub_graph_ssa) > 30: logger.warning( "Subgraph bebetween {} and {} is large (from op#{} to op#{}), it" " might take non-trival time to find all paths between them.".format( versioned_source, versioned_target, start_op, end_op ) ) dag = DiGraph.from_ssa(sub_graph_ssa) paths = dag.get_all_paths(versioned_source, versioned_target) # include two ends ops_in_paths = [[producer_map[blob][0] for blob in path[1:]] for path in paths] return sorted(set().union(*[set(ops) for ops in ops_in_paths])) def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]: """ Idenfity the reshape sub-graph in a protobuf. The reshape sub-graph is defined as matching the following pattern: (input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐ └-------------------------------------------> Reshape -> (output_blob) Return: List of sub-graphs, each sub-graph is represented as a list of indices of the relavent ops, [Op_1, Op_2, ..., Op_N, Reshape] """ ssa, _ = core.get_ssa(predict_net) ret = [] for i, op in enumerate(predict_net.op): if op.type == "Reshape": assert len(op.input) == 2 input_ssa = ssa[i][0] data_source = input_ssa[0] shape_source = input_ssa[1] op_indices = _get_dependency_chain(ssa, shape_source, data_source) ret.append(op_indices + [i]) return ret def remove_reshape_for_fc(predict_net, params): """ In PyTorch nn.Linear has to take 2D tensor, this often leads to reshape a 4D tensor to 2D by calling .view(). However this (dynamic) reshaping doesn't work well with ONNX and Int8 tools, and cause using extra ops (eg. ExpandDims) that might not be available on mobile. Luckily Caffe2 supports 4D tensor for FC, so we can remove those reshape after exporting ONNX model. """ from caffe2.python import core # find all reshape sub-graph that can be removed, which is now all Reshape # sub-graph whose output is only consumed by FC. # TODO: to make it safer, we may need the actually value to better determine # if a Reshape before FC is removable. reshape_sub_graphs = identify_reshape_sub_graph(predict_net) sub_graphs_to_remove = [] for reshape_sub_graph in reshape_sub_graphs: reshape_op_id = reshape_sub_graph[-1] assert predict_net.op[reshape_op_id].type == "Reshape" ssa, _ = core.get_ssa(predict_net) reshape_output = ssa[reshape_op_id][1][0] consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]] if all(predict_net.op[consumer].type == "FC" for consumer in consumers): # safety check if the sub-graph is isolated, for this reshape sub-graph, # it means it has one non-param external input and one external output. ext_inputs, ext_outputs = get_sub_graph_external_input_output( predict_net, reshape_sub_graph ) non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0] if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1: sub_graphs_to_remove.append(reshape_sub_graph) # perform removing subgraph by: # 1: rename the Reshape's output to its input, then the graph can be # seen as in-place itentify, meaning whose external input/output are the same. # 2: simply remove those ops. remove_op_ids = [] params_to_remove = [] for sub_graph in sub_graphs_to_remove: logger.info( "Remove Reshape sub-graph:\n{}".format( "".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph]) ) ) reshape_op_id = sub_graph[-1] new_reshap_output = predict_net.op[reshape_op_id].input[0] rename_op_output(predict_net, reshape_op_id, 0, new_reshap_output) ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph) non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0] params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0] assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1 assert ext_outputs[0][0] == non_params_ext_inputs[0][0] assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1 remove_op_ids.extend(sub_graph) params_to_remove.extend(params_ext_inputs) predict_net = copy.deepcopy(predict_net) new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids] del predict_net.op[:] predict_net.op.extend(new_ops) for versioned_params in params_to_remove: name = versioned_params[0] logger.info("Remove params: {} from init_net and predict_net.external_input".format(name)) del params[name] predict_net.external_input.remove(name) return predict_net, params def fuse_copy_between_cpu_and_gpu(predict_net: caffe2_pb2.NetDef): """ In-place fuse extra copy ops between cpu/gpu for the following case: a -CopyAToB-> b -CopyBToA> c1 -NextOp1-> d1 -CopyBToA> c2 -NextOp2-> d2 The fused network will look like: a -NextOp1-> d1 -NextOp2-> d2 """ _COPY_OPS = ["CopyCPUToGPU", "CopyGPUToCPU"] def _fuse_once(predict_net): ssa, blob_versions = core.get_ssa(predict_net) consumer_map = get_consumer_map(ssa) versioned_external_output = [ (name, blob_versions[name]) for name in predict_net.external_output ] for op_id, op in enumerate(predict_net.op): if op.type in _COPY_OPS: fw_copy_versioned_output = ssa[op_id][1][0] consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]] reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)] is_fusable = ( len(consumer_ids) > 0 and fw_copy_versioned_output not in versioned_external_output and all( predict_net.op[_op_id].type == reverse_op_type and ssa[_op_id][1][0] not in versioned_external_output for _op_id in consumer_ids ) ) if is_fusable: for rv_copy_op_id in consumer_ids: # making each NextOp uses "a" directly and removing Copy ops rs_copy_versioned_output = ssa[rv_copy_op_id][1][0] next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0] predict_net.op[next_op_id].input[inp_id] = op.input[0] # remove CopyOps new_ops = [ op for i, op in enumerate(predict_net.op) if i != op_id and i not in consumer_ids ] del predict_net.op[:] predict_net.op.extend(new_ops) return True return False # _fuse_once returns False is nothing can be fused while _fuse_once(predict_net): pass def remove_dead_end_ops(net_def: caffe2_pb2.NetDef): """remove ops if its output is not used or not in external_output""" ssa, versions = core.get_ssa(net_def) versioned_external_output = [(name, versions[name]) for name in net_def.external_output] consumer_map = get_consumer_map(ssa) removed_op_ids = set() def _is_dead_end(versioned_blob): return not ( versioned_blob in versioned_external_output or ( len(consumer_map[versioned_blob]) > 0 and all(x[0] not in removed_op_ids for x in consumer_map[versioned_blob]) ) ) for i, ssa_i in reversed(list(enumerate(ssa))): versioned_outputs = ssa_i[1] if all(_is_dead_end(outp) for outp in versioned_outputs): removed_op_ids.add(i) # simply removing those deadend ops should have no effect to external_output new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids] del net_def.op[:] net_def.op.extend(new_ops) ================================================ FILE: annotator/oneformer/detectron2/export/torchscript.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import os import torch from annotator.oneformer.detectron2.utils.file_io import PathManager from .torchscript_patch import freeze_training_mode, patch_instances __all__ = ["scripting_with_instances", "dump_torchscript_IR"] def scripting_with_instances(model, fields): """ Run :func:`torch.jit.script` on a model that uses the :class:`Instances` class. Since attributes of :class:`Instances` are "dynamically" added in eager mode,it is difficult for scripting to support it out of the box. This function is made to support scripting a model that uses :class:`Instances`. It does the following: 1. Create a scriptable ``new_Instances`` class which behaves similarly to ``Instances``, but with all attributes been "static". The attributes need to be statically declared in the ``fields`` argument. 2. Register ``new_Instances``, and force scripting compiler to use it when trying to compile ``Instances``. After this function, the process will be reverted. User should be able to script another model using different fields. Example: Assume that ``Instances`` in the model consist of two attributes named ``proposal_boxes`` and ``objectness_logits`` with type :class:`Boxes` and :class:`Tensor` respectively during inference. You can call this function like: :: fields = {"proposal_boxes": Boxes, "objectness_logits": torch.Tensor} torchscipt_model = scripting_with_instances(model, fields) Note: It only support models in evaluation mode. Args: model (nn.Module): The input model to be exported by scripting. fields (Dict[str, type]): Attribute names and corresponding type that ``Instances`` will use in the model. Note that all attributes used in ``Instances`` need to be added, regardless of whether they are inputs/outputs of the model. Data type not defined in detectron2 is not supported for now. Returns: torch.jit.ScriptModule: the model in torchscript format """ assert ( not model.training ), "Currently we only support exporting models in evaluation mode to torchscript" with freeze_training_mode(model), patch_instances(fields): scripted_model = torch.jit.script(model) return scripted_model # alias for old name export_torchscript_with_instances = scripting_with_instances def dump_torchscript_IR(model, dir): """ Dump IR of a TracedModule/ScriptModule/Function in various format (code, graph, inlined graph). Useful for debugging. Args: model (TracedModule/ScriptModule/ScriptFUnction): traced or scripted module dir (str): output directory to dump files. """ dir = os.path.expanduser(dir) PathManager.mkdirs(dir) def _get_script_mod(mod): if isinstance(mod, torch.jit.TracedModule): return mod._actual_script_module return mod # Dump pretty-printed code: https://pytorch.org/docs/stable/jit.html#inspecting-code with PathManager.open(os.path.join(dir, "model_ts_code.txt"), "w") as f: def get_code(mod): # Try a few ways to get code using private attributes. try: # This contains more information than just `mod.code` return _get_script_mod(mod)._c.code except AttributeError: pass try: return mod.code except AttributeError: return None def dump_code(prefix, mod): code = get_code(mod) name = prefix or "root model" if code is None: f.write(f"Could not found code for {name} (type={mod.original_name})\n") f.write("\n") else: f.write(f"\nCode for {name}, type={mod.original_name}:\n") f.write(code) f.write("\n") f.write("-" * 80) for name, m in mod.named_children(): dump_code(prefix + "." + name, m) if isinstance(model, torch.jit.ScriptFunction): f.write(get_code(model)) else: dump_code("", model) def _get_graph(model): try: # Recursively dump IR of all modules return _get_script_mod(model)._c.dump_to_str(True, False, False) except AttributeError: return model.graph.str() with PathManager.open(os.path.join(dir, "model_ts_IR.txt"), "w") as f: f.write(_get_graph(model)) # Dump IR of the entire graph (all submodules inlined) with PathManager.open(os.path.join(dir, "model_ts_IR_inlined.txt"), "w") as f: f.write(str(model.inlined_graph)) if not isinstance(model, torch.jit.ScriptFunction): # Dump the model structure in pytorch style with PathManager.open(os.path.join(dir, "model.txt"), "w") as f: f.write(str(model)) ================================================ FILE: annotator/oneformer/detectron2/export/torchscript_patch.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import os import sys import tempfile from contextlib import ExitStack, contextmanager from copy import deepcopy from unittest import mock import torch from torch import nn # need some explicit imports due to https://github.com/pytorch/pytorch/issues/38964 import annotator.oneformer.detectron2 # noqa F401 from annotator.oneformer.detectron2.structures import Boxes, Instances from annotator.oneformer.detectron2.utils.env import _import_file _counter = 0 def _clear_jit_cache(): from torch.jit._recursive import concrete_type_store from torch.jit._state import _jit_caching_layer concrete_type_store.type_store.clear() # for modules _jit_caching_layer.clear() # for free functions def _add_instances_conversion_methods(newInstances): """ Add from_instances methods to the scripted Instances class. """ cls_name = newInstances.__name__ @torch.jit.unused def from_instances(instances: Instances): """ Create scripted Instances from original Instances """ fields = instances.get_fields() image_size = instances.image_size ret = newInstances(image_size) for name, val in fields.items(): assert hasattr(ret, f"_{name}"), f"No attribute named {name} in {cls_name}" setattr(ret, name, deepcopy(val)) return ret newInstances.from_instances = from_instances @contextmanager def patch_instances(fields): """ A contextmanager, under which the Instances class in detectron2 is replaced by a statically-typed scriptable class, defined by `fields`. See more in `scripting_with_instances`. """ with tempfile.TemporaryDirectory(prefix="detectron2") as dir, tempfile.NamedTemporaryFile( mode="w", encoding="utf-8", suffix=".py", dir=dir, delete=False ) as f: try: # Objects that use Instances should not reuse previously-compiled # results in cache, because `Instances` could be a new class each time. _clear_jit_cache() cls_name, s = _gen_instance_module(fields) f.write(s) f.flush() f.close() module = _import(f.name) new_instances = getattr(module, cls_name) _ = torch.jit.script(new_instances) # let torchscript think Instances was scripted already Instances.__torch_script_class__ = True # let torchscript find new_instances when looking for the jit type of Instances Instances._jit_override_qualname = torch._jit_internal._qualified_name(new_instances) _add_instances_conversion_methods(new_instances) yield new_instances finally: try: del Instances.__torch_script_class__ del Instances._jit_override_qualname except AttributeError: pass sys.modules.pop(module.__name__) def _gen_instance_class(fields): """ Args: fields (dict[name: type]) """ class _FieldType: def __init__(self, name, type_): assert isinstance(name, str), f"Field name must be str, got {name}" self.name = name self.type_ = type_ self.annotation = f"{type_.__module__}.{type_.__name__}" fields = [_FieldType(k, v) for k, v in fields.items()] def indent(level, s): return " " * 4 * level + s lines = [] global _counter _counter += 1 cls_name = "ScriptedInstances{}".format(_counter) field_names = tuple(x.name for x in fields) extra_args = ", ".join([f"{f.name}: Optional[{f.annotation}] = None" for f in fields]) lines.append( f""" class {cls_name}: def __init__(self, image_size: Tuple[int, int], {extra_args}): self.image_size = image_size self._field_names = {field_names} """ ) for f in fields: lines.append( indent(2, f"self._{f.name} = torch.jit.annotate(Optional[{f.annotation}], {f.name})") ) for f in fields: lines.append( f""" @property def {f.name}(self) -> {f.annotation}: # has to use a local for type refinement # https://pytorch.org/docs/stable/jit_language_reference.html#optional-type-refinement t = self._{f.name} assert t is not None, "{f.name} is None and cannot be accessed!" return t @{f.name}.setter def {f.name}(self, value: {f.annotation}) -> None: self._{f.name} = value """ ) # support method `__len__` lines.append( """ def __len__(self) -> int: """ ) for f in fields: lines.append( f""" t = self._{f.name} if t is not None: return len(t) """ ) lines.append( """ raise NotImplementedError("Empty Instances does not support __len__!") """ ) # support method `has` lines.append( """ def has(self, name: str) -> bool: """ ) for f in fields: lines.append( f""" if name == "{f.name}": return self._{f.name} is not None """ ) lines.append( """ return False """ ) # support method `to` none_args = ", None" * len(fields) lines.append( f""" def to(self, device: torch.device) -> "{cls_name}": ret = {cls_name}(self.image_size{none_args}) """ ) for f in fields: if hasattr(f.type_, "to"): lines.append( f""" t = self._{f.name} if t is not None: ret._{f.name} = t.to(device) """ ) else: # For now, ignore fields that cannot be moved to devices. # Maybe can support other tensor-like classes (e.g. __torch_function__) pass lines.append( """ return ret """ ) # support method `getitem` none_args = ", None" * len(fields) lines.append( f""" def __getitem__(self, item) -> "{cls_name}": ret = {cls_name}(self.image_size{none_args}) """ ) for f in fields: lines.append( f""" t = self._{f.name} if t is not None: ret._{f.name} = t[item] """ ) lines.append( """ return ret """ ) # support method `cat` # this version does not contain checks that all instances have same size and fields none_args = ", None" * len(fields) lines.append( f""" def cat(self, instances: List["{cls_name}"]) -> "{cls_name}": ret = {cls_name}(self.image_size{none_args}) """ ) for f in fields: lines.append( f""" t = self._{f.name} if t is not None: values: List[{f.annotation}] = [x.{f.name} for x in instances] if torch.jit.isinstance(t, torch.Tensor): ret._{f.name} = torch.cat(values, dim=0) else: ret._{f.name} = t.cat(values) """ ) lines.append( """ return ret""" ) # support method `get_fields()` lines.append( """ def get_fields(self) -> Dict[str, Tensor]: ret = {} """ ) for f in fields: if f.type_ == Boxes: stmt = "t.tensor" elif f.type_ == torch.Tensor: stmt = "t" else: stmt = f'assert False, "unsupported type {str(f.type_)}"' lines.append( f""" t = self._{f.name} if t is not None: ret["{f.name}"] = {stmt} """ ) lines.append( """ return ret""" ) return cls_name, os.linesep.join(lines) def _gen_instance_module(fields): # TODO: find a more automatic way to enable import of other classes s = """ from copy import deepcopy import torch from torch import Tensor import typing from typing import * import annotator.oneformer.detectron2 from annotator.oneformer.detectron2.structures import Boxes, Instances """ cls_name, cls_def = _gen_instance_class(fields) s += cls_def return cls_name, s def _import(path): return _import_file( "{}{}".format(sys.modules[__name__].__name__, _counter), path, make_importable=True ) @contextmanager def patch_builtin_len(modules=()): """ Patch the builtin len() function of a few detectron2 modules to use __len__ instead, because __len__ does not convert values to integers and therefore is friendly to tracing. Args: modules (list[stsr]): names of extra modules to patch len(), in addition to those in detectron2. """ def _new_len(obj): return obj.__len__() with ExitStack() as stack: MODULES = [ "detectron2.modeling.roi_heads.fast_rcnn", "detectron2.modeling.roi_heads.mask_head", "detectron2.modeling.roi_heads.keypoint_head", ] + list(modules) ctxs = [stack.enter_context(mock.patch(mod + ".len")) for mod in MODULES] for m in ctxs: m.side_effect = _new_len yield def patch_nonscriptable_classes(): """ Apply patches on a few nonscriptable detectron2 classes. Should not have side-effects on eager usage. """ # __prepare_scriptable__ can also be added to models for easier maintenance. # But it complicates the clean model code. from annotator.oneformer.detectron2.modeling.backbone import ResNet, FPN # Due to https://github.com/pytorch/pytorch/issues/36061, # we change backbone to use ModuleList for scripting. # (note: this changes param names in state_dict) def prepare_resnet(self): ret = deepcopy(self) ret.stages = nn.ModuleList(ret.stages) for k in self.stage_names: delattr(ret, k) return ret ResNet.__prepare_scriptable__ = prepare_resnet def prepare_fpn(self): ret = deepcopy(self) ret.lateral_convs = nn.ModuleList(ret.lateral_convs) ret.output_convs = nn.ModuleList(ret.output_convs) for name, _ in self.named_children(): if name.startswith("fpn_"): delattr(ret, name) return ret FPN.__prepare_scriptable__ = prepare_fpn # Annotate some attributes to be constants for the purpose of scripting, # even though they are not constants in eager mode. from annotator.oneformer.detectron2.modeling.roi_heads import StandardROIHeads if hasattr(StandardROIHeads, "__annotations__"): # copy first to avoid editing annotations of base class StandardROIHeads.__annotations__ = deepcopy(StandardROIHeads.__annotations__) StandardROIHeads.__annotations__["mask_on"] = torch.jit.Final[bool] StandardROIHeads.__annotations__["keypoint_on"] = torch.jit.Final[bool] # These patches are not supposed to have side-effects. patch_nonscriptable_classes() @contextmanager def freeze_training_mode(model): """ A context manager that annotates the "training" attribute of every submodule to constant, so that the training codepath in these modules can be meta-compiled away. Upon exiting, the annotations are reverted. """ classes = {type(x) for x in model.modules()} # __constants__ is the old way to annotate constants and not compatible # with __annotations__ . classes = {x for x in classes if not hasattr(x, "__constants__")} for cls in classes: cls.__annotations__["training"] = torch.jit.Final[bool] yield for cls in classes: cls.__annotations__["training"] = bool ================================================ FILE: annotator/oneformer/detectron2/layers/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm, CycleBatchNormList from .deform_conv import DeformConv, ModulatedDeformConv from .mask_ops import paste_masks_in_image from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated from .roi_align import ROIAlign, roi_align from .roi_align_rotated import ROIAlignRotated, roi_align_rotated from .shape_spec import ShapeSpec from .wrappers import ( BatchNorm2d, Conv2d, ConvTranspose2d, cat, interpolate, Linear, nonzero_tuple, cross_entropy, empty_input_loss_func_wrapper, shapes_to_tensor, move_device_like, ) from .blocks import CNNBlockBase, DepthwiseSeparableConv2d from .aspp import ASPP from .losses import ciou_loss, diou_loss __all__ = [k for k in globals().keys() if not k.startswith("_")] ================================================ FILE: annotator/oneformer/detectron2/layers/aspp.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from copy import deepcopy import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from .batch_norm import get_norm from .blocks import DepthwiseSeparableConv2d from .wrappers import Conv2d class ASPP(nn.Module): """ Atrous Spatial Pyramid Pooling (ASPP). """ def __init__( self, in_channels, out_channels, dilations, *, norm, activation, pool_kernel_size=None, dropout: float = 0.0, use_depthwise_separable_conv=False, ): """ Args: in_channels (int): number of input channels for ASPP. out_channels (int): number of output channels. dilations (list): a list of 3 dilations in ASPP. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. norm is applied to all conv layers except the conv following global average pooling. activation (callable): activation function. pool_kernel_size (tuple, list): the average pooling size (kh, kw) for image pooling layer in ASPP. If set to None, it always performs global average pooling. If not None, it must be divisible by the shape of inputs in forward(). It is recommended to use a fixed input feature size in training, and set this option to match this size, so that it performs global average pooling in training, and the size of the pooling window stays consistent in inference. dropout (float): apply dropout on the output of ASPP. It is used in the official DeepLab implementation with a rate of 0.1: https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532 # noqa use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d for 3x3 convs in ASPP, proposed in :paper:`DeepLabV3+`. """ super(ASPP, self).__init__() assert len(dilations) == 3, "ASPP expects 3 dilations, got {}".format(len(dilations)) self.pool_kernel_size = pool_kernel_size self.dropout = dropout use_bias = norm == "" self.convs = nn.ModuleList() # conv 1x1 self.convs.append( Conv2d( in_channels, out_channels, kernel_size=1, bias=use_bias, norm=get_norm(norm, out_channels), activation=deepcopy(activation), ) ) weight_init.c2_xavier_fill(self.convs[-1]) # atrous convs for dilation in dilations: if use_depthwise_separable_conv: self.convs.append( DepthwiseSeparableConv2d( in_channels, out_channels, kernel_size=3, padding=dilation, dilation=dilation, norm1=norm, activation1=deepcopy(activation), norm2=norm, activation2=deepcopy(activation), ) ) else: self.convs.append( Conv2d( in_channels, out_channels, kernel_size=3, padding=dilation, dilation=dilation, bias=use_bias, norm=get_norm(norm, out_channels), activation=deepcopy(activation), ) ) weight_init.c2_xavier_fill(self.convs[-1]) # image pooling # We do not add BatchNorm because the spatial resolution is 1x1, # the original TF implementation has BatchNorm. if pool_kernel_size is None: image_pooling = nn.Sequential( nn.AdaptiveAvgPool2d(1), Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)), ) else: image_pooling = nn.Sequential( nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1), Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)), ) weight_init.c2_xavier_fill(image_pooling[1]) self.convs.append(image_pooling) self.project = Conv2d( 5 * out_channels, out_channels, kernel_size=1, bias=use_bias, norm=get_norm(norm, out_channels), activation=deepcopy(activation), ) weight_init.c2_xavier_fill(self.project) def forward(self, x): size = x.shape[-2:] if self.pool_kernel_size is not None: if size[0] % self.pool_kernel_size[0] or size[1] % self.pool_kernel_size[1]: raise ValueError( "`pool_kernel_size` must be divisible by the shape of inputs. " "Input size: {} `pool_kernel_size`: {}".format(size, self.pool_kernel_size) ) res = [] for conv in self.convs: res.append(conv(x)) res[-1] = F.interpolate(res[-1], size=size, mode="bilinear", align_corners=False) res = torch.cat(res, dim=1) res = self.project(res) res = F.dropout(res, self.dropout, training=self.training) if self.dropout > 0 else res return res ================================================ FILE: annotator/oneformer/detectron2/layers/batch_norm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch import torch.distributed as dist from fvcore.nn.distributed import differentiable_all_reduce from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.utils import comm, env from .wrappers import BatchNorm2d class FrozenBatchNorm2d(nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed. It contains non-trainable buffers called "weight" and "bias", "running_mean", "running_var", initialized to perform identity transformation. The pre-trained backbone models from Caffe2 only contain "weight" and "bias", which are computed from the original four parameters of BN. The affine transform `x * weight + bias` will perform the equivalent computation of `(x - running_mean) / sqrt(running_var) * weight + bias`. When loading a backbone model from Caffe2, "running_mean" and "running_var" will be left unchanged as identity transformation. Other pre-trained backbone models may contain all 4 parameters. The forward is implemented by `F.batch_norm(..., training=False)`. """ _version = 3 def __init__(self, num_features, eps=1e-5): super().__init__() self.num_features = num_features self.eps = eps self.register_buffer("weight", torch.ones(num_features)) self.register_buffer("bias", torch.zeros(num_features)) self.register_buffer("running_mean", torch.zeros(num_features)) self.register_buffer("running_var", torch.ones(num_features) - eps) def forward(self, x): if x.requires_grad: # When gradients are needed, F.batch_norm will use extra memory # because its backward op computes gradients for weight/bias as well. scale = self.weight * (self.running_var + self.eps).rsqrt() bias = self.bias - self.running_mean * scale scale = scale.reshape(1, -1, 1, 1) bias = bias.reshape(1, -1, 1, 1) out_dtype = x.dtype # may be half return x * scale.to(out_dtype) + bias.to(out_dtype) else: # When gradients are not needed, F.batch_norm is a single fused op # and provide more optimization opportunities. return F.batch_norm( x, self.running_mean, self.running_var, self.weight, self.bias, training=False, eps=self.eps, ) def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # No running_mean/var in early versions # This will silent the warnings if prefix + "running_mean" not in state_dict: state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean) if prefix + "running_var" not in state_dict: state_dict[prefix + "running_var"] = torch.ones_like(self.running_var) super()._load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) def __repr__(self): return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps) @classmethod def convert_frozen_batchnorm(cls, module): """ Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm. Args: module (torch.nn.Module): Returns: If module is BatchNorm/SyncBatchNorm, returns a new module. Otherwise, in-place convert module and return it. Similar to convert_sync_batchnorm in https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py """ bn_module = nn.modules.batchnorm bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm) res = module if isinstance(module, bn_module): res = cls(module.num_features) if module.affine: res.weight.data = module.weight.data.clone().detach() res.bias.data = module.bias.data.clone().detach() res.running_mean.data = module.running_mean.data res.running_var.data = module.running_var.data res.eps = module.eps else: for name, child in module.named_children(): new_child = cls.convert_frozen_batchnorm(child) if new_child is not child: res.add_module(name, new_child) return res def get_norm(norm, out_channels): """ Args: norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; or a callable that takes a channel number and returns the normalization layer as a nn.Module. Returns: nn.Module or None: the normalization layer """ if norm is None: return None if isinstance(norm, str): if len(norm) == 0: return None norm = { "BN": BatchNorm2d, # Fixed in https://github.com/pytorch/pytorch/pull/36382 "SyncBN": NaiveSyncBatchNorm if env.TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm, "FrozenBN": FrozenBatchNorm2d, "GN": lambda channels: nn.GroupNorm(32, channels), # for debugging: "nnSyncBN": nn.SyncBatchNorm, "naiveSyncBN": NaiveSyncBatchNorm, # expose stats_mode N as an option to caller, required for zero-len inputs "naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"), "LN": lambda channels: LayerNorm(channels), }[norm] return norm(out_channels) class NaiveSyncBatchNorm(BatchNorm2d): """ In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient when the batch size on each worker is different. (e.g., when scale augmentation is used, or when it is applied to mask head). This is a slower but correct alternative to `nn.SyncBatchNorm`. Note: There isn't a single definition of Sync BatchNorm. When ``stats_mode==""``, this module computes overall statistics by using statistics of each worker with equal weight. The result is true statistics of all samples (as if they are all on one worker) only when all workers have the same (N, H, W). This mode does not support inputs with zero batch size. When ``stats_mode=="N"``, this module computes overall statistics by weighting the statistics of each worker by their ``N``. The result is true statistics of all samples (as if they are all on one worker) only when all workers have the same (H, W). It is slower than ``stats_mode==""``. Even though the result of this module may not be the true statistics of all samples, it may still be reasonable because it might be preferrable to assign equal weights to all workers, regardless of their (H, W) dimension, instead of putting larger weight on larger images. From preliminary experiments, little difference is found between such a simplified implementation and an accurate computation of overall mean & variance. """ def __init__(self, *args, stats_mode="", **kwargs): super().__init__(*args, **kwargs) assert stats_mode in ["", "N"] self._stats_mode = stats_mode def forward(self, input): if comm.get_world_size() == 1 or not self.training: return super().forward(input) B, C = input.shape[0], input.shape[1] half_input = input.dtype == torch.float16 if half_input: # fp16 does not have good enough numerics for the reduction here input = input.float() mean = torch.mean(input, dim=[0, 2, 3]) meansqr = torch.mean(input * input, dim=[0, 2, 3]) if self._stats_mode == "": assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.' vec = torch.cat([mean, meansqr], dim=0) vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size()) mean, meansqr = torch.split(vec, C) momentum = self.momentum else: if B == 0: vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype) vec = vec + input.sum() # make sure there is gradient w.r.t input else: vec = torch.cat( [mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0 ) vec = differentiable_all_reduce(vec * B) total_batch = vec[-1].detach() momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0 mean, meansqr, _ = torch.split(vec / total_batch.clamp(min=1), C) # avoid div-by-zero var = meansqr - mean * mean invstd = torch.rsqrt(var + self.eps) scale = self.weight * invstd bias = self.bias - mean * scale scale = scale.reshape(1, -1, 1, 1) bias = bias.reshape(1, -1, 1, 1) self.running_mean += momentum * (mean.detach() - self.running_mean) self.running_var += momentum * (var.detach() - self.running_var) ret = input * scale + bias if half_input: ret = ret.half() return ret class CycleBatchNormList(nn.ModuleList): """ Implement domain-specific BatchNorm by cycling. When a BatchNorm layer is used for multiple input domains or input features, it might need to maintain a separate test-time statistics for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`. This module implements it by using N separate BN layers and it cycles through them every time a forward() is called. NOTE: The caller of this module MUST guarantee to always call this module by multiple of N times. Otherwise its test-time statistics will be incorrect. """ def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs): """ Args: length: number of BatchNorm layers to cycle. bn_class: the BatchNorm class to use kwargs: arguments of the BatchNorm class, such as num_features. """ self._affine = kwargs.pop("affine", True) super().__init__([bn_class(**kwargs, affine=False) for k in range(length)]) if self._affine: # shared affine, domain-specific BN channels = self[0].num_features self.weight = nn.Parameter(torch.ones(channels)) self.bias = nn.Parameter(torch.zeros(channels)) self._pos = 0 def forward(self, x): ret = self[self._pos](x) self._pos = (self._pos + 1) % len(self) if self._affine: w = self.weight.reshape(1, -1, 1, 1) b = self.bias.reshape(1, -1, 1, 1) return ret * w + b else: return ret def extra_repr(self): return f"affine={self._affine}" class LayerNorm(nn.Module): """ A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the channel dimension for inputs that have shape (batch_size, channels, height, width). https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950 """ def __init__(self, normalized_shape, eps=1e-6): super().__init__() self.weight = nn.Parameter(torch.ones(normalized_shape)) self.bias = nn.Parameter(torch.zeros(normalized_shape)) self.eps = eps self.normalized_shape = (normalized_shape,) def forward(self, x): u = x.mean(1, keepdim=True) s = (x - u).pow(2).mean(1, keepdim=True) x = (x - u) / torch.sqrt(s + self.eps) x = self.weight[:, None, None] * x + self.bias[:, None, None] return x ================================================ FILE: annotator/oneformer/detectron2/layers/blocks.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import fvcore.nn.weight_init as weight_init from torch import nn from .batch_norm import FrozenBatchNorm2d, get_norm from .wrappers import Conv2d """ CNN building blocks. """ class CNNBlockBase(nn.Module): """ A CNN block is assumed to have input channels, output channels and a stride. The input and output of `forward()` method must be NCHW tensors. The method can perform arbitrary computation but must match the given channels and stride specification. Attribute: in_channels (int): out_channels (int): stride (int): """ def __init__(self, in_channels, out_channels, stride): """ The `__init__` method of any subclass should also contain these arguments. Args: in_channels (int): out_channels (int): stride (int): """ super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.stride = stride def freeze(self): """ Make this block not trainable. This method sets all parameters to `requires_grad=False`, and convert all BatchNorm layers to FrozenBatchNorm Returns: the block itself """ for p in self.parameters(): p.requires_grad = False FrozenBatchNorm2d.convert_frozen_batchnorm(self) return self class DepthwiseSeparableConv2d(nn.Module): """ A kxk depthwise convolution + a 1x1 convolution. In :paper:`xception`, norm & activation are applied on the second conv. :paper:`mobilenet` uses norm & activation on both convs. """ def __init__( self, in_channels, out_channels, kernel_size=3, padding=1, dilation=1, *, norm1=None, activation1=None, norm2=None, activation2=None, ): """ Args: norm1, norm2 (str or callable): normalization for the two conv layers. activation1, activation2 (callable(Tensor) -> Tensor): activation function for the two conv layers. """ super().__init__() self.depthwise = Conv2d( in_channels, in_channels, kernel_size=kernel_size, padding=padding, dilation=dilation, groups=in_channels, bias=not norm1, norm=get_norm(norm1, in_channels), activation=activation1, ) self.pointwise = Conv2d( in_channels, out_channels, kernel_size=1, bias=not norm2, norm=get_norm(norm2, out_channels), activation=activation2, ) # default initialization weight_init.c2_msra_fill(self.depthwise) weight_init.c2_msra_fill(self.pointwise) def forward(self, x): return self.pointwise(self.depthwise(x)) ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/README.md ================================================ To add a new Op: 1. Create a new directory 2. Implement new ops there 3. Delcare its Python interface in `vision.cpp`. ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #pragma once #include namespace detectron2 { at::Tensor ROIAlignRotated_forward_cpu( const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio); at::Tensor ROIAlignRotated_backward_cpu( const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio); #if defined(WITH_CUDA) || defined(WITH_HIP) at::Tensor ROIAlignRotated_forward_cuda( const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio); at::Tensor ROIAlignRotated_backward_cuda( const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio); #endif // Interface for Python inline at::Tensor ROIAlignRotated_forward( const at::Tensor& input, const at::Tensor& rois, const double spatial_scale, const int64_t pooled_height, const int64_t pooled_width, const int64_t sampling_ratio) { if (input.is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) return ROIAlignRotated_forward_cuda( input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } return ROIAlignRotated_forward_cpu( input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); } inline at::Tensor ROIAlignRotated_backward( const at::Tensor& grad, const at::Tensor& rois, const double spatial_scale, const int64_t pooled_height, const int64_t pooled_width, const int64_t batch_size, const int64_t channels, const int64_t height, const int64_t width, const int64_t sampling_ratio) { if (grad.is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) return ROIAlignRotated_backward_cuda( grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } return ROIAlignRotated_backward_cpu( grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include #include "ROIAlignRotated.h" // Note: this implementation originates from the Caffe2 ROIAlignRotated Op // and PyTorch ROIAlign (non-rotated) Op implementations. // The key difference between this implementation and those ones is // we don't do "legacy offset" in this version, as there aren't many previous // works, if any, using the "legacy" ROIAlignRotated Op. // This would make the interface a bit cleaner. namespace detectron2 { namespace { template struct PreCalc { int pos1; int pos2; int pos3; int pos4; T w1; T w2; T w3; T w4; }; template void pre_calc_for_bilinear_interpolate( const int height, const int width, const int pooled_height, const int pooled_width, const int iy_upper, const int ix_upper, T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w, T cos_theta, T sin_theta, std::vector>& pre_calc) { int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { for (int iy = 0; iy < iy_upper; iy++) { const T yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < ix_upper; ix++) { const T xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta around the center and translate // In image space, (y, x) is the order for Right Handed System, // and this is essentially multiplying the point by a rotation matrix // to rotate it counterclockwise through angle theta. T y = yy * cos_theta - xx * sin_theta + roi_center_h; T x = yy * sin_theta + xx * cos_theta + roi_center_w; // deal with: inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty PreCalc pc; pc.pos1 = 0; pc.pos2 = 0; pc.pos3 = 0; pc.pos4 = 0; pc.w1 = 0; pc.w2 = 0; pc.w3 = 0; pc.w4 = 0; pre_calc[pre_calc_index] = pc; pre_calc_index += 1; continue; } if (y < 0) { y = 0; } if (x < 0) { x = 0; } int y_low = (int)y; int x_low = (int)x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; // save weights and indices PreCalc pc; pc.pos1 = y_low * width + x_low; pc.pos2 = y_low * width + x_high; pc.pos3 = y_high * width + x_low; pc.pos4 = y_high * width + x_high; pc.w1 = w1; pc.w2 = w2; pc.w3 = w3; pc.w4 = w4; pre_calc[pre_calc_index] = pc; pre_calc_index += 1; } } } } } template void bilinear_interpolate_gradient( const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, int& x_low, int& x_high, int& y_low, int& y_high) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty w1 = w2 = w3 = w4 = 0.; x_low = x_high = y_low = y_high = -1; return; } if (y < 0) { y = 0; } if (x < 0) { x = 0; } y_low = (int)y; x_low = (int)x; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // reference in forward // T v1 = input[y_low * width + x_low]; // T v2 = input[y_low * width + x_high]; // T v3 = input[y_high * width + x_low]; // T v4 = input[y_high * width + x_high]; // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; return; } template inline void add(T* address, const T& val) { *address += val; } } // namespace template void ROIAlignRotatedForward( const int nthreads, const T* input, const T& spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, const T* rois, T* output) { int n_rois = nthreads / channels / pooled_width / pooled_height; // (n, c, ph, pw) is an element in the pooled output // can be parallelized using omp // #pragma omp parallel for num_threads(32) for (int n = 0; n < n_rois; n++) { int index_n = n * channels * pooled_width * pooled_height; const T* current_roi = rois + n * 6; int roi_batch_ind = current_roi[0]; // Do not use rounding; this implementation detail is critical // ROIAlignRotated supports align == true, i.e., continuous coordinate // by default, thus the 0.5 offset T offset = (T)0.5; T roi_center_w = current_roi[1] * spatial_scale - offset; T roi_center_h = current_roi[2] * spatial_scale - offset; T roi_width = current_roi[3] * spatial_scale; T roi_height = current_roi[4] * spatial_scale; T theta = current_roi[5] * M_PI / 180.0; T cos_theta = cos(theta); T sin_theta = sin(theta); AT_ASSERTM( roi_width >= 0 && roi_height >= 0, "ROIs in ROIAlignRotated do not have non-negative size!"); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 // we want to precalculate indices and weights shared by all channels, // this is the key point of optimization std::vector> pre_calc( roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. T roi_start_h = -roi_height / 2.0; T roi_start_w = -roi_width / 2.0; pre_calc_for_bilinear_interpolate( height, width, pooled_height, pooled_width, roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta, sin_theta, pre_calc); for (int c = 0; c < channels; c++) { int index_n_c = index_n + c * pooled_width * pooled_height; const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { int index = index_n_c + ph * pooled_width + pw; T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { PreCalc pc = pre_calc[pre_calc_index]; output_val += pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] + pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; pre_calc_index += 1; } } output_val /= count; output[index] = output_val; } // for pw } // for ph } // for c } // for n } template void ROIAlignRotatedBackward( const int nthreads, // may not be contiguous. should index using n_stride, etc const T* grad_output, const T& spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, T* grad_input, const T* rois, const int n_stride, const int c_stride, const int h_stride, const int w_stride) { for (int index = 0; index < nthreads; index++) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* current_roi = rois + n * 6; int roi_batch_ind = current_roi[0]; // Do not use rounding; this implementation detail is critical // ROIAlignRotated supports align == true, i.e., continuous coordinate // by default, thus the 0.5 offset T offset = (T)0.5; T roi_center_w = current_roi[1] * spatial_scale - offset; T roi_center_h = current_roi[2] * spatial_scale - offset; T roi_width = current_roi[3] * spatial_scale; T roi_height = current_roi[4] * spatial_scale; T theta = current_roi[5] * M_PI / 180.0; T cos_theta = cos(theta); T sin_theta = sin(theta); AT_ASSERTM( roi_width >= 0 && roi_height >= 0, "ROIs in ROIAlignRotated do not have non-negative size!"); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); T* offset_grad_input = grad_input + ((roi_batch_ind * channels + c) * height * width); int output_offset = n * n_stride + c * c_stride; const T* offset_grad_output = grad_output + output_offset; const T grad_output_this_bin = offset_grad_output[ph * h_stride + pw * w_stride]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. T roi_start_h = -roi_height / 2.0; T roi_start_w = -roi_width / 2.0; // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta around the center and translate T y = yy * cos_theta - xx * sin_theta + roi_center_h; T x = yy * sin_theta + xx * cos_theta + roi_center_w; T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient( height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); T g1 = grad_output_this_bin * w1 / count; T g2 = grad_output_this_bin * w2 / count; T g3 = grad_output_this_bin * w3 / count; T g4 = grad_output_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { // atomic add is not needed for now since it is single threaded add(offset_grad_input + y_low * width + x_low, static_cast(g1)); add(offset_grad_input + y_low * width + x_high, static_cast(g2)); add(offset_grad_input + y_high * width + x_low, static_cast(g3)); add(offset_grad_input + y_high * width + x_high, static_cast(g4)); } // if } // ix } // iy } // for } // ROIAlignRotatedBackward at::Tensor ROIAlignRotated_forward_cpu( const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor"); AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::CheckedFrom c = "ROIAlign_forward_cpu"; at::checkAllSameType(c, {input_t, rois_t}); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto width = input.size(3); at::Tensor output = at::zeros( {num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; if (output.numel() == 0) { return output; } auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "ROIAlignRotated_forward", [&] { ROIAlignRotatedForward( output_size, input_.data_ptr(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, rois_.data_ptr(), output.data_ptr()); }); return output; } at::Tensor ROIAlignRotated_backward_cpu( const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio) { AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor"); AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; at::CheckedFrom c = "ROIAlignRotated_backward_cpu"; at::checkAllSameType(c, {grad_t, rois_t}); at::Tensor grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); // handle possibly empty gradients if (grad.numel() == 0) { return grad_input; } // get stride values to ensure indexing into gradients is correct. int n_stride = grad.stride(0); int c_stride = grad.stride(1); int h_stride = grad.stride(2); int w_stride = grad.stride(3); auto rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( grad.scalar_type(), "ROIAlignRotated_forward", [&] { ROIAlignRotatedBackward( grad.numel(), grad.data_ptr(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, grad_input.data_ptr(), rois_.data_ptr(), n_stride, c_stride, h_stride, w_stride); }); return grad_input; } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) // Note: this implementation originates from the Caffe2 ROIAlignRotated Op // and PyTorch ROIAlign (non-rotated) Op implementations. // The key difference between this implementation and those ones is // we don't do "legacy offset" in this version, as there aren't many previous // works, if any, using the "legacy" ROIAlignRotated Op. // This would make the interface a bit cleaner. namespace detectron2 { namespace { template __device__ T bilinear_interpolate( const T* input, const int height, const int width, T y, T x) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty return 0; } if (y < 0) { y = 0; } if (x < 0) { x = 0; } int y_low = (int)y; int x_low = (int)x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // do bilinear interpolation T v1 = input[y_low * width + x_low]; T v2 = input[y_low * width + x_high]; T v3 = input[y_high * width + x_low]; T v4 = input[y_high * width + x_high]; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void bilinear_interpolate_gradient( const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, int& x_low, int& x_high, int& y_low, int& y_high) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty w1 = w2 = w3 = w4 = 0.; x_low = x_high = y_low = y_high = -1; return; } if (y < 0) { y = 0; } if (x < 0) { x = 0; } y_low = (int)y; x_low = (int)x; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // reference in forward // T v1 = input[y_low * width + x_low]; // T v2 = input[y_low * width + x_high]; // T v3 = input[y_high * width + x_low]; // T v4 = input[y_high * width + x_high]; // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; return; } } // namespace template __global__ void RoIAlignRotatedForward( const int nthreads, const T* input, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, const T* rois, T* top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* current_roi = rois + n * 6; int roi_batch_ind = current_roi[0]; // Do not use rounding; this implementation detail is critical // ROIAlignRotated supports align == true, i.e., continuous coordinate // by default, thus the 0.5 offset T offset = (T)0.5; T roi_center_w = current_roi[1] * spatial_scale - offset; T roi_center_h = current_roi[2] * spatial_scale - offset; T roi_width = current_roi[3] * spatial_scale; T roi_height = current_roi[4] * spatial_scale; T theta = current_roi[5] * M_PI / 180.0; T cos_theta = cos(theta); T sin_theta = sin(theta); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. T roi_start_h = -roi_height / 2.0; T roi_start_w = -roi_width / 2.0; // We do average (inte gral) pooling inside a bin const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { const T yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta around the center and translate T y = yy * cos_theta - xx * sin_theta + roi_center_h; T x = yy * sin_theta + xx * cos_theta + roi_center_w; T val = bilinear_interpolate(offset_input, height, width, y, x); output_val += val; } } output_val /= count; top_data[index] = output_val; } } template __global__ void RoIAlignRotatedBackwardFeature( const int nthreads, const T* top_diff, const int num_rois, const T spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio, T* bottom_diff, const T* rois) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* current_roi = rois + n * 6; int roi_batch_ind = current_roi[0]; // Do not use rounding; this implementation detail is critical // ROIAlignRotated supports align == true, i.e., continuous coordinate // by default, thus the 0.5 offset T offset = (T)0.5; T roi_center_w = current_roi[1] * spatial_scale - offset; T roi_center_h = current_roi[2] * spatial_scale - offset; T roi_width = current_roi[3] * spatial_scale; T roi_height = current_roi[4] * spatial_scale; T theta = current_roi[5] * M_PI / 180.0; T cos_theta = cos(theta); T sin_theta = sin(theta); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const T* offset_top_diff = top_diff + top_offset; const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. T roi_start_h = -roi_height / 2.0; T roi_start_w = -roi_width / 2.0; // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { const T yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta around the center and translate T y = yy * cos_theta - xx * sin_theta + roi_center_h; T x = yy * sin_theta + xx * cos_theta + roi_center_w; T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient( height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); T g1 = top_diff_this_bin * w1 / count; T g2 = top_diff_this_bin * w2 / count; T g3 = top_diff_this_bin * w3 / count; T g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd( offset_bottom_diff + y_low * width + x_low, static_cast(g1)); atomicAdd( offset_bottom_diff + y_low * width + x_high, static_cast(g2)); atomicAdd( offset_bottom_diff + y_high * width + x_low, static_cast(g3)); atomicAdd( offset_bottom_diff + y_high * width + x_high, static_cast(g4)); } // if } // ix } // iy } // CUDA_1D_KERNEL_LOOP } // RoIAlignRotatedBackward at::Tensor ROIAlignRotated_forward_cuda( const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int sampling_ratio) { AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; at::CheckedFrom c = "ROIAlignRotated_forward_cuda"; at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t}); at::cuda::CUDAGuard device_guard(input.device()); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto width = input.size(3); auto output = at::empty( {num_rois, channels, pooled_height, pooled_width}, input.options()); auto output_size = num_rois * pooled_height * pooled_width * channels; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( at::cuda::ATenCeilDiv( static_cast(output_size), static_cast(512)), static_cast(4096))); dim3 block(512); if (output.numel() == 0) { AT_CUDA_CHECK(cudaGetLastError()); return output; } auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "ROIAlignRotated_forward", [&] { RoIAlignRotatedForward<<>>( output_size, input_.data_ptr(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, rois_.data_ptr(), output.data_ptr()); }); cudaDeviceSynchronize(); AT_CUDA_CHECK(cudaGetLastError()); return output; } // TODO remove the dependency on input and use instead its sizes -> save memory at::Tensor ROIAlignRotated_backward_cuda( const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio) { AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; at::CheckedFrom c = "ROIAlign_backward_cuda"; at::checkAllSameGPU(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t}); at::cuda::CUDAGuard device_guard(grad.device()); auto num_rois = rois.size(0); auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( at::cuda::ATenCeilDiv( static_cast(grad.numel()), static_cast(512)), static_cast(4096))); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES( grad.scalar_type(), "ROIAlignRotated_backward", [&] { RoIAlignRotatedBackwardFeature<<>>( grad.numel(), grad_.data_ptr(), num_rois, spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, grad_input.data_ptr(), rois_.data_ptr()); }); AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #pragma once #include namespace detectron2 { at::Tensor box_iou_rotated_cpu( const at::Tensor& boxes1, const at::Tensor& boxes2); #if defined(WITH_CUDA) || defined(WITH_HIP) at::Tensor box_iou_rotated_cuda( const at::Tensor& boxes1, const at::Tensor& boxes2); #endif // Interface for Python // inline is needed to prevent multiple function definitions when this header is // included by different cpps inline at::Tensor box_iou_rotated( const at::Tensor& boxes1, const at::Tensor& boxes2) { assert(boxes1.device().is_cuda() == boxes2.device().is_cuda()); if (boxes1.device().is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous()); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous()); } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include "box_iou_rotated.h" #include "box_iou_rotated_utils.h" namespace detectron2 { template void box_iou_rotated_cpu_kernel( const at::Tensor& boxes1, const at::Tensor& boxes2, at::Tensor& ious) { auto num_boxes1 = boxes1.size(0); auto num_boxes2 = boxes2.size(0); for (int i = 0; i < num_boxes1; i++) { for (int j = 0; j < num_boxes2; j++) { ious[i * num_boxes2 + j] = single_box_iou_rotated( boxes1[i].data_ptr(), boxes2[j].data_ptr()); } } } at::Tensor box_iou_rotated_cpu( // input must be contiguous: const at::Tensor& boxes1, const at::Tensor& boxes2) { auto num_boxes1 = boxes1.size(0); auto num_boxes2 = boxes2.size(0); at::Tensor ious = at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat)); box_iou_rotated_cpu_kernel(boxes1, boxes2, ious); // reshape from 1d array to 2d array auto shape = std::vector{num_boxes1, num_boxes2}; return ious.reshape(shape); } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include #include #include #include #include "box_iou_rotated_utils.h" namespace detectron2 { // 2D block with 32 * 16 = 512 threads per block const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 16; template __global__ void box_iou_rotated_cuda_kernel( const int n_boxes1, const int n_boxes2, const T* dev_boxes1, const T* dev_boxes2, T* dev_ious) { const int row_start = blockIdx.x * blockDim.x; const int col_start = blockIdx.y * blockDim.y; const int row_size = min(n_boxes1 - row_start, blockDim.x); const int col_size = min(n_boxes2 - col_start, blockDim.y); __shared__ float block_boxes1[BLOCK_DIM_X * 5]; __shared__ float block_boxes2[BLOCK_DIM_Y * 5]; // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y if (threadIdx.x < row_size && threadIdx.y == 0) { block_boxes1[threadIdx.x * 5 + 0] = dev_boxes1[(row_start + threadIdx.x) * 5 + 0]; block_boxes1[threadIdx.x * 5 + 1] = dev_boxes1[(row_start + threadIdx.x) * 5 + 1]; block_boxes1[threadIdx.x * 5 + 2] = dev_boxes1[(row_start + threadIdx.x) * 5 + 2]; block_boxes1[threadIdx.x * 5 + 3] = dev_boxes1[(row_start + threadIdx.x) * 5 + 3]; block_boxes1[threadIdx.x * 5 + 4] = dev_boxes1[(row_start + threadIdx.x) * 5 + 4]; } if (threadIdx.x < col_size && threadIdx.y == 0) { block_boxes2[threadIdx.x * 5 + 0] = dev_boxes2[(col_start + threadIdx.x) * 5 + 0]; block_boxes2[threadIdx.x * 5 + 1] = dev_boxes2[(col_start + threadIdx.x) * 5 + 1]; block_boxes2[threadIdx.x * 5 + 2] = dev_boxes2[(col_start + threadIdx.x) * 5 + 2]; block_boxes2[threadIdx.x * 5 + 3] = dev_boxes2[(col_start + threadIdx.x) * 5 + 3]; block_boxes2[threadIdx.x * 5 + 4] = dev_boxes2[(col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size && threadIdx.y < col_size) { int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y; dev_ious[offset] = single_box_iou_rotated( block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); } } at::Tensor box_iou_rotated_cuda( // input must be contiguous const at::Tensor& boxes1, const at::Tensor& boxes2) { using scalar_t = float; AT_ASSERTM( boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor"); AT_ASSERTM( boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor"); AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor"); AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor"); at::cuda::CUDAGuard device_guard(boxes1.device()); auto num_boxes1 = boxes1.size(0); auto num_boxes2 = boxes2.size(0); at::Tensor ious = at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat)); bool transpose = false; if (num_boxes1 > 0 && num_boxes2 > 0) { scalar_t *data1 = boxes1.data_ptr(), *data2 = boxes2.data_ptr(); if (num_boxes2 > 65535 * BLOCK_DIM_Y) { AT_ASSERTM( num_boxes1 <= 65535 * BLOCK_DIM_Y, "Too many boxes for box_iou_rotated_cuda!"); // x dim is allowed to be large, but y dim cannot, // so we transpose the two to avoid "invalid configuration argument" // error. We assume one of them is small. Otherwise the result is hard to // fit in memory anyway. std::swap(num_boxes1, num_boxes2); std::swap(data1, data2); transpose = true; } const int blocks_x = at::cuda::ATenCeilDiv(static_cast(num_boxes1), BLOCK_DIM_X); const int blocks_y = at::cuda::ATenCeilDiv(static_cast(num_boxes2), BLOCK_DIM_Y); dim3 blocks(blocks_x, blocks_y); dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); box_iou_rotated_cuda_kernel<<>>( num_boxes1, num_boxes2, data1, data2, (scalar_t*)ious.data_ptr()); AT_CUDA_CHECK(cudaGetLastError()); } // reshape from 1d array to 2d array auto shape = std::vector{num_boxes1, num_boxes2}; if (transpose) { return ious.view(shape).t(); } else { return ious.view(shape); } } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #pragma once #include #include #if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1 // Designates functions callable from the host (CPU) and the device (GPU) #define HOST_DEVICE __host__ __device__ #define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ #else #include #define HOST_DEVICE #define HOST_DEVICE_INLINE HOST_DEVICE inline #endif namespace detectron2 { namespace { template struct RotatedBox { T x_ctr, y_ctr, w, h, a; }; template struct Point { T x, y; HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {} HOST_DEVICE_INLINE Point operator+(const Point& p) const { return Point(x + p.x, y + p.y); } HOST_DEVICE_INLINE Point& operator+=(const Point& p) { x += p.x; y += p.y; return *this; } HOST_DEVICE_INLINE Point operator-(const Point& p) const { return Point(x - p.x, y - p.y); } HOST_DEVICE_INLINE Point operator*(const T coeff) const { return Point(x * coeff, y * coeff); } }; template HOST_DEVICE_INLINE T dot_2d(const Point& A, const Point& B) { return A.x * B.x + A.y * B.y; } // R: result type. can be different from input type template HOST_DEVICE_INLINE R cross_2d(const Point& A, const Point& B) { return static_cast(A.x) * static_cast(B.y) - static_cast(B.x) * static_cast(A.y); } template HOST_DEVICE_INLINE void get_rotated_vertices( const RotatedBox& box, Point (&pts)[4]) { // M_PI / 180. == 0.01745329251 double theta = box.a * 0.01745329251; T cosTheta2 = (T)cos(theta) * 0.5f; T sinTheta2 = (T)sin(theta) * 0.5f; // y: top --> down; x: left --> right pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w; pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w; pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; pts[2].x = 2 * box.x_ctr - pts[0].x; pts[2].y = 2 * box.y_ctr - pts[0].y; pts[3].x = 2 * box.x_ctr - pts[1].x; pts[3].y = 2 * box.y_ctr - pts[1].y; } template HOST_DEVICE_INLINE int get_intersection_points( const Point (&pts1)[4], const Point (&pts2)[4], Point (&intersections)[24]) { // Line vector // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] Point vec1[4], vec2[4]; for (int i = 0; i < 4; i++) { vec1[i] = pts1[(i + 1) % 4] - pts1[i]; vec2[i] = pts2[(i + 1) % 4] - pts2[i]; } // When computing the intersection area, it doesn't hurt if we have // more (duplicated/approximate) intersections/vertices than needed, // while it can cause drastic difference if we miss an intersection/vertex. // Therefore, we add an epsilon to relax the comparisons between // the float point numbers that decide the intersection points. double EPS = 1e-5; // Line test - test all line combos for intersection int num = 0; // number of intersections for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { // Solve for 2x2 Ax=b T det = cross_2d(vec2[j], vec1[i]); // This takes care of parallel lines if (fabs(det) <= 1e-14) { continue; } auto vec12 = pts2[j] - pts1[i]; T t1 = cross_2d(vec2[j], vec12) / det; T t2 = cross_2d(vec1[i], vec12) / det; if (t1 > -EPS && t1 < 1.0f + EPS && t2 > -EPS && t2 < 1.0f + EPS) { intersections[num++] = pts1[i] + vec1[i] * t1; } } } // Check for vertices of rect1 inside rect2 { const auto& AB = vec2[0]; const auto& DA = vec2[3]; auto ABdotAB = dot_2d(AB, AB); auto ADdotAD = dot_2d(DA, DA); for (int i = 0; i < 4; i++) { // assume ABCD is the rectangle, and P is the point to be judged // P is inside ABCD iff. P's projection on AB lies within AB // and P's projection on AD lies within AD auto AP = pts1[i] - pts2[0]; auto APdotAB = dot_2d(AP, AB); auto APdotAD = -dot_2d(AP, DA); if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) && (APdotAD < ADdotAD + EPS)) { intersections[num++] = pts1[i]; } } } // Reverse the check - check for vertices of rect2 inside rect1 { const auto& AB = vec1[0]; const auto& DA = vec1[3]; auto ABdotAB = dot_2d(AB, AB); auto ADdotAD = dot_2d(DA, DA); for (int i = 0; i < 4; i++) { auto AP = pts2[i] - pts1[0]; auto APdotAB = dot_2d(AP, AB); auto APdotAD = -dot_2d(AP, DA); if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) && (APdotAD < ADdotAD + EPS)) { intersections[num++] = pts2[i]; } } } return num; } template HOST_DEVICE_INLINE int convex_hull_graham( const Point (&p)[24], const int& num_in, Point (&q)[24], bool shift_to_zero = false) { assert(num_in >= 2); // Step 1: // Find point with minimum y // if more than 1 points have the same minimum y, // pick the one with the minimum x. int t = 0; for (int i = 1; i < num_in; i++) { if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { t = i; } } auto& start = p[t]; // starting point // Step 2: // Subtract starting point from every points (for sorting in the next step) for (int i = 0; i < num_in; i++) { q[i] = p[i] - start; } // Swap the starting point to position 0 auto tmp = q[0]; q[0] = q[t]; q[t] = tmp; // Step 3: // Sort point 1 ~ num_in according to their relative cross-product values // (essentially sorting according to angles) // If the angles are the same, sort according to their distance to origin T dist[24]; #if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1 // compute distance to origin before sort, and sort them together with the // points for (int i = 0; i < num_in; i++) { dist[i] = dot_2d(q[i], q[i]); } // CUDA version // In the future, we can potentially use thrust // for sorting here to improve speed (though not guaranteed) for (int i = 1; i < num_in - 1; i++) { for (int j = i + 1; j < num_in; j++) { T crossProduct = cross_2d(q[i], q[j]); if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { auto q_tmp = q[i]; q[i] = q[j]; q[j] = q_tmp; auto dist_tmp = dist[i]; dist[i] = dist[j]; dist[j] = dist_tmp; } } } #else // CPU version std::sort( q + 1, q + num_in, [](const Point& A, const Point& B) -> bool { T temp = cross_2d(A, B); if (fabs(temp) < 1e-6) { return dot_2d(A, A) < dot_2d(B, B); } else { return temp > 0; } }); // compute distance to origin after sort, since the points are now different. for (int i = 0; i < num_in; i++) { dist[i] = dot_2d(q[i], q[i]); } #endif // Step 4: // Make sure there are at least 2 points (that don't overlap with each other) // in the stack int k; // index of the non-overlapped second point for (k = 1; k < num_in; k++) { if (dist[k] > 1e-8) { break; } } if (k == num_in) { // We reach the end, which means the convex hull is just one point q[0] = p[t]; return 1; } q[1] = q[k]; int m = 2; // 2 points in the stack // Step 5: // Finally we can start the scanning process. // When a non-convex relationship between the 3 points is found // (either concave shape or duplicated points), // we pop the previous point from the stack // until the 3-point relationship is convex again, or // until the stack only contains two points for (int i = k + 1; i < num_in; i++) { while (m > 1) { auto q1 = q[i] - q[m - 2], q2 = q[m - 1] - q[m - 2]; // cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) - // q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we // compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means // round to nearest floating point). if (q1.x * q2.y >= q2.x * q1.y) m--; else break; } // Using double also helps, but float can solve the issue for now. // while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) // >= 0) { // m--; // } q[m++] = q[i]; } // Step 6 (Optional): // In general sense we need the original coordinates, so we // need to shift the points back (reverting Step 2) // But if we're only interested in getting the area/perimeter of the shape // We can simply return. if (!shift_to_zero) { for (int i = 0; i < m; i++) { q[i] += start; } } return m; } template HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int& m) { if (m <= 2) { return 0; } T area = 0; for (int i = 1; i < m - 1; i++) { area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); } return area / 2.0; } template HOST_DEVICE_INLINE T rotated_boxes_intersection( const RotatedBox& box1, const RotatedBox& box2) { // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned // from rotated_rect_intersection_pts Point intersectPts[24], orderedPts[24]; Point pts1[4]; Point pts2[4]; get_rotated_vertices(box1, pts1); get_rotated_vertices(box2, pts2); int num = get_intersection_points(pts1, pts2, intersectPts); if (num <= 2) { return 0.0; } // Convex Hull to order the intersection points in clockwise order and find // the contour area. int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); return polygon_area(orderedPts, num_convex); } } // namespace template HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) { // shift center to the middle point to achieve higher precision in result RotatedBox box1, box2; auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; box1.x_ctr = box1_raw[0] - center_shift_x; box1.y_ctr = box1_raw[1] - center_shift_y; box1.w = box1_raw[2]; box1.h = box1_raw[3]; box1.a = box1_raw[4]; box2.x_ctr = box2_raw[0] - center_shift_x; box2.y_ctr = box2_raw[1] - center_shift_y; box2.w = box2_raw[2]; box2.h = box2_raw[3]; box2.a = box2_raw[4]; T area1 = box1.w * box1.h; T area2 = box2.w * box2.h; if (area1 < 1e-14 || area2 < 1e-14) { return 0.f; } T intersection = rotated_boxes_intersection(box1, box2); T iou = intersection / (area1 + area2 - intersection); return iou; } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/cocoeval/cocoeval.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include "cocoeval.h" #include #include #include #include using namespace pybind11::literals; namespace detectron2 { namespace COCOeval { // Sort detections from highest score to lowest, such that // detection_instances[detection_sorted_indices[t]] >= // detection_instances[detection_sorted_indices[t+1]]. Use stable_sort to match // original COCO API void SortInstancesByDetectionScore( const std::vector& detection_instances, std::vector* detection_sorted_indices) { detection_sorted_indices->resize(detection_instances.size()); std::iota( detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); std::stable_sort( detection_sorted_indices->begin(), detection_sorted_indices->end(), [&detection_instances](size_t j1, size_t j2) { return detection_instances[j1].score > detection_instances[j2].score; }); } // Partition the ground truth objects based on whether or not to ignore them // based on area void SortInstancesByIgnore( const std::array& area_range, const std::vector& ground_truth_instances, std::vector* ground_truth_sorted_indices, std::vector* ignores) { ignores->clear(); ignores->reserve(ground_truth_instances.size()); for (auto o : ground_truth_instances) { ignores->push_back( o.ignore || o.area < area_range[0] || o.area > area_range[1]); } ground_truth_sorted_indices->resize(ground_truth_instances.size()); std::iota( ground_truth_sorted_indices->begin(), ground_truth_sorted_indices->end(), 0); std::stable_sort( ground_truth_sorted_indices->begin(), ground_truth_sorted_indices->end(), [&ignores](size_t j1, size_t j2) { return (int)(*ignores)[j1] < (int)(*ignores)[j2]; }); } // For each IOU threshold, greedily match each detected instance to a ground // truth instance (if possible) and store the results void MatchDetectionsToGroundTruth( const std::vector& detection_instances, const std::vector& detection_sorted_indices, const std::vector& ground_truth_instances, const std::vector& ground_truth_sorted_indices, const std::vector& ignores, const std::vector>& ious, const std::vector& iou_thresholds, const std::array& area_range, ImageEvaluation* results) { // Initialize memory to store return data matches and ignore const int num_iou_thresholds = iou_thresholds.size(); const int num_ground_truth = ground_truth_sorted_indices.size(); const int num_detections = detection_sorted_indices.size(); std::vector ground_truth_matches( num_iou_thresholds * num_ground_truth, 0); std::vector& detection_matches = results->detection_matches; std::vector& detection_ignores = results->detection_ignores; std::vector& ground_truth_ignores = results->ground_truth_ignores; detection_matches.resize(num_iou_thresholds * num_detections, 0); detection_ignores.resize(num_iou_thresholds * num_detections, false); ground_truth_ignores.resize(num_ground_truth); for (auto g = 0; g < num_ground_truth; ++g) { ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]]; } for (auto t = 0; t < num_iou_thresholds; ++t) { for (auto d = 0; d < num_detections; ++d) { // information about best match so far (match=-1 -> unmatched) double best_iou = std::min(iou_thresholds[t], 1 - 1e-10); int match = -1; for (auto g = 0; g < num_ground_truth; ++g) { // if this ground truth instance is already matched and not a // crowd, it cannot be matched to another detection if (ground_truth_matches[t * num_ground_truth + g] > 0 && !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) { continue; } // if detected instance matched to a regular ground truth // instance, we can break on the first ground truth instance // tagged as ignore (because they are sorted by the ignore tag) if (match >= 0 && !ground_truth_ignores[match] && ground_truth_ignores[g]) { break; } // if IOU overlap is the best so far, store the match appropriately if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) { best_iou = ious[d][ground_truth_sorted_indices[g]]; match = g; } } // if match was made, store id of match for both detection and // ground truth if (match >= 0) { detection_ignores[t * num_detections + d] = ground_truth_ignores[match]; detection_matches[t * num_detections + d] = ground_truth_instances[ground_truth_sorted_indices[match]].id; ground_truth_matches[t * num_ground_truth + match] = detection_instances[detection_sorted_indices[d]].id; } // set unmatched detections outside of area range to ignore const InstanceAnnotation& detection = detection_instances[detection_sorted_indices[d]]; detection_ignores[t * num_detections + d] = detection_ignores[t * num_detections + d] || (detection_matches[t * num_detections + d] == 0 && (detection.area < area_range[0] || detection.area > area_range[1])); } } // store detection score results results->detection_scores.resize(detection_sorted_indices.size()); for (size_t d = 0; d < detection_sorted_indices.size(); ++d) { results->detection_scores[d] = detection_instances[detection_sorted_indices[d]].score; } } std::vector EvaluateImages( const std::vector>& area_ranges, int max_detections, const std::vector& iou_thresholds, const ImageCategoryInstances>& image_category_ious, const ImageCategoryInstances& image_category_ground_truth_instances, const ImageCategoryInstances& image_category_detection_instances) { const int num_area_ranges = area_ranges.size(); const int num_images = image_category_ground_truth_instances.size(); const int num_categories = image_category_ious.size() > 0 ? image_category_ious[0].size() : 0; std::vector detection_sorted_indices; std::vector ground_truth_sorted_indices; std::vector ignores; std::vector results_all( num_images * num_area_ranges * num_categories); // Store results for each image, category, and area range combination. Results // for each IOU threshold are packed into the same ImageEvaluation object for (auto i = 0; i < num_images; ++i) { for (auto c = 0; c < num_categories; ++c) { const std::vector& ground_truth_instances = image_category_ground_truth_instances[i][c]; const std::vector& detection_instances = image_category_detection_instances[i][c]; SortInstancesByDetectionScore( detection_instances, &detection_sorted_indices); if ((int)detection_sorted_indices.size() > max_detections) { detection_sorted_indices.resize(max_detections); } for (size_t a = 0; a < area_ranges.size(); ++a) { SortInstancesByIgnore( area_ranges[a], ground_truth_instances, &ground_truth_sorted_indices, &ignores); MatchDetectionsToGroundTruth( detection_instances, detection_sorted_indices, ground_truth_instances, ground_truth_sorted_indices, ignores, image_category_ious[i][c], iou_thresholds, area_ranges[a], &results_all [c * num_area_ranges * num_images + a * num_images + i]); } } } return results_all; } // Convert a python list to a vector template std::vector list_to_vec(const py::list& l) { std::vector v(py::len(l)); for (int i = 0; i < (int)py::len(l); ++i) { v[i] = l[i].cast(); } return v; } // Helper function to Accumulate() // Considers the evaluation results applicable to a particular category, area // range, and max_detections parameter setting, which begin at // evaluations[evaluation_index]. Extracts a sorted list of length n of all // applicable detection instances concatenated across all images in the dataset, // which are represented by the outputs evaluation_indices, detection_scores, // image_detection_indices, and detection_sorted_indices--all of which are // length n. evaluation_indices[i] stores the applicable index into // evaluations[] for instance i, which has detection score detection_score[i], // and is the image_detection_indices[i]'th of the list of detections // for the image containing i. detection_sorted_indices[] defines a sorted // permutation of the 3 other outputs int BuildSortedDetectionList( const std::vector& evaluations, const int64_t evaluation_index, const int64_t num_images, const int max_detections, std::vector* evaluation_indices, std::vector* detection_scores, std::vector* detection_sorted_indices, std::vector* image_detection_indices) { assert(evaluations.size() >= evaluation_index + num_images); // Extract a list of object instances of the applicable category, area // range, and max detections requirements such that they can be sorted image_detection_indices->clear(); evaluation_indices->clear(); detection_scores->clear(); image_detection_indices->reserve(num_images * max_detections); evaluation_indices->reserve(num_images * max_detections); detection_scores->reserve(num_images * max_detections); int num_valid_ground_truth = 0; for (auto i = 0; i < num_images; ++i) { const ImageEvaluation& evaluation = evaluations[evaluation_index + i]; for (int d = 0; d < (int)evaluation.detection_scores.size() && d < max_detections; ++d) { // detected instances evaluation_indices->push_back(evaluation_index + i); image_detection_indices->push_back(d); detection_scores->push_back(evaluation.detection_scores[d]); } for (auto ground_truth_ignore : evaluation.ground_truth_ignores) { if (!ground_truth_ignore) { ++num_valid_ground_truth; } } } // Sort detections by decreasing score, using stable sort to match // python implementation detection_sorted_indices->resize(detection_scores->size()); std::iota( detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); std::stable_sort( detection_sorted_indices->begin(), detection_sorted_indices->end(), [&detection_scores](size_t j1, size_t j2) { return (*detection_scores)[j1] > (*detection_scores)[j2]; }); return num_valid_ground_truth; } // Helper function to Accumulate() // Compute a precision recall curve given a sorted list of detected instances // encoded in evaluations, evaluation_indices, detection_scores, // detection_sorted_indices, image_detection_indices (see // BuildSortedDetectionList()). Using vectors precisions and recalls // and temporary storage, output the results into precisions_out, recalls_out, // and scores_out, which are large buffers containing many precion/recall curves // for all possible parameter settings, with precisions_out_index and // recalls_out_index defining the applicable indices to store results. void ComputePrecisionRecallCurve( const int64_t precisions_out_index, const int64_t precisions_out_stride, const int64_t recalls_out_index, const std::vector& recall_thresholds, const int iou_threshold_index, const int num_iou_thresholds, const int num_valid_ground_truth, const std::vector& evaluations, const std::vector& evaluation_indices, const std::vector& detection_scores, const std::vector& detection_sorted_indices, const std::vector& image_detection_indices, std::vector* precisions, std::vector* recalls, std::vector* precisions_out, std::vector* scores_out, std::vector* recalls_out) { assert(recalls_out->size() > recalls_out_index); // Compute precision/recall for each instance in the sorted list of detections int64_t true_positives_sum = 0, false_positives_sum = 0; precisions->clear(); recalls->clear(); precisions->reserve(detection_sorted_indices.size()); recalls->reserve(detection_sorted_indices.size()); assert(!evaluations.empty() || detection_sorted_indices.empty()); for (auto detection_sorted_index : detection_sorted_indices) { const ImageEvaluation& evaluation = evaluations[evaluation_indices[detection_sorted_index]]; const auto num_detections = evaluation.detection_matches.size() / num_iou_thresholds; const auto detection_index = iou_threshold_index * num_detections + image_detection_indices[detection_sorted_index]; assert(evaluation.detection_matches.size() > detection_index); assert(evaluation.detection_ignores.size() > detection_index); const int64_t detection_match = evaluation.detection_matches[detection_index]; const bool detection_ignores = evaluation.detection_ignores[detection_index]; const auto true_positive = detection_match > 0 && !detection_ignores; const auto false_positive = detection_match == 0 && !detection_ignores; if (true_positive) { ++true_positives_sum; } if (false_positive) { ++false_positives_sum; } const double recall = static_cast(true_positives_sum) / num_valid_ground_truth; recalls->push_back(recall); const int64_t num_valid_detections = true_positives_sum + false_positives_sum; const double precision = num_valid_detections > 0 ? static_cast(true_positives_sum) / num_valid_detections : 0.0; precisions->push_back(precision); } (*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0; for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) { if ((*precisions)[i] > (*precisions)[i - 1]) { (*precisions)[i - 1] = (*precisions)[i]; } } // Sample the per instance precision/recall list at each recall threshold for (size_t r = 0; r < recall_thresholds.size(); ++r) { // first index in recalls >= recall_thresholds[r] std::vector::iterator low = std::lower_bound( recalls->begin(), recalls->end(), recall_thresholds[r]); size_t precisions_index = low - recalls->begin(); const auto results_ind = precisions_out_index + r * precisions_out_stride; assert(results_ind < precisions_out->size()); assert(results_ind < scores_out->size()); if (precisions_index < precisions->size()) { (*precisions_out)[results_ind] = (*precisions)[precisions_index]; (*scores_out)[results_ind] = detection_scores[detection_sorted_indices[precisions_index]]; } else { (*precisions_out)[results_ind] = 0; (*scores_out)[results_ind] = 0; } } } py::dict Accumulate( const py::object& params, const std::vector& evaluations) { const std::vector recall_thresholds = list_to_vec(params.attr("recThrs")); const std::vector max_detections = list_to_vec(params.attr("maxDets")); const int num_iou_thresholds = py::len(params.attr("iouThrs")); const int num_recall_thresholds = py::len(params.attr("recThrs")); const int num_categories = params.attr("useCats").cast() == 1 ? py::len(params.attr("catIds")) : 1; const int num_area_ranges = py::len(params.attr("areaRng")); const int num_max_detections = py::len(params.attr("maxDets")); const int num_images = py::len(params.attr("imgIds")); std::vector precisions_out( num_iou_thresholds * num_recall_thresholds * num_categories * num_area_ranges * num_max_detections, -1); std::vector recalls_out( num_iou_thresholds * num_categories * num_area_ranges * num_max_detections, -1); std::vector scores_out( num_iou_thresholds * num_recall_thresholds * num_categories * num_area_ranges * num_max_detections, -1); // Consider the list of all detected instances in the entire dataset in one // large list. evaluation_indices, detection_scores, // image_detection_indices, and detection_sorted_indices all have the same // length as this list, such that each entry corresponds to one detected // instance std::vector evaluation_indices; // indices into evaluations[] std::vector detection_scores; // detection scores of each instance std::vector detection_sorted_indices; // sorted indices of all // instances in the dataset std::vector image_detection_indices; // indices into the list of detected instances in // the same image as each instance std::vector precisions, recalls; for (auto c = 0; c < num_categories; ++c) { for (auto a = 0; a < num_area_ranges; ++a) { for (auto m = 0; m < num_max_detections; ++m) { // The COCO PythonAPI assumes evaluations[] (the return value of // COCOeval::EvaluateImages() is one long list storing results for each // combination of category, area range, and image id, with categories in // the outermost loop and images in the innermost loop. const int64_t evaluations_index = c * num_area_ranges * num_images + a * num_images; int num_valid_ground_truth = BuildSortedDetectionList( evaluations, evaluations_index, num_images, max_detections[m], &evaluation_indices, &detection_scores, &detection_sorted_indices, &image_detection_indices); if (num_valid_ground_truth == 0) { continue; } for (auto t = 0; t < num_iou_thresholds; ++t) { // recalls_out is a flattened vectors representing a // num_iou_thresholds X num_categories X num_area_ranges X // num_max_detections matrix const int64_t recalls_out_index = t * num_categories * num_area_ranges * num_max_detections + c * num_area_ranges * num_max_detections + a * num_max_detections + m; // precisions_out and scores_out are flattened vectors // representing a num_iou_thresholds X num_recall_thresholds X // num_categories X num_area_ranges X num_max_detections matrix const int64_t precisions_out_stride = num_categories * num_area_ranges * num_max_detections; const int64_t precisions_out_index = t * num_recall_thresholds * num_categories * num_area_ranges * num_max_detections + c * num_area_ranges * num_max_detections + a * num_max_detections + m; ComputePrecisionRecallCurve( precisions_out_index, precisions_out_stride, recalls_out_index, recall_thresholds, t, num_iou_thresholds, num_valid_ground_truth, evaluations, evaluation_indices, detection_scores, detection_sorted_indices, image_detection_indices, &precisions, &recalls, &precisions_out, &scores_out, &recalls_out); } } } } time_t rawtime; struct tm local_time; std::array buffer; time(&rawtime); #ifdef _WIN32 localtime_s(&local_time, &rawtime); #else localtime_r(&rawtime, &local_time); #endif strftime( buffer.data(), 200, "%Y-%m-%d %H:%num_max_detections:%S", &local_time); return py::dict( "params"_a = params, "counts"_a = std::vector( {num_iou_thresholds, num_recall_thresholds, num_categories, num_area_ranges, num_max_detections}), "date"_a = buffer, "precision"_a = precisions_out, "recall"_a = recalls_out, "scores"_a = scores_out); } } // namespace COCOeval } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/cocoeval/cocoeval.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #pragma once #include #include #include #include #include namespace py = pybind11; namespace detectron2 { namespace COCOeval { // Annotation data for a single object instance in an image struct InstanceAnnotation { InstanceAnnotation( uint64_t id, double score, double area, bool is_crowd, bool ignore) : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {} uint64_t id; double score = 0.; double area = 0.; bool is_crowd = false; bool ignore = false; }; // Stores intermediate results for evaluating detection results for a single // image that has D detected instances and G ground truth instances. This stores // matches between detected and ground truth instances struct ImageEvaluation { // For each of the D detected instances, the id of the matched ground truth // instance, or 0 if unmatched std::vector detection_matches; // The detection score of each of the D detected instances std::vector detection_scores; // Marks whether or not each of G instances was ignored from evaluation (e.g., // because it's outside area_range) std::vector ground_truth_ignores; // Marks whether or not each of D instances was ignored from evaluation (e.g., // because it's outside aRng) std::vector detection_ignores; }; template using ImageCategoryInstances = std::vector>>; // C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each // combination of image, category, area range settings, and IOU thresholds to // evaluate, it matches detected instances to ground truth instances and stores // the results into a vector of ImageEvaluation results, which will be // interpreted by the COCOeval::Accumulate() function to produce precion-recall // curves. The parameters of nested vectors have the following semantics: // image_category_ious[i][c][d][g] is the intersection over union of the d'th // detected instance and g'th ground truth instance of // category category_ids[c] in image image_ids[i] // image_category_ground_truth_instances[i][c] is a vector of ground truth // instances in image image_ids[i] of category category_ids[c] // image_category_detection_instances[i][c] is a vector of detected // instances in image image_ids[i] of category category_ids[c] std::vector EvaluateImages( const std::vector>& area_ranges, // vector of 2-tuples int max_detections, const std::vector& iou_thresholds, const ImageCategoryInstances>& image_category_ious, const ImageCategoryInstances& image_category_ground_truth_instances, const ImageCategoryInstances& image_category_detection_instances); // C++ implementation of COCOeval.accumulate(), which generates precision // recall curves for each set of category, IOU threshold, detection area range, // and max number of detections parameters. It is assumed that the parameter // evaluations is the return value of the functon COCOeval::EvaluateImages(), // which was called with the same parameter settings params py::dict Accumulate( const py::object& params, const std::vector& evalutations); } // namespace COCOeval } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/cuda_version.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include namespace detectron2 { int get_cudart_version() { // Not a ROCM platform: Either HIP is not used, or // it is used, but platform is not ROCM (i.e. it is CUDA) #if !defined(__HIP_PLATFORM_HCC__) return CUDART_VERSION; #else int version = 0; #if HIP_VERSION_MAJOR != 0 // Create a convention similar to that of CUDA, as assumed by other // parts of the code. version = HIP_VERSION_MINOR; version += (HIP_VERSION_MAJOR * 100); #else hipRuntimeGetVersion(&version); #endif return version; #endif } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/deformable/deform_conv.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #pragma once #include namespace detectron2 { #if defined(WITH_CUDA) || defined(WITH_HIP) int deform_conv_forward_cuda( at::Tensor input, at::Tensor weight, at::Tensor offset, at::Tensor output, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step); int deform_conv_backward_input_cuda( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradInput, at::Tensor gradOffset, at::Tensor weight, at::Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step); int deform_conv_backward_parameters_cuda( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradWeight, // at::Tensor gradBias, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, float scale, int im2col_step); void modulated_deform_conv_cuda_forward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, int kernel_h, int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias); void modulated_deform_conv_cuda_backward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor columns, at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias); #endif inline int deform_conv_forward( at::Tensor input, at::Tensor weight, at::Tensor offset, at::Tensor output, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { if (input.is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); return deform_conv_forward_cuda( input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, im2col_step); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } AT_ERROR("This operator is not implemented on CPU"); } inline int deform_conv_backward_input( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradInput, at::Tensor gradOffset, at::Tensor weight, at::Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { if (gradOutput.is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!"); TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); return deform_conv_backward_input_cuda( input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, im2col_step); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } AT_ERROR("This operator is not implemented on CPU"); } inline int deform_conv_backward_filter( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradWeight, // at::Tensor gradBias, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, float scale, int im2col_step) { if (gradOutput.is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!"); TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); return deform_conv_backward_parameters_cuda( input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group, scale, im2col_step); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } AT_ERROR("This operator is not implemented on CPU"); } inline void modulated_deform_conv_forward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, int kernel_h, int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias) { if (input.is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!"); TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); return modulated_deform_conv_cuda_forward( input, weight, bias, ones, offset, mask, output, columns, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, deformable_group, with_bias); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } AT_ERROR("This operator is not implemented on CPU"); } inline void modulated_deform_conv_backward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor columns, at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias) { if (grad_output.is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!"); TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!"); TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); return modulated_deform_conv_cuda_backward( input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, deformable_group, with_bias); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } AT_ERROR("This operator is not implemented on CPU"); } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/deformable/deform_conv_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. // modified from // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp // Original license: Apache 2.0 // modify from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c // Original license: Apache 2.0 #include #include "deform_conv.h" #include #include namespace detectron2 { void deformable_im2col( const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor data_col); void deformable_col2im( const at::Tensor data_col, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_im); void deformable_col2im_coord( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_offset); void modulated_deformable_im2col_cuda( const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor data_col); void modulated_deformable_col2im_cuda( const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_im); void modulated_deformable_col2im_coord_cuda( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_offset, at::Tensor grad_mask); void shape_check( at::Tensor input, at::Tensor offset, at::Tensor* gradOutput, at::Tensor weight, int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, int group, int deformable_group) { TORCH_CHECK( weight.ndimension() == 4, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s", weight.ndimension()); TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); TORCH_CHECK( kW > 0 && kH > 0, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); TORCH_CHECK( (weight.size(2) == kH && weight.size(3) == kW), "kernel size should be consistent with weight, ", "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, kW, weight.size(2), weight.size(3)); TORCH_CHECK( dW > 0 && dH > 0, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); TORCH_CHECK( dilationW > 0 && dilationH > 0, "dilation should be greater than 0, but got dilationH: %d dilationW: %d", dilationH, dilationW); int ndim = input.ndimension(); int dimf = 0; int dimh = 1; int dimw = 2; if (ndim == 4) { dimf++; dimh++; dimw++; } TORCH_CHECK( ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", ndim); long nInputPlane = weight.size(1) * group; long inputHeight = input.size(dimh); long inputWidth = input.size(dimw); long nOutputPlane = weight.size(0); long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; TORCH_CHECK( nInputPlane % deformable_group == 0, "input channels must divide deformable group size"); if (outputWidth < 1 || outputHeight < 1) AT_ERROR( "Given input size: (%ld x %ld x %ld). " "Calculated output size: (%ld x %ld x %ld). Output size is too small", nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, outputWidth); TORCH_CHECK( input.size(1) == nInputPlane, "invalid number of input planes, expected: %d, but got: %d", nInputPlane, input.size(1)); TORCH_CHECK( (inputHeight + 2 * padH >= kH && inputWidth + 2 * padW >= kW), "input image is smaller than kernel"); TORCH_CHECK( (offset.size(2) == outputHeight && offset.size(3) == outputWidth), "invalid spatial size of offset, expected height: %d width: %d, but " "got height: %d width: %d", outputHeight, outputWidth, offset.size(2), offset.size(3)); TORCH_CHECK( (offset.size(1) == deformable_group * 2 * kH * kW), "invalid number of channels of offset"); if (gradOutput != NULL) { TORCH_CHECK( gradOutput->size(dimf) == nOutputPlane, "invalid number of gradOutput planes, expected: %d, but got: %d", nOutputPlane, gradOutput->size(dimf)); TORCH_CHECK( (gradOutput->size(dimh) == outputHeight && gradOutput->size(dimw) == outputWidth), "invalid size of gradOutput, expected height: %d width: %d , but " "got height: %d width: %d", outputHeight, outputWidth, gradOutput->size(dimh), gradOutput->size(dimw)); } } int deform_conv_forward_cuda( at::Tensor input, at::Tensor weight, at::Tensor offset, at::Tensor output, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { // todo: resize columns to include im2col: done // todo: add im2col_step as input // todo: add new output buffer and transpose it to output (or directly // transpose output) todo: possibly change data indexing because of // parallel_imgs shape_check( input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); weight = weight.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input.unsqueeze_(0); offset.unsqueeze_(0); } // todo: assert batchsize dividable by im2col_step long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = weight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); output = output.view( {batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < outputHeight * outputWidth) { ones = at::ones({outputHeight, outputWidth}, input.options()); } input = input.view( {batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); offset = offset.view( {batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); at::Tensor output_buffer = at::zeros( {batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth}, output.options()); output_buffer = output_buffer.view( {output_buffer.size(0), group, output_buffer.size(1) / group, output_buffer.size(2), output_buffer.size(3)}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { deformable_im2col( input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, columns); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view( {group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); for (int g = 0; g < group; g++) { output_buffer[elt][g] = output_buffer[elt][g] .flatten(1) .addmm_(weight[g].flatten(1), columns[g]) .view_as(output_buffer[elt][g]); } } output_buffer = output_buffer.view( {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), output_buffer.size(3), output_buffer.size(4)}); output_buffer = output_buffer.view( {batchSize / im2col_step, nOutputPlane, im2col_step, outputHeight, outputWidth}); output_buffer.transpose_(1, 2); output.copy_(output_buffer); output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { output = output.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); } return 1; } int deform_conv_backward_input_cuda( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradInput, at::Tensor gradOffset, at::Tensor weight, at::Tensor columns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { shape_check( input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); gradOutput = gradOutput.contiguous(); weight = weight.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input = input.view({1, input.size(0), input.size(1), input.size(2)}); offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); gradOutput = gradOutput.view( {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); } long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = weight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); // change order of grad output gradOutput = gradOutput.view( {batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); gradOutput.transpose_(1, 2); gradInput = gradInput.view( {batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); input = input.view( {batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); gradOffset = gradOffset.view( {batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); offset = offset.view( {batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { // divide into groups columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view( {group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); gradOutput = gradOutput.view( {gradOutput.size(0), group, gradOutput.size(1) / group, gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); for (int g = 0; g < group; g++) { columns[g] = columns[g].addmm_( weight[g].flatten(1).transpose(0, 1), gradOutput[elt][g].flatten(1), 0.0f, 1.0f); } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); gradOutput = gradOutput.view( {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); deformable_col2im_coord( columns, input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, gradOffset[elt]); deformable_col2im( columns, offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, gradInput[elt]); } gradOutput.transpose_(1, 2); gradOutput = gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); gradOffset = gradOffset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); gradOffset = gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); } return 1; } int deform_conv_backward_parameters_cuda( at::Tensor input, at::Tensor offset, at::Tensor gradOutput, at::Tensor gradWeight, // at::Tensor gradBias, at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, float scale, int im2col_step) { // todo: transpose and reshape outGrad // todo: reshape columns // todo: add im2col_step as input shape_check( input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, padW, dilationH, dilationW, group, deformable_group); input = input.contiguous(); offset = offset.contiguous(); gradOutput = gradOutput.contiguous(); int batch = 1; if (input.ndimension() == 3) { // Force batch batch = 0; input = input.view( at::IntList({1, input.size(0), input.size(1), input.size(2)})); gradOutput = gradOutput.view( {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); } long batchSize = input.size(0); long nInputPlane = input.size(1); long inputHeight = input.size(2); long inputWidth = input.size(3); long nOutputPlane = gradWeight.size(0); long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, input.options()); gradOutput = gradOutput.view( {batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); gradOutput.transpose_(1, 2); at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); gradOutputBuffer = gradOutputBuffer.view( {batchSize / im2col_step, nOutputPlane, im2col_step, outputHeight, outputWidth}); gradOutputBuffer.copy_(gradOutput); // gradOutput is not contiguous, so we do reshape (instead of view) next gradOutputBuffer = gradOutputBuffer.reshape( {batchSize / im2col_step, nOutputPlane, im2col_step * outputHeight, outputWidth}); gradOutput.transpose_(1, 2); gradOutput = gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); input = input.view( {batchSize / im2col_step, im2col_step, nInputPlane, inputHeight, inputWidth}); offset = offset.view( {batchSize / im2col_step, im2col_step, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); for (int elt = 0; elt < batchSize / im2col_step; elt++) { deformable_im2col( input[elt], offset[elt], nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, im2col_step, deformable_group, columns); // divide into group gradOutputBuffer = gradOutputBuffer.view( {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); gradWeight = gradWeight.view( {group, gradWeight.size(0) / group, gradWeight.size(1), gradWeight.size(2), gradWeight.size(3)}); for (int g = 0; g < group; g++) { gradWeight[g] = gradWeight[g] .flatten(1) .addmm_( gradOutputBuffer[elt][g].flatten(1), columns[g].transpose(1, 0), 1.0, scale) .view_as(gradWeight[g]); } gradOutputBuffer = gradOutputBuffer.view( {gradOutputBuffer.size(0), gradOutputBuffer.size(1) * gradOutputBuffer.size(2), gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); gradWeight = gradWeight.view( {gradWeight.size(0) * gradWeight.size(1), gradWeight.size(2), gradWeight.size(3), gradWeight.size(4)}); } input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); offset = offset.view( {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); if (batch == 0) { gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); input = input.view({nInputPlane, inputHeight, inputWidth}); } return 1; } void modulated_deform_conv_cuda_forward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, int kernel_h, int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias) { shape_check( input, offset, NULL, weight, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, deformable_group); TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_out = weight.size(0); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) AT_ERROR( "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); if (channels != channels_kernel * group) AT_ERROR( "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel * group); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; // mask shape check TORCH_CHECK( (mask.size(2) == height_out && mask.size(3) == width_out), "invalid spatial size of mask, expected height: %d width: %d, but " "got height: %d width: %d", height_out, width_out, mask.size(2), mask.size(3)); TORCH_CHECK( (mask.size(1) == deformable_group * kernel_h * kernel_w), "invalid number of channels of mask"); if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < height_out * width_out) { // Resize plane and fill with ones... ones = at::ones({height_out, width_out}, input.options()); } // resize output output = output.view({batch, channels_out, height_out, width_out}).zero_(); // resize temporary columns columns = at::zeros( {channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); output = output.view( {output.size(0), group, output.size(1) / group, output.size(2), output.size(3)}); for (int b = 0; b < batch; b++) { modulated_deformable_im2col_cuda( input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); // divide into group weight = weight.view( {group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); for (int g = 0; g < group; g++) { output[b][g] = output[b][g] .flatten(1) .addmm_(weight[g].flatten(1), columns[g]) .view_as(output[b][g]); } weight = weight.view( {weight.size(0) * weight.size(1), weight.size(2), weight.size(3), weight.size(4)}); columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); } output = output.view( {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); if (with_bias) { output += bias.view({1, bias.size(0), 1, 1}); } } void modulated_deform_conv_cuda_backward( at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, at::Tensor offset, at::Tensor mask, at::Tensor columns, at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias) { shape_check( input, offset, &grad_output, weight, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, deformable_group); TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); const int batch = input.size(0); const int channels = input.size(1); const int height = input.size(2); const int width = input.size(3); const int channels_kernel = weight.size(1); const int kernel_h_ = weight.size(2); const int kernel_w_ = weight.size(3); if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) AT_ERROR( "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); if (channels != channels_kernel * group) AT_ERROR( "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel * group); const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; // mask shape check TORCH_CHECK( (mask.size(2) == height_out && mask.size(3) == width_out), "invalid spatial size of mask, expected height: %d width: %d, but " "got height: %d width: %d", height_out, width_out, mask.size(2), mask.size(3)); TORCH_CHECK( (mask.size(1) == deformable_group * kernel_h * kernel_w), "invalid number of channels of mask"); if (ones.ndimension() != 2 || ones.size(0) * ones.size(1) < height_out * width_out) { // Resize plane and fill with ones... ones = at::ones({height_out, width_out}, input.options()); } grad_input = grad_input.view({batch, channels, height, width}); columns = at::zeros( {channels * kernel_h * kernel_w, height_out * width_out}, input.options()); grad_output = grad_output.view( {grad_output.size(0), group, grad_output.size(1) / group, grad_output.size(2), grad_output.size(3)}); for (int b = 0; b < batch; b++) { // divide int group columns = columns.view({group, columns.size(0) / group, columns.size(1)}); weight = weight.view( {group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); for (int g = 0; g < group; g++) { columns[g].addmm_( weight[g].flatten(1).transpose(0, 1), grad_output[b][g].flatten(1), 0.0f, 1.0f); } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); weight = weight.view( {weight.size(0) * weight.size(1), weight.size(2), weight.size(3), weight.size(4)}); // gradient w.r.t. input coordinate data modulated_deformable_col2im_coord_cuda( columns, input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], grad_mask[b]); // gradient w.r.t. input data modulated_deformable_col2im_cuda( columns, offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, grad_input[b]); // gradient w.r.t. weight, dWeight should accumulate across the batch and // group modulated_deformable_im2col_cuda( input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); columns = columns.view({group, columns.size(0) / group, columns.size(1)}); grad_weight = grad_weight.view( {group, grad_weight.size(0) / group, grad_weight.size(1), grad_weight.size(2), grad_weight.size(3)}); if (with_bias) grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); for (int g = 0; g < group; g++) { grad_weight[g] = grad_weight[g] .flatten(1) .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) .view_as(grad_weight[g]); if (with_bias) { grad_bias[g] = grad_bias[g] .view({-1, 1}) .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) .view(-1); } } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); grad_weight = grad_weight.view( {grad_weight.size(0) * grad_weight.size(1), grad_weight.size(2), grad_weight.size(3), grad_weight.size(4)}); if (with_bias) grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); } grad_output = grad_output.view( {grad_output.size(0) * grad_output.size(1), grad_output.size(2), grad_output.size(3), grad_output.size(4)}); } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. // modified from // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu // Original license: Apache 2.0 // clang-format off // modify from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer ***************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ********************* * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.cuh * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. * \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng */ #include #include #include #include #include #include using namespace at; #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) namespace { const int CUDA_NUM_THREADS = 1024; const int kMaxGridNum = 65535; inline int GET_BLOCKS(const int N) { return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); } } template __device__ scalar_t deformable_im2col_bilinear( const scalar_t* bottom_data, const int data_width, const int height, const int width, scalar_t h, scalar_t w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; scalar_t lh = h - h_low; scalar_t lw = w - w_low; scalar_t hh = 1 - lh, hw = 1 - lw; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = bottom_data[h_low * data_width + w_high]; scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = bottom_data[h_high * data_width + w_high]; scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ scalar_t get_gradient_weight( scalar_t argmax_h, scalar_t argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ scalar_t get_coordinate_weight( scalar_t argmax_h, scalar_t argmax_w, const int height, const int width, const scalar_t* im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void deformable_im2col_gpu_kernel( const int n, const scalar_t* data_im, const scalar_t* data_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, scalar_t* data_col) { CUDA_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; scalar_t* data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; // const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * // height + h_in) * width + w_in; const scalar_t* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const scalar_t* data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; scalar_t val = static_cast(0); const scalar_t h_im = h_in + i * dilation_h + offset_h; const scalar_t w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { // const scalar_t map_h = i * dilation_h + offset_h; // const scalar_t map_w = j * dilation_w + offset_w; // const int cur_height = height - h_in; // const int cur_width = width - w_in; // val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, // cur_width, map_h, map_w); val = deformable_im2col_bilinear( data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val; data_col_ptr += batch_size * height_col * width_col; } } } } template __global__ void deformable_col2im_gpu_kernel( const int n, const scalar_t* data_col, const scalar_t* data_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, scalar_t* grad_im) { CUDA_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const scalar_t* data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; const scalar_t cur_top_grad = data_col[index]; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; scalar_t weight = get_gradient_weight( cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } template __global__ void deformable_col2im_coord_gpu_kernel( const int n, const scalar_t* data_col, const scalar_t* data_im, const scalar_t* data_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, scalar_t* grad_offset) { CUDA_KERNEL_LOOP(index, n) { scalar_t val = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const scalar_t* data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const scalar_t* data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const scalar_t* data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; scalar_t inv_h = h_in + i * dilation_h + offset_h; scalar_t inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } const scalar_t weight = get_coordinate_weight( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos]; cnt += 1; } grad_offset[index] = val; } } namespace detectron2 { void deformable_im2col( const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor data_col) { // num_axes should be smaller than block size // todo: check parallel_imgs is correctly passed in int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height_col * width_col * parallel_imgs; int channel_per_deformable_group = channels / deformable_group; at::cuda::CUDAGuard device_guard(data_im.device()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_im.scalar_type(), "deformable_im2col_gpu", ([&] { const scalar_t* data_im_ = data_im.data_ptr(); const scalar_t* data_offset_ = data_offset.data_ptr(); scalar_t* data_col_ = data_col.data_ptr(); deformable_im2col_gpu_kernel<<< GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>( num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, channels, deformable_group, height_col, width_col, data_col_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); } } void deformable_col2im( const at::Tensor data_col, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_im) { // todo: make sure parallel_imgs is passed in correctly int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; int channel_per_deformable_group = channels / deformable_group; at::cuda::CUDAGuard device_guard(data_col.device()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.scalar_type(), "deformable_col2im_gpu", ([&] { const scalar_t* data_col_ = data_col.data_ptr(); const scalar_t* data_offset_ = data_offset.data_ptr(); scalar_t* grad_im_ = grad_im.data_ptr(); deformable_col2im_gpu_kernel<<< GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>( num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, deformable_group, height_col, width_col, grad_im_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); } } void deformable_col2im_coord( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const int channels, const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) { int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; at::cuda::CUDAGuard device_guard(data_col.device()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] { const scalar_t* data_col_ = data_col.data_ptr(); const scalar_t* data_im_ = data_im.data_ptr(); const scalar_t* data_offset_ = data_offset.data_ptr(); scalar_t* grad_offset_ = grad_offset.data_ptr(); deformable_col2im_coord_gpu_kernel<<< GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>( num_kernels, data_col_, data_im_, data_offset_, channels, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, height_col, width_col, grad_offset_); })); } } // namespace detectron2 template __device__ scalar_t dmcn_im2col_bilinear( const scalar_t* bottom_data, const int data_width, const int height, const int width, scalar_t h, scalar_t w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; scalar_t lh = h - h_low; scalar_t lw = w - w_low; scalar_t hh = 1 - lh, hw = 1 - lw; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = bottom_data[h_low * data_width + w_high]; scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = bottom_data[h_high * data_width + w_low]; scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = bottom_data[h_high * data_width + w_high]; scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ scalar_t dmcn_get_gradient_weight( scalar_t argmax_h, scalar_t argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ scalar_t dmcn_get_coordinate_weight( scalar_t argmax_h, scalar_t argmax_w, const int height, const int width, const scalar_t* im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floor(argmax_h); int argmax_w_low = floor(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; scalar_t weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void modulated_deformable_im2col_gpu_kernel( const int n, const scalar_t* data_im, const scalar_t* data_offset, const scalar_t* data_mask, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, scalar_t* data_col) { CUDA_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; scalar_t* data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; // const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * // height + h_in) * width + w_in; const scalar_t* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const scalar_t* data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t* data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; scalar_t val = static_cast(0); const scalar_t h_im = h_in + i * dilation_h + offset_h; const scalar_t w_im = w_in + j * dilation_w + offset_w; // if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { // const float map_h = i * dilation_h + offset_h; // const float map_w = j * dilation_w + offset_w; // const int cur_height = height - h_in; // const int cur_width = width - w_in; // val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, // cur_width, map_h, map_w); val = dmcn_im2col_bilinear( data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val * mask; data_col_ptr += batch_size * height_col * width_col; // data_col_ptr += height_col * width_col; } } } } template __global__ void modulated_deformable_col2im_gpu_kernel( const int n, const scalar_t* data_col, const scalar_t* data_offset, const scalar_t* data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, scalar_t* grad_im) { CUDA_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const scalar_t* data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t* data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; const scalar_t cur_top_grad = data_col[index] * mask; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; scalar_t weight = dmcn_get_gradient_weight( cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } template __global__ void modulated_deformable_col2im_coord_gpu_kernel( const int n, const scalar_t* data_col, const scalar_t* data_im, const scalar_t* data_offset, const scalar_t* data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, scalar_t* grad_offset, scalar_t* grad_mask) { CUDA_KERNEL_LOOP(index, n) { scalar_t val = 0, mval = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const scalar_t* data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const scalar_t* data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const scalar_t* data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const scalar_t* data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; scalar_t inv_h = h_in + i * dilation_h + offset_h; scalar_t inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear( data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); } const scalar_t weight = dmcn_get_coordinate_weight( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos] * mask; cnt += 1; } // KERNEL_ASSIGN(grad_offset[index], offset_req, val); grad_offset[index] = val; if (offset_c % 2 == 0) // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * // height_col + h) * width_col + w], mask_req, mval); grad_mask [(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; } } namespace detectron2 { void modulated_deformable_im2col_cuda( const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor data_col) { // num_axes should be smaller than block size const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * batch_size * height_col * width_col; at::cuda::CUDAGuard device_guard(data_im.device()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] { const scalar_t* data_im_ = data_im.data_ptr(); const scalar_t* data_offset_ = data_offset.data_ptr(); const scalar_t* data_mask_ = data_mask.data_ptr(); scalar_t* data_col_ = data_col.data_ptr(); modulated_deformable_im2col_gpu_kernel<<< GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>( num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, channels, deformable_group, height_col, width_col, data_col_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf( "error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); } } void modulated_deformable_col2im_cuda( const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_im) { const int channel_per_deformable_group = channels / deformable_group; const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; at::cuda::CUDAGuard device_guard(data_col.device()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] { const scalar_t* data_col_ = data_col.data_ptr(); const scalar_t* data_offset_ = data_offset.data_ptr(); const scalar_t* data_mask_ = data_mask.data_ptr(); scalar_t* grad_im_ = grad_im.data_ptr(); modulated_deformable_col2im_gpu_kernel<<< GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>( num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, deformable_group, height_col, width_col, grad_im_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf( "error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } } void modulated_deformable_col2im_coord_cuda( const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, const int batch_size, const int channels, const int height_im, const int width_im, const int height_col, const int width_col, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int deformable_group, at::Tensor grad_offset, at::Tensor grad_mask) { const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; at::cuda::CUDAGuard device_guard(data_col.device()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] { const scalar_t* data_col_ = data_col.data_ptr(); const scalar_t* data_im_ = data_im.data_ptr(); const scalar_t* data_offset_ = data_offset.data_ptr(); const scalar_t* data_mask_ = data_mask.data_ptr(); scalar_t* grad_offset_ = grad_offset.data_ptr(); scalar_t* grad_mask_ = grad_mask.data_ptr(); modulated_deformable_col2im_coord_gpu_kernel<<< GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>( num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, grad_offset_, grad_mask_); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf( "error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); } } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated.h ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #pragma once #include namespace detectron2 { at::Tensor nms_rotated_cpu( const at::Tensor& dets, const at::Tensor& scores, const double iou_threshold); #if defined(WITH_CUDA) || defined(WITH_HIP) at::Tensor nms_rotated_cuda( const at::Tensor& dets, const at::Tensor& scores, const double iou_threshold); #endif // Interface for Python // inline is needed to prevent multiple function definitions when this header is // included by different cpps inline at::Tensor nms_rotated( const at::Tensor& dets, const at::Tensor& scores, const double iou_threshold) { assert(dets.device().is_cuda() == scores.device().is_cuda()); if (dets.device().is_cuda()) { #if defined(WITH_CUDA) || defined(WITH_HIP) return nms_rotated_cuda( dets.contiguous(), scores.contiguous(), iou_threshold); #else AT_ERROR("Detectron2 is not compiled with GPU support!"); #endif } return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold); } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include "../box_iou_rotated/box_iou_rotated_utils.h" #include "nms_rotated.h" namespace detectron2 { template at::Tensor nms_rotated_cpu_kernel( const at::Tensor& dets, const at::Tensor& scores, const double iou_threshold) { // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel, // however, the code in this function is much shorter because // we delegate the IoU computation for rotated boxes to // the single_box_iou_rotated function in box_iou_rotated_utils.h AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor"); AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor"); AT_ASSERTM( dets.scalar_type() == scores.scalar_type(), "dets should have the same type as scores"); if (dets.numel() == 0) { return at::empty({0}, dets.options().dtype(at::kLong)); } auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto ndets = dets.size(0); at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); auto suppressed = suppressed_t.data_ptr(); auto keep = keep_t.data_ptr(); auto order = order_t.data_ptr(); int64_t num_to_keep = 0; for (int64_t _i = 0; _i < ndets; _i++) { auto i = order[_i]; if (suppressed[i] == 1) { continue; } keep[num_to_keep++] = i; for (int64_t _j = _i + 1; _j < ndets; _j++) { auto j = order[_j]; if (suppressed[j] == 1) { continue; } auto ovr = single_box_iou_rotated( dets[i].data_ptr(), dets[j].data_ptr()); if (ovr >= iou_threshold) { suppressed[j] = 1; } } } return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); } at::Tensor nms_rotated_cpu( // input must be contiguous const at::Tensor& dets, const at::Tensor& scores, const double iou_threshold) { auto result = at::empty({0}, dets.options()); AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] { result = nms_rotated_cpu_kernel(dets, scores, iou_threshold); }); return result; } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include #include #include #include #ifdef WITH_CUDA #include "../box_iou_rotated/box_iou_rotated_utils.h" #endif // TODO avoid this when pytorch supports "same directory" hipification #ifdef WITH_HIP #include "box_iou_rotated/box_iou_rotated_utils.h" #endif using namespace detectron2; namespace { int const threadsPerBlock = sizeof(unsigned long long) * 8; } template __global__ void nms_rotated_cuda_kernel( const int n_boxes, const double iou_threshold, const T* dev_boxes, unsigned long long* dev_mask) { // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 5 values // (x_center, y_center, width, height, angle_degrees) here. __shared__ T block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 5; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_rotated function from box_iou_rotated_utils.h if (single_box_iou_rotated(cur_box, block_boxes + i * 5) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } namespace detectron2 { at::Tensor nms_rotated_cuda( // input must be contiguous const at::Tensor& dets, const at::Tensor& scores, double iou_threshold) { // using scalar_t = float; AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor"); AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor"); at::cuda::CUDAGuard device_guard(dets.device()); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto dets_sorted = dets.index_select(0, order_t); auto dets_num = dets.size(0); const int col_blocks = at::cuda::ATenCeilDiv(static_cast(dets_num), threadsPerBlock); at::Tensor mask = at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong)); dim3 blocks(col_blocks, col_blocks); dim3 threads(threadsPerBlock); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES( dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] { nms_rotated_cuda_kernel<<>>( dets_num, iou_threshold, dets_sorted.data_ptr(), (unsigned long long*)mask.data_ptr()); }); at::Tensor mask_cpu = mask.to(at::kCPU); unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr(); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); at::Tensor keep = at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); int64_t* keep_out = keep.data_ptr(); int num_to_keep = 0; for (int i = 0; i < dets_num; i++) { int nblock = i / threadsPerBlock; int inblock = i % threadsPerBlock; if (!(remv[nblock] & (1ULL << inblock))) { keep_out[num_to_keep++] = i; unsigned long long* p = mask_host + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } } } AT_CUDA_CHECK(cudaGetLastError()); return order_t.index( {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep) .to(order_t.device(), keep.scalar_type())}); } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/csrc/vision.cpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. #include #include "ROIAlignRotated/ROIAlignRotated.h" #include "box_iou_rotated/box_iou_rotated.h" #include "cocoeval/cocoeval.h" #include "deformable/deform_conv.h" #include "nms_rotated/nms_rotated.h" namespace detectron2 { #if defined(WITH_CUDA) || defined(WITH_HIP) extern int get_cudart_version(); #endif std::string get_cuda_version() { #if defined(WITH_CUDA) || defined(WITH_HIP) std::ostringstream oss; #if defined(WITH_CUDA) oss << "CUDA "; #else oss << "HIP "; #endif // copied from // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 auto printCudaStyleVersion = [&](int v) { oss << (v / 1000) << "." << (v / 10 % 100); if (v % 10 != 0) { oss << "." << (v % 10); } }; printCudaStyleVersion(get_cudart_version()); return oss.str(); #else // neither CUDA nor HIP return std::string("not available"); #endif } bool has_cuda() { #if defined(WITH_CUDA) return true; #else return false; #endif } // similar to // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp std::string get_compiler_version() { std::ostringstream ss; #if defined(__GNUC__) #ifndef __clang__ #if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8)) #error "GCC >= 4.9 is required!" #endif { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } #endif #endif #if defined(__clang_major__) { ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; } #endif #if defined(_MSC_VER) { ss << "MSVC " << _MSC_FULL_VER; } #endif return ss.str(); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("get_compiler_version", &get_compiler_version, "get_compiler_version"); m.def("get_cuda_version", &get_cuda_version, "get_cuda_version"); m.def("has_cuda", &has_cuda, "has_cuda"); m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); m.def( "deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input"); m.def( "deform_conv_backward_filter", &deform_conv_backward_filter, "deform_conv_backward_filter"); m.def( "modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward"); m.def( "modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward"); m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate"); m.def( "COCOevalEvaluateImages", &COCOeval::EvaluateImages, "COCOeval::EvaluateImages"); pybind11::class_(m, "InstanceAnnotation") .def(pybind11::init()); pybind11::class_(m, "ImageEvaluation") .def(pybind11::init<>()); } TORCH_LIBRARY(detectron2, m) { m.def("nms_rotated", &nms_rotated); m.def("box_iou_rotated", &box_iou_rotated); m.def("roi_align_rotated_forward", &ROIAlignRotated_forward); m.def("roi_align_rotated_backward", &ROIAlignRotated_backward); } } // namespace detectron2 ================================================ FILE: annotator/oneformer/detectron2/layers/deform_conv.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import math from functools import lru_cache import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from torchvision.ops import deform_conv2d from annotator.oneformer.detectron2.utils.develop import create_dummy_class, create_dummy_func from .wrappers import _NewEmptyTensorOp class _DeformConv(Function): @staticmethod def forward( ctx, input, offset, weight, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, im2col_step=64, ): if input is not None and input.dim() != 4: raise ValueError( "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim()) ) ctx.stride = _pair(stride) ctx.padding = _pair(padding) ctx.dilation = _pair(dilation) ctx.groups = groups ctx.deformable_groups = deformable_groups ctx.im2col_step = im2col_step ctx.save_for_backward(input, offset, weight) output = input.new_empty( _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride) ) ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones if not input.is_cuda: # TODO: let torchvision support full features of our deformconv. if deformable_groups != 1: raise NotImplementedError( "Deformable Conv with deformable_groups != 1 is not supported on CPUs!" ) return deform_conv2d( input, offset, weight, stride=stride, padding=padding, dilation=dilation ) else: cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step) assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize" _C.deform_conv_forward( input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, cur_im2col_step, ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): input, offset, weight = ctx.saved_tensors grad_input = grad_offset = grad_weight = None if not grad_output.is_cuda: raise NotImplementedError("Deformable Conv is not supported on CPUs!") else: cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step) assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize" if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: grad_input = torch.zeros_like(input) grad_offset = torch.zeros_like(offset) _C.deform_conv_backward_input( input, offset, grad_output, grad_input, grad_offset, weight, ctx.bufs_[0], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, cur_im2col_step, ) if ctx.needs_input_grad[2]: grad_weight = torch.zeros_like(weight) _C.deform_conv_backward_filter( input, offset, grad_output, grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], ctx.padding[1], ctx.padding[0], ctx.dilation[1], ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, cur_im2col_step, ) return grad_input, grad_offset, grad_weight, None, None, None, None, None, None @staticmethod def _output_size(input, weight, padding, dilation, stride): channels = weight.size(0) output_size = (input.size(0), channels) for d in range(input.dim() - 2): in_size = input.size(d + 2) pad = padding[d] kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 stride_ = stride[d] output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,) if not all(map(lambda s: s > 0, output_size)): raise ValueError( "convolution input is too small (output would be {})".format( "x".join(map(str, output_size)) ) ) return output_size @staticmethod @lru_cache(maxsize=128) def _cal_im2col_step(input_size, default_size): """ Calculate proper im2col step size, which should be divisible by input_size and not larger than prefer_size. Meanwhile the step size should be as large as possible to be more efficient. So we choose the largest one among all divisors of input_size which are smaller than prefer_size. :param input_size: input batch size . :param default_size: default preferred im2col step size. :return: the largest proper step size. """ if input_size <= default_size: return input_size best_step = 1 for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)): if input_size % step == 0: if input_size // step <= default_size: return input_size // step best_step = step return best_step class _ModulatedDeformConv(Function): @staticmethod def forward( ctx, input, offset, mask, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, ): ctx.stride = stride ctx.padding = padding ctx.dilation = dilation ctx.groups = groups ctx.deformable_groups = deformable_groups ctx.with_bias = bias is not None if not ctx.with_bias: bias = input.new_empty(1) # fake tensor if not input.is_cuda: raise NotImplementedError("Deformable Conv is not supported on CPUs!") if ( weight.requires_grad or mask.requires_grad or offset.requires_grad or input.requires_grad ): ctx.save_for_backward(input, offset, mask, weight, bias) output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight)) ctx._bufs = [input.new_empty(0), input.new_empty(0)] _C.modulated_deform_conv_forward( input, weight, bias, ctx._bufs[0], offset, mask, output, ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, ctx.groups, ctx.deformable_groups, ctx.with_bias, ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): if not grad_output.is_cuda: raise NotImplementedError("Deformable Conv is not supported on CPUs!") input, offset, mask, weight, bias = ctx.saved_tensors grad_input = torch.zeros_like(input) grad_offset = torch.zeros_like(offset) grad_mask = torch.zeros_like(mask) grad_weight = torch.zeros_like(weight) grad_bias = torch.zeros_like(bias) _C.modulated_deform_conv_backward( input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], grad_input, grad_weight, grad_bias, grad_offset, grad_mask, grad_output, weight.shape[2], weight.shape[3], ctx.stride, ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, ctx.groups, ctx.deformable_groups, ctx.with_bias, ) if not ctx.with_bias: grad_bias = None return ( grad_input, grad_offset, grad_mask, grad_weight, grad_bias, None, None, None, None, None, ) @staticmethod def _infer_shape(ctx, input, weight): n = input.size(0) channels_out = weight.size(0) height, width = input.shape[2:4] kernel_h, kernel_w = weight.shape[2:4] height_out = ( height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1) ) // ctx.stride + 1 width_out = ( width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1) ) // ctx.stride + 1 return n, channels_out, height_out, width_out deform_conv = _DeformConv.apply modulated_deform_conv = _ModulatedDeformConv.apply class DeformConv(nn.Module): def __init__( self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=False, norm=None, activation=None, ): """ Deformable convolution from :paper:`deformconv`. Arguments are similar to :class:`Conv2D`. Extra arguments: Args: deformable_groups (int): number of groups used in deformable convolution. norm (nn.Module, optional): a normalization layer activation (callable(Tensor) -> Tensor): a callable activation function """ super(DeformConv, self).__init__() assert not bias assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format( in_channels, groups ) assert ( out_channels % groups == 0 ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups) self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = _pair(stride) self.padding = _pair(padding) self.dilation = _pair(dilation) self.groups = groups self.deformable_groups = deformable_groups self.norm = norm self.activation = activation self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size) ) self.bias = None nn.init.kaiming_uniform_(self.weight, nonlinearity="relu") def forward(self, x, offset): if x.numel() == 0: # When input is empty, we want to return a empty tensor with "correct" shape, # So that the following operations will not panic # if they check for the shape of the tensor. # This computes the height and width of the output tensor output_shape = [ (i + 2 * p - (di * (k - 1) + 1)) // s + 1 for i, p, di, k, s in zip( x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride ) ] output_shape = [x.shape[0], self.weight.shape[0]] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) x = deform_conv( x, offset, self.weight, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups, ) if self.norm is not None: x = self.norm(x) if self.activation is not None: x = self.activation(x) return x def extra_repr(self): tmpstr = "in_channels=" + str(self.in_channels) tmpstr += ", out_channels=" + str(self.out_channels) tmpstr += ", kernel_size=" + str(self.kernel_size) tmpstr += ", stride=" + str(self.stride) tmpstr += ", padding=" + str(self.padding) tmpstr += ", dilation=" + str(self.dilation) tmpstr += ", groups=" + str(self.groups) tmpstr += ", deformable_groups=" + str(self.deformable_groups) tmpstr += ", bias=False" return tmpstr class ModulatedDeformConv(nn.Module): def __init__( self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, norm=None, activation=None, ): """ Modulated deformable convolution from :paper:`deformconv2`. Arguments are similar to :class:`Conv2D`. Extra arguments: Args: deformable_groups (int): number of groups used in deformable convolution. norm (nn.Module, optional): a normalization layer activation (callable(Tensor) -> Tensor): a callable activation function """ super(ModulatedDeformConv, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = _pair(kernel_size) self.stride = stride self.padding = padding self.dilation = dilation self.groups = groups self.deformable_groups = deformable_groups self.with_bias = bias self.norm = norm self.activation = activation self.weight = nn.Parameter( torch.Tensor(out_channels, in_channels // groups, *self.kernel_size) ) if bias: self.bias = nn.Parameter(torch.Tensor(out_channels)) else: self.bias = None nn.init.kaiming_uniform_(self.weight, nonlinearity="relu") if self.bias is not None: nn.init.constant_(self.bias, 0) def forward(self, x, offset, mask): if x.numel() == 0: output_shape = [ (i + 2 * p - (di * (k - 1) + 1)) // s + 1 for i, p, di, k, s in zip( x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride ) ] output_shape = [x.shape[0], self.weight.shape[0]] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) x = modulated_deform_conv( x, offset, mask, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.deformable_groups, ) if self.norm is not None: x = self.norm(x) if self.activation is not None: x = self.activation(x) return x def extra_repr(self): tmpstr = "in_channels=" + str(self.in_channels) tmpstr += ", out_channels=" + str(self.out_channels) tmpstr += ", kernel_size=" + str(self.kernel_size) tmpstr += ", stride=" + str(self.stride) tmpstr += ", padding=" + str(self.padding) tmpstr += ", dilation=" + str(self.dilation) tmpstr += ", groups=" + str(self.groups) tmpstr += ", deformable_groups=" + str(self.deformable_groups) tmpstr += ", bias=" + str(self.with_bias) return tmpstr try: from annotator.oneformer.detectron2 import _C except ImportError: # TODO: register ops natively so there is no need to import _C. _msg = "detectron2 is not compiled successfully, please build following the instructions!" _args = ("detectron2._C", _msg) DeformConv = create_dummy_class("DeformConv", *_args) ModulatedDeformConv = create_dummy_class("ModulatedDeformConv", *_args) deform_conv = create_dummy_func("deform_conv", *_args) modulated_deform_conv = create_dummy_func("modulated_deform_conv", *_args) ================================================ FILE: annotator/oneformer/detectron2/layers/losses.py ================================================ import math import torch def diou_loss( boxes1: torch.Tensor, boxes2: torch.Tensor, reduction: str = "none", eps: float = 1e-7, ) -> torch.Tensor: """ Distance Intersection over Union Loss (Zhaohui Zheng et. al) https://arxiv.org/abs/1911.08287 Args: boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,). reduction: 'none' | 'mean' | 'sum' 'none': No reduction will be applied to the output. 'mean': The output will be averaged. 'sum': The output will be summed. eps (float): small number to prevent division by zero """ x1, y1, x2, y2 = boxes1.unbind(dim=-1) x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) # TODO: use torch._assert_async() when pytorch 1.8 support is dropped assert (x2 >= x1).all(), "bad box: x1 larger than x2" assert (y2 >= y1).all(), "bad box: y1 larger than y2" # Intersection keypoints xkis1 = torch.max(x1, x1g) ykis1 = torch.max(y1, y1g) xkis2 = torch.min(x2, x2g) ykis2 = torch.min(y2, y2g) intsct = torch.zeros_like(x1) mask = (ykis2 > ykis1) & (xkis2 > xkis1) intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps iou = intsct / union # smallest enclosing box xc1 = torch.min(x1, x1g) yc1 = torch.min(y1, y1g) xc2 = torch.max(x2, x2g) yc2 = torch.max(y2, y2g) diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps # centers of boxes x_p = (x2 + x1) / 2 y_p = (y2 + y1) / 2 x_g = (x1g + x2g) / 2 y_g = (y1g + y2g) / 2 distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2) # Eqn. (7) loss = 1 - iou + (distance / diag_len) if reduction == "mean": loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() elif reduction == "sum": loss = loss.sum() return loss def ciou_loss( boxes1: torch.Tensor, boxes2: torch.Tensor, reduction: str = "none", eps: float = 1e-7, ) -> torch.Tensor: """ Complete Intersection over Union Loss (Zhaohui Zheng et. al) https://arxiv.org/abs/1911.08287 Args: boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,). reduction: 'none' | 'mean' | 'sum' 'none': No reduction will be applied to the output. 'mean': The output will be averaged. 'sum': The output will be summed. eps (float): small number to prevent division by zero """ x1, y1, x2, y2 = boxes1.unbind(dim=-1) x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) # TODO: use torch._assert_async() when pytorch 1.8 support is dropped assert (x2 >= x1).all(), "bad box: x1 larger than x2" assert (y2 >= y1).all(), "bad box: y1 larger than y2" # Intersection keypoints xkis1 = torch.max(x1, x1g) ykis1 = torch.max(y1, y1g) xkis2 = torch.min(x2, x2g) ykis2 = torch.min(y2, y2g) intsct = torch.zeros_like(x1) mask = (ykis2 > ykis1) & (xkis2 > xkis1) intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps iou = intsct / union # smallest enclosing box xc1 = torch.min(x1, x1g) yc1 = torch.min(y1, y1g) xc2 = torch.max(x2, x2g) yc2 = torch.max(y2, y2g) diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps # centers of boxes x_p = (x2 + x1) / 2 y_p = (y2 + y1) / 2 x_g = (x1g + x2g) / 2 y_g = (y1g + y2g) / 2 distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2) # width and height of boxes w_pred = x2 - x1 h_pred = y2 - y1 w_gt = x2g - x1g h_gt = y2g - y1g v = (4 / (math.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2) with torch.no_grad(): alpha = v / (1 - iou + v + eps) # Eqn. (10) loss = 1 - iou + (distance / diag_len) + alpha * v if reduction == "mean": loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() elif reduction == "sum": loss = loss.sum() return loss ================================================ FILE: annotator/oneformer/detectron2/layers/mask_ops.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import numpy as np from typing import Tuple import torch from PIL import Image from torch.nn import functional as F __all__ = ["paste_masks_in_image"] BYTES_PER_FLOAT = 4 # TODO: This memory limit may be too much or too little. It would be better to # determine it based on available resources. GPU_MEM_LIMIT = 1024**3 # 1 GB memory limit def _do_paste_mask(masks, boxes, img_h: int, img_w: int, skip_empty: bool = True): """ Args: masks: N, 1, H, W boxes: N, 4 img_h, img_w (int): skip_empty (bool): only paste masks within the region that tightly bound all boxes, and returns the results this region only. An important optimization for CPU. Returns: if skip_empty == False, a mask of shape (N, img_h, img_w) if skip_empty == True, a mask of shape (N, h', w'), and the slice object for the corresponding region. """ # On GPU, paste all masks together (up to chunk size) # by using the entire image to sample the masks # Compared to pasting them one by one, # this has more operations but is faster on COCO-scale dataset. device = masks.device if skip_empty and not torch.jit.is_scripting(): x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to( dtype=torch.int32 ) x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32) y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32) else: x0_int, y0_int = 0, 0 x1_int, y1_int = img_w, img_h x0, y0, x1, y1 = torch.split(boxes, 1, dim=1) # each is Nx1 N = masks.shape[0] img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5 img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5 img_y = (img_y - y0) / (y1 - y0) * 2 - 1 img_x = (img_x - x0) / (x1 - x0) * 2 - 1 # img_x, img_y have shapes (N, w), (N, h) gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1)) gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1)) grid = torch.stack([gx, gy], dim=3) if not torch.jit.is_scripting(): if not masks.dtype.is_floating_point: masks = masks.float() img_masks = F.grid_sample(masks, grid.to(masks.dtype), align_corners=False) if skip_empty and not torch.jit.is_scripting(): return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int)) else: return img_masks[:, 0], () # Annotate boxes as Tensor (but not Boxes) in order to use scripting @torch.jit.script_if_tracing def paste_masks_in_image( masks: torch.Tensor, boxes: torch.Tensor, image_shape: Tuple[int, int], threshold: float = 0.5 ): """ Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image. The location, height, and width for pasting each mask is determined by their corresponding bounding boxes in boxes. Note: This is a complicated but more accurate implementation. In actual deployment, it is often enough to use a faster but less accurate implementation. See :func:`paste_mask_in_image_old` in this file for an alternative implementation. Args: masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of detected object instances in the image and Hmask, Wmask are the mask width and mask height of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1]. boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4). boxes[i] and masks[i] correspond to the same object instance. image_shape (tuple): height, width threshold (float): A threshold in [0, 1] for converting the (soft) masks to binary masks. Returns: img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the number of detected object instances and Himage, Wimage are the image width and height. img_masks[i] is a binary mask for object instance i. """ assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported" N = len(masks) if N == 0: return masks.new_empty((0,) + image_shape, dtype=torch.uint8) if not isinstance(boxes, torch.Tensor): boxes = boxes.tensor device = boxes.device assert len(boxes) == N, boxes.shape img_h, img_w = image_shape # The actual implementation split the input into chunks, # and paste them chunk by chunk. if device.type == "cpu" or torch.jit.is_scripting(): # CPU is most efficient when they are pasted one by one with skip_empty=True # so that it performs minimal number of operations. num_chunks = N else: # GPU benefits from parallelism for larger chunks, but may have memory issue # int(img_h) because shape may be tensors in tracing num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT)) assert ( num_chunks <= N ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it" chunks = torch.chunk(torch.arange(N, device=device), num_chunks) img_masks = torch.zeros( N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8 ) for inds in chunks: masks_chunk, spatial_inds = _do_paste_mask( masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu" ) if threshold >= 0: masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool) else: # for visualization and debugging masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8) if torch.jit.is_scripting(): # Scripting does not use the optimized codepath img_masks[inds] = masks_chunk else: img_masks[(inds,) + spatial_inds] = masks_chunk return img_masks # The below are the original paste function (from Detectron1) which has # larger quantization error. # It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample. def paste_mask_in_image_old(mask, box, img_h, img_w, threshold): """ Paste a single mask in an image. This is a per-box implementation of :func:`paste_masks_in_image`. This function has larger quantization error due to incorrect pixel modeling and is not used any more. Args: mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single object instance. Values are in [0, 1]. box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners of the object instance. img_h, img_w (int): Image height and width. threshold (float): Mask binarization threshold in [0, 1]. Returns: im_mask (Tensor): The resized and binarized object mask pasted into the original image plane (a tensor of shape (img_h, img_w)). """ # Conversion from continuous box coordinates to discrete pixel coordinates # via truncation (cast to int32). This determines which pixels to paste the # mask onto. box = box.to(dtype=torch.int32) # Continuous to discrete coordinate conversion # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1 # pixels (not x1 - x0 pixels). samples_w = box[2] - box[0] + 1 # Number of pixel samples, *not* geometric width samples_h = box[3] - box[1] + 1 # Number of pixel samples, *not* geometric height # Resample the mask from it's original grid to the new samples_w x samples_h grid mask = Image.fromarray(mask.cpu().numpy()) mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR) mask = np.array(mask, copy=False) if threshold >= 0: mask = np.array(mask > threshold, dtype=np.uint8) mask = torch.from_numpy(mask) else: # for visualization and debugging, we also # allow it to return an unmodified mask mask = torch.from_numpy(mask * 255).to(torch.uint8) im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8) x_0 = max(box[0], 0) x_1 = min(box[2] + 1, img_w) y_0 = max(box[1], 0) y_1 = min(box[3] + 1, img_h) im_mask[y_0:y_1, x_0:x_1] = mask[ (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) ] return im_mask # Our pixel modeling requires extrapolation for any continuous # coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks, # we would like this extrapolation to be an interpolation between boundary values and zero, # instead of using absolute zero or boundary values. # Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this: # masks, scale = pad_masks(masks[:, 0, :, :], 1) # boxes = scale_boxes(boxes.tensor, scale) def pad_masks(masks, padding): """ Args: masks (tensor): A tensor of shape (B, M, M) representing B masks. padding (int): Number of cells to pad on all sides. Returns: The padded masks and the scale factor of the padding size / original size. """ B = masks.shape[0] M = masks.shape[-1] pad2 = 2 * padding scale = float(M + pad2) / M padded_masks = masks.new_zeros((B, M + pad2, M + pad2)) padded_masks[:, padding:-padding, padding:-padding] = masks return padded_masks, scale def scale_boxes(boxes, scale): """ Args: boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4 coords representing the corners x0, y0, x1, y1, scale (float): The box scaling factor. Returns: Scaled boxes. """ w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5 h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5 x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5 y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5 w_half *= scale h_half *= scale scaled_boxes = torch.zeros_like(boxes) scaled_boxes[:, 0] = x_c - w_half scaled_boxes[:, 2] = x_c + w_half scaled_boxes[:, 1] = y_c - h_half scaled_boxes[:, 3] = y_c + h_half return scaled_boxes @torch.jit.script_if_tracing def _paste_masks_tensor_shape( masks: torch.Tensor, boxes: torch.Tensor, image_shape: Tuple[torch.Tensor, torch.Tensor], threshold: float = 0.5, ): """ A wrapper of paste_masks_in_image where image_shape is Tensor. During tracing, shapes might be tensors instead of ints. The Tensor->int conversion should be scripted rather than traced. """ return paste_masks_in_image(masks, boxes, (int(image_shape[0]), int(image_shape[1])), threshold) ================================================ FILE: annotator/oneformer/detectron2/layers/nms.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import torch from torchvision.ops import boxes as box_ops from torchvision.ops import nms # noqa . for compatibility def batched_nms( boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float ): """ Same as torchvision.ops.boxes.batched_nms, but with float(). """ assert boxes.shape[-1] == 4 # Note: Torchvision already has a strategy (https://github.com/pytorch/vision/issues/1311) # to decide whether to use coordinate trick or for loop to implement batched_nms. So we # just call it directly. # Fp16 does not have enough range for batched NMS, so adding float(). return box_ops.batched_nms(boxes.float(), scores, idxs, iou_threshold) # Note: this function (nms_rotated) might be moved into # torchvision/ops/boxes.py in the future def nms_rotated(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float): """ Performs non-maximum suppression (NMS) on the rotated boxes according to their intersection-over-union (IoU). Rotated NMS iteratively removes lower scoring rotated boxes which have an IoU greater than iou_threshold with another (higher scoring) rotated box. Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they can be representing completely different objects in certain tasks, e.g., OCR. As for the question of whether rotated-NMS should treat them as faraway boxes even though their IOU is 1, it depends on the application and/or ground truth annotation. As an extreme example, consider a single character v and the square box around it. If the angle is 0 degree, the object (text) would be read as 'v'; If the angle is 90 degrees, the object (text) would become '>'; If the angle is 180 degrees, the object (text) would become '^'; If the angle is 270/-90 degrees, the object (text) would become '<' All of these cases have IoU of 1 to each other, and rotated NMS that only uses IoU as criterion would only keep one of them with the highest score - which, practically, still makes sense in most cases because typically only one of theses orientations is the correct one. Also, it does not matter as much if the box is only used to classify the object (instead of transcribing them with a sequential OCR recognition model) later. On the other hand, when we use IoU to filter proposals that are close to the ground truth during training, we should definitely take the angle into account if we know the ground truth is labeled with the strictly correct orientation (as in, upside-down words are annotated with -180 degrees even though they can be covered with a 0/90/-90 degree box, etc.) The way the original dataset is annotated also matters. For example, if the dataset is a 4-point polygon dataset that does not enforce ordering of vertices/orientation, we can estimate a minimum rotated bounding box to this polygon, but there's no way we can tell the correct angle with 100% confidence (as shown above, there could be 4 different rotated boxes, with angles differed by 90 degrees to each other, covering the exactly same region). In that case we have to just use IoU to determine the box proximity (as many detection benchmarks (even for text) do) unless there're other assumptions we can make (like width is always larger than height, or the object is not rotated by more than 90 degrees CCW/CW, etc.) In summary, not considering angles in rotated NMS seems to be a good option for now, but we should be aware of its implications. Args: boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in (x_center, y_center, width, height, angle_degrees) format. scores (Tensor[N]): Scores for each one of the rotated boxes iou_threshold (float): Discards all overlapping rotated boxes with IoU < iou_threshold Returns: keep (Tensor): int64 tensor with the indices of the elements that have been kept by Rotated NMS, sorted in decreasing order of scores """ return torch.ops.detectron2.nms_rotated(boxes, scores, iou_threshold) # Note: this function (batched_nms_rotated) might be moved into # torchvision/ops/boxes.py in the future @torch.jit.script_if_tracing def batched_nms_rotated( boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float ): """ Performs non-maximum suppression in a batched fashion. Each index value correspond to a category, and NMS will not be applied between elements of different categories. Args: boxes (Tensor[N, 5]): boxes where NMS will be performed. They are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format scores (Tensor[N]): scores for each one of the boxes idxs (Tensor[N]): indices of the categories for each one of the boxes. iou_threshold (float): discards all overlapping boxes with IoU < iou_threshold Returns: Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted in decreasing order of scores """ assert boxes.shape[-1] == 5 if boxes.numel() == 0: return torch.empty((0,), dtype=torch.int64, device=boxes.device) boxes = boxes.float() # fp16 does not have enough range for batched NMS # Strategy: in order to perform NMS independently per class, # we add an offset to all the boxes. The offset is dependent # only on the class idx, and is large enough so that boxes # from different classes do not overlap # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate, # which won't handle negative coordinates correctly. # Here by using min_coordinate we can make sure the negative coordinates are # correctly handled. max_coordinate = ( torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2 ).max() min_coordinate = ( torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2 ).min() offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1) boxes_for_nms = boxes.clone() # avoid modifying the original values in boxes boxes_for_nms[:, :2] += offsets[:, None] keep = nms_rotated(boxes_for_nms, scores, iou_threshold) return keep ================================================ FILE: annotator/oneformer/detectron2/layers/roi_align.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from torch import nn from torchvision.ops import roi_align # NOTE: torchvision's RoIAlign has a different default aligned=False class ROIAlign(nn.Module): def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True): """ Args: output_size (tuple): h, w spatial_scale (float): scale the input boxes by this number sampling_ratio (int): number of inputs samples to take for each output sample. 0 to take samples densely. aligned (bool): if False, use the legacy implementation in Detectron. If True, align the results more perfectly. Note: The meaning of aligned=True: Given a continuous coordinate c, its two neighboring pixel indices (in our pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled from the underlying signal at continuous coordinates 0.5 and 1.5). But the original roi_align (aligned=False) does not subtract the 0.5 when computing neighboring pixel indices and therefore it uses pixels with a slightly incorrect alignment (relative to our pixel model) when performing bilinear interpolation. With `aligned=True`, we first appropriately scale the ROI and then shift it by -0.5 prior to calling roi_align. This produces the correct neighbors; see detectron2/tests/test_roi_align.py for verification. The difference does not make a difference to the model's performance if ROIAlign is used together with conv layers. """ super().__init__() self.output_size = output_size self.spatial_scale = spatial_scale self.sampling_ratio = sampling_ratio self.aligned = aligned from torchvision import __version__ version = tuple(int(x) for x in __version__.split(".")[:2]) # https://github.com/pytorch/vision/pull/2438 assert version >= (0, 7), "Require torchvision >= 0.7" def forward(self, input, rois): """ Args: input: NCHW images rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy. """ assert rois.dim() == 2 and rois.size(1) == 5 if input.is_quantized: input = input.dequantize() return roi_align( input, rois.to(dtype=input.dtype), self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned, ) def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "output_size=" + str(self.output_size) tmpstr += ", spatial_scale=" + str(self.spatial_scale) tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) tmpstr += ", aligned=" + str(self.aligned) tmpstr += ")" return tmpstr ================================================ FILE: annotator/oneformer/detectron2/layers/roi_align_rotated.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair class _ROIAlignRotated(Function): @staticmethod def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): ctx.save_for_backward(roi) ctx.output_size = _pair(output_size) ctx.spatial_scale = spatial_scale ctx.sampling_ratio = sampling_ratio ctx.input_shape = input.size() output = torch.ops.detectron2.roi_align_rotated_forward( input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): (rois,) = ctx.saved_tensors output_size = ctx.output_size spatial_scale = ctx.spatial_scale sampling_ratio = ctx.sampling_ratio bs, ch, h, w = ctx.input_shape grad_input = torch.ops.detectron2.roi_align_rotated_backward( grad_output, rois, spatial_scale, output_size[0], output_size[1], bs, ch, h, w, sampling_ratio, ) return grad_input, None, None, None, None, None roi_align_rotated = _ROIAlignRotated.apply class ROIAlignRotated(nn.Module): def __init__(self, output_size, spatial_scale, sampling_ratio): """ Args: output_size (tuple): h, w spatial_scale (float): scale the input boxes by this number sampling_ratio (int): number of inputs samples to take for each output sample. 0 to take samples densely. Note: ROIAlignRotated supports continuous coordinate by default: Given a continuous coordinate c, its two neighboring pixel indices (in our pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled from the underlying signal at continuous coordinates 0.5 and 1.5). """ super(ROIAlignRotated, self).__init__() self.output_size = output_size self.spatial_scale = spatial_scale self.sampling_ratio = sampling_ratio def forward(self, input, rois): """ Args: input: NCHW images rois: Bx6 boxes. First column is the index into N. The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees). """ assert rois.dim() == 2 and rois.size(1) == 6 orig_dtype = input.dtype if orig_dtype == torch.float16: input = input.float() rois = rois.float() output_size = _pair(self.output_size) # Scripting for Autograd is currently unsupported. # This is a quick fix without having to rewrite code on the C++ side if torch.jit.is_scripting() or torch.jit.is_tracing(): return torch.ops.detectron2.roi_align_rotated_forward( input, rois, self.spatial_scale, output_size[0], output_size[1], self.sampling_ratio ).to(dtype=orig_dtype) return roi_align_rotated( input, rois, self.output_size, self.spatial_scale, self.sampling_ratio ).to(dtype=orig_dtype) def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "output_size=" + str(self.output_size) tmpstr += ", spatial_scale=" + str(self.spatial_scale) tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) tmpstr += ")" return tmpstr ================================================ FILE: annotator/oneformer/detectron2/layers/rotated_boxes.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from __future__ import absolute_import, division, print_function, unicode_literals import torch def pairwise_iou_rotated(boxes1, boxes2): """ Return intersection-over-union (Jaccard index) of boxes. Both sets of boxes are expected to be in (x_center, y_center, width, height, angle) format. Arguments: boxes1 (Tensor[N, 5]) boxes2 (Tensor[M, 5]) Returns: iou (Tensor[N, M]): the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2 """ return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2) ================================================ FILE: annotator/oneformer/detectron2/layers/shape_spec.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. from dataclasses import dataclass from typing import Optional @dataclass class ShapeSpec: """ A simple structure that contains basic shape specification about a tensor. It is often used as the auxiliary inputs/outputs of models, to complement the lack of shape inference ability among pytorch modules. """ channels: Optional[int] = None height: Optional[int] = None width: Optional[int] = None stride: Optional[int] = None ================================================ FILE: annotator/oneformer/detectron2/layers/wrappers.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. """ Wrappers around on some nn functions, mainly to support empty tensors. Ideally, add support directly in PyTorch to empty tensors in those functions. These can be removed once https://github.com/pytorch/pytorch/issues/12013 is implemented """ import warnings from typing import List, Optional import torch from torch.nn import functional as F from annotator.oneformer.detectron2.utils.env import TORCH_VERSION def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor: """ Turn a list of integer scalars or integer Tensor scalars into a vector, in a way that's both traceable and scriptable. In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs. In scripting or eager, `x` should be a list of int. """ if torch.jit.is_scripting(): return torch.as_tensor(x, device=device) if torch.jit.is_tracing(): assert all( [isinstance(t, torch.Tensor) for t in x] ), "Shape should be tensor during tracing!" # as_tensor should not be used in tracing because it records a constant ret = torch.stack(x) if ret.device != device: # avoid recording a hard-coded device if not necessary ret = ret.to(device=device) return ret return torch.as_tensor(x, device=device) def check_if_dynamo_compiling(): if TORCH_VERSION >= (1, 14): from torch._dynamo import is_compiling return is_compiling() else: return False def cat(tensors: List[torch.Tensor], dim: int = 0): """ Efficient version of torch.cat that avoids a copy if there is only a single element in a list """ assert isinstance(tensors, (list, tuple)) if len(tensors) == 1: return tensors[0] return torch.cat(tensors, dim) def empty_input_loss_func_wrapper(loss_func): def wrapped_loss_func(input, target, *, reduction="mean", **kwargs): """ Same as `loss_func`, but returns 0 (instead of nan) for empty inputs. """ if target.numel() == 0 and reduction == "mean": return input.sum() * 0.0 # connect the gradient return loss_func(input, target, reduction=reduction, **kwargs) return wrapped_loss_func cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy) class _NewEmptyTensorOp(torch.autograd.Function): @staticmethod def forward(ctx, x, new_shape): ctx.shape = x.shape return x.new_empty(new_shape) @staticmethod def backward(ctx, grad): shape = ctx.shape return _NewEmptyTensorOp.apply(grad, shape), None class Conv2d(torch.nn.Conv2d): """ A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. """ def __init__(self, *args, **kwargs): """ Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: Args: norm (nn.Module, optional): a normalization layer activation (callable(Tensor) -> Tensor): a callable activation function It assumes that norm layer is used before activation. """ norm = kwargs.pop("norm", None) activation = kwargs.pop("activation", None) super().__init__(*args, **kwargs) self.norm = norm self.activation = activation def forward(self, x): # torchscript does not support SyncBatchNorm yet # https://github.com/pytorch/pytorch/issues/40507 # and we skip these codes in torchscript since: # 1. currently we only support torchscript in evaluation mode # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or # later version, `Conv2d` in these PyTorch versions has already supported empty inputs. if not torch.jit.is_scripting(): # Dynamo doesn't support context managers yet is_dynamo_compiling = check_if_dynamo_compiling() if not is_dynamo_compiling: with warnings.catch_warnings(record=True): if x.numel() == 0 and self.training: # https://github.com/pytorch/pytorch/issues/12013 assert not isinstance( self.norm, torch.nn.SyncBatchNorm ), "SyncBatchNorm does not support empty inputs!" x = F.conv2d( x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups ) if self.norm is not None: x = self.norm(x) if self.activation is not None: x = self.activation(x) return x ConvTranspose2d = torch.nn.ConvTranspose2d BatchNorm2d = torch.nn.BatchNorm2d interpolate = F.interpolate Linear = torch.nn.Linear def nonzero_tuple(x): """ A 'as_tuple=True' version of torch.nonzero to support torchscript. because of https://github.com/pytorch/pytorch/issues/38718 """ if torch.jit.is_scripting(): if x.dim() == 0: return x.unsqueeze(0).nonzero().unbind(1) return x.nonzero().unbind(1) else: return x.nonzero(as_tuple=True) @torch.jit.script_if_tracing def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor: """ Tracing friendly way to cast tensor to another tensor's device. Device will be treated as constant during tracing, scripting the casting process as whole can workaround this issue. """ return src.to(dst.device) ================================================ FILE: annotator/oneformer/detectron2/model_zoo/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. """ Model Zoo API for Detectron2: a collection of functions to create common model architectures listed in `MODEL_ZOO.md `_, and optionally load their pre-trained weights. """ from .model_zoo import get, get_config_file, get_checkpoint_url, get_config __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"] ================================================ FILE: annotator/oneformer/detectron2/model_zoo/model_zoo.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import os from typing import Optional import pkg_resources import torch from annotator.oneformer.detectron2.checkpoint import DetectionCheckpointer from annotator.oneformer.detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate from annotator.oneformer.detectron2.modeling import build_model class _ModelZooUrls(object): """ Mapping from names to officially released Detectron2 pre-trained models. """ S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl CONFIG_PATH_TO_URL_SUFFIX = { # COCO Detection with Faster R-CNN "COCO-Detection/faster_rcnn_R_50_C4_1x": "137257644/model_final_721ade.pkl", "COCO-Detection/faster_rcnn_R_50_DC5_1x": "137847829/model_final_51d356.pkl", "COCO-Detection/faster_rcnn_R_50_FPN_1x": "137257794/model_final_b275ba.pkl", "COCO-Detection/faster_rcnn_R_50_C4_3x": "137849393/model_final_f97cb7.pkl", "COCO-Detection/faster_rcnn_R_50_DC5_3x": "137849425/model_final_68d202.pkl", "COCO-Detection/faster_rcnn_R_50_FPN_3x": "137849458/model_final_280758.pkl", "COCO-Detection/faster_rcnn_R_101_C4_3x": "138204752/model_final_298dad.pkl", "COCO-Detection/faster_rcnn_R_101_DC5_3x": "138204841/model_final_3e0943.pkl", "COCO-Detection/faster_rcnn_R_101_FPN_3x": "137851257/model_final_f6e8b1.pkl", "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x": "139173657/model_final_68b088.pkl", # COCO Detection with RetinaNet "COCO-Detection/retinanet_R_50_FPN_1x": "190397773/model_final_bfca0b.pkl", "COCO-Detection/retinanet_R_50_FPN_3x": "190397829/model_final_5bd44e.pkl", "COCO-Detection/retinanet_R_101_FPN_3x": "190397697/model_final_971ab9.pkl", # COCO Detection with RPN and Fast R-CNN "COCO-Detection/rpn_R_50_C4_1x": "137258005/model_final_450694.pkl", "COCO-Detection/rpn_R_50_FPN_1x": "137258492/model_final_02ce48.pkl", "COCO-Detection/fast_rcnn_R_50_FPN_1x": "137635226/model_final_e5f7ce.pkl", # COCO Instance Segmentation Baselines with Mask R-CNN "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x": "137259246/model_final_9243eb.pkl", "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x": "137260150/model_final_4f86c3.pkl", "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "137260431/model_final_a54504.pkl", "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x": "137849525/model_final_4ce675.pkl", "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x": "137849551/model_final_84107b.pkl", "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x": "137849600/model_final_f10217.pkl", "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x": "138363239/model_final_a2914c.pkl", "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x": "138363294/model_final_0464b7.pkl", "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x": "138205316/model_final_a3ec72.pkl", "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x": "139653917/model_final_2d9806.pkl", # noqa # New baselines using Large-Scale Jitter and Longer Training Schedule "new_baselines/mask_rcnn_R_50_FPN_100ep_LSJ": "42047764/model_final_bb69de.pkl", "new_baselines/mask_rcnn_R_50_FPN_200ep_LSJ": "42047638/model_final_89a8d3.pkl", "new_baselines/mask_rcnn_R_50_FPN_400ep_LSJ": "42019571/model_final_14d201.pkl", "new_baselines/mask_rcnn_R_101_FPN_100ep_LSJ": "42025812/model_final_4f7b58.pkl", "new_baselines/mask_rcnn_R_101_FPN_200ep_LSJ": "42131867/model_final_0bb7ae.pkl", "new_baselines/mask_rcnn_R_101_FPN_400ep_LSJ": "42073830/model_final_f96b26.pkl", "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_100ep_LSJ": "42047771/model_final_b7fbab.pkl", # noqa "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_200ep_LSJ": "42132721/model_final_5d87c1.pkl", # noqa "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_400ep_LSJ": "42025447/model_final_f1362d.pkl", # noqa "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_100ep_LSJ": "42047784/model_final_6ba57e.pkl", # noqa "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_200ep_LSJ": "42047642/model_final_27b9c1.pkl", # noqa "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ": "42045954/model_final_ef3a80.pkl", # noqa # COCO Person Keypoint Detection Baselines with Keypoint R-CNN "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x": "137261548/model_final_04e291.pkl", "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x": "137849621/model_final_a6e10b.pkl", "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x": "138363331/model_final_997cc7.pkl", "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x": "139686956/model_final_5ad38f.pkl", # COCO Panoptic Segmentation Baselines with Panoptic FPN "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x": "139514544/model_final_dbfeb4.pkl", "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x": "139514569/model_final_c10459.pkl", "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x": "139514519/model_final_cafdb1.pkl", # LVIS Instance Segmentation Baselines with Mask R-CNN "LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "144219072/model_final_571f7c.pkl", # noqa "LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x": "144219035/model_final_824ab5.pkl", # noqa "LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x": "144219108/model_final_5e3439.pkl", # noqa # Cityscapes & Pascal VOC Baselines "Cityscapes/mask_rcnn_R_50_FPN": "142423278/model_final_af9cf5.pkl", "PascalVOC-Detection/faster_rcnn_R_50_C4": "142202221/model_final_b1acc2.pkl", # Other Settings "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5": "138602867/model_final_65c703.pkl", "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5": "144998336/model_final_821d0b.pkl", "Misc/cascade_mask_rcnn_R_50_FPN_1x": "138602847/model_final_e9d89b.pkl", "Misc/cascade_mask_rcnn_R_50_FPN_3x": "144998488/model_final_480dd8.pkl", "Misc/mask_rcnn_R_50_FPN_3x_syncbn": "169527823/model_final_3b3c51.pkl", "Misc/mask_rcnn_R_50_FPN_3x_gn": "138602888/model_final_dc5d9e.pkl", "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn": "138602908/model_final_01ca85.pkl", "Misc/scratch_mask_rcnn_R_50_FPN_9x_gn": "183808979/model_final_da7b4c.pkl", "Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn": "184226666/model_final_5ce33e.pkl", "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x": "139797668/model_final_be35db.pkl", "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv": "18131413/model_0039999_e76410.pkl", # noqa # D1 Comparisons "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x": "137781054/model_final_7ab50c.pkl", # noqa "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x": "137781281/model_final_62ca52.pkl", # noqa "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x": "137781195/model_final_cce136.pkl", } @staticmethod def query(config_path: str) -> Optional[str]: """ Args: config_path: relative config filename """ name = config_path.replace(".yaml", "").replace(".py", "") if name in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX: suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[name] return _ModelZooUrls.S3_PREFIX + name + "/" + suffix return None def get_checkpoint_url(config_path): """ Returns the URL to the model trained using the given config Args: config_path (str): config file name relative to detectron2's "configs/" directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" Returns: str: a URL to the model """ url = _ModelZooUrls.query(config_path) if url is None: raise RuntimeError("Pretrained model for {} is not available!".format(config_path)) return url def get_config_file(config_path): """ Returns path to a builtin config file. Args: config_path (str): config file name relative to detectron2's "configs/" directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" Returns: str: the real path to the config file. """ cfg_file = pkg_resources.resource_filename( "detectron2.model_zoo", os.path.join("configs", config_path) ) if not os.path.exists(cfg_file): raise RuntimeError("{} not available in Model Zoo!".format(config_path)) return cfg_file def get_config(config_path, trained: bool = False): """ Returns a config object for a model in model zoo. Args: config_path (str): config file name relative to detectron2's "configs/" directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" trained (bool): If True, will set ``MODEL.WEIGHTS`` to trained model zoo weights. If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used instead; this will typically (though not always) initialize a subset of weights using an ImageNet pre-trained model, while randomly initializing the other weights. Returns: CfgNode or omegaconf.DictConfig: a config object """ cfg_file = get_config_file(config_path) if cfg_file.endswith(".yaml"): cfg = get_cfg() cfg.merge_from_file(cfg_file) if trained: cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path) return cfg elif cfg_file.endswith(".py"): cfg = LazyConfig.load(cfg_file) if trained: url = get_checkpoint_url(config_path) if "train" in cfg and "init_checkpoint" in cfg.train: cfg.train.init_checkpoint = url else: raise NotImplementedError return cfg def get(config_path, trained: bool = False, device: Optional[str] = None): """ Get a model specified by relative path under Detectron2's official ``configs/`` directory. Args: config_path (str): config file name relative to detectron2's "configs/" directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" trained (bool): see :func:`get_config`. device (str or None): overwrite the device in config, if given. Returns: nn.Module: a detectron2 model. Will be in training mode. Example: :: from annotator.oneformer.detectron2 import model_zoo model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True) """ cfg = get_config(config_path, trained) if device is None and not torch.cuda.is_available(): device = "cpu" if device is not None and isinstance(cfg, CfgNode): cfg.MODEL.DEVICE = device if isinstance(cfg, CfgNode): model = build_model(cfg) DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) else: model = instantiate(cfg.model) if device is not None: model = model.to(device) if "train" in cfg and "init_checkpoint" in cfg.train: DetectionCheckpointer(model).load(cfg.train.init_checkpoint) return model ================================================ FILE: annotator/oneformer/detectron2/modeling/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from annotator.oneformer.detectron2.layers import ShapeSpec from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY from .backbone import ( BACKBONE_REGISTRY, FPN, Backbone, ResNet, ResNetBlockBase, build_backbone, build_resnet_backbone, make_stage, ViT, SimpleFeaturePyramid, get_vit_lr_decay_rate, MViT, SwinTransformer, ) from .meta_arch import ( META_ARCH_REGISTRY, SEM_SEG_HEADS_REGISTRY, GeneralizedRCNN, PanopticFPN, ProposalNetwork, RetinaNet, SemanticSegmentor, build_model, build_sem_seg_head, FCOS, ) from .postprocessing import detector_postprocess from .proposal_generator import ( PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator, RPN_HEAD_REGISTRY, build_rpn_head, ) from .roi_heads import ( ROI_BOX_HEAD_REGISTRY, ROI_HEADS_REGISTRY, ROI_KEYPOINT_HEAD_REGISTRY, ROI_MASK_HEAD_REGISTRY, ROIHeads, StandardROIHeads, BaseMaskRCNNHead, BaseKeypointRCNNHead, FastRCNNOutputLayers, build_box_head, build_keypoint_head, build_mask_head, build_roi_heads, ) from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA from .mmdet_wrapper import MMDetBackbone, MMDetDetector _EXCLUDE = {"ShapeSpec"} __all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")] from annotator.oneformer.detectron2.utils.env import fixup_module_metadata fixup_module_metadata(__name__, globals(), __all__) del fixup_module_metadata ================================================ FILE: annotator/oneformer/detectron2/modeling/anchor_generator.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import collections import math from typing import List import torch from torch import nn from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import ShapeSpec, move_device_like from annotator.oneformer.detectron2.structures import Boxes, RotatedBoxes from annotator.oneformer.detectron2.utils.registry import Registry ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR") ANCHOR_GENERATOR_REGISTRY.__doc__ = """ Registry for modules that creates object detection anchors for feature maps. The registered object will be called with `obj(cfg, input_shape)`. """ class BufferList(nn.Module): """ Similar to nn.ParameterList, but for buffers """ def __init__(self, buffers): super().__init__() for i, buffer in enumerate(buffers): # Use non-persistent buffer so the values are not saved in checkpoint self.register_buffer(str(i), buffer, persistent=False) def __len__(self): return len(self._buffers) def __iter__(self): return iter(self._buffers.values()) def _create_grid_offsets( size: List[int], stride: int, offset: float, target_device_tensor: torch.Tensor ): grid_height, grid_width = size shifts_x = move_device_like( torch.arange(offset * stride, grid_width * stride, step=stride, dtype=torch.float32), target_device_tensor, ) shifts_y = move_device_like( torch.arange(offset * stride, grid_height * stride, step=stride, dtype=torch.float32), target_device_tensor, ) shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) shift_x = shift_x.reshape(-1) shift_y = shift_y.reshape(-1) return shift_x, shift_y def _broadcast_params(params, num_features, name): """ If one size (or aspect ratio) is specified and there are multiple feature maps, we "broadcast" anchors of that single size (or aspect ratio) over all feature maps. If params is list[float], or list[list[float]] with len(params) == 1, repeat it num_features time. Returns: list[list[float]]: param for each feature """ assert isinstance( params, collections.abc.Sequence ), f"{name} in anchor generator has to be a list! Got {params}." assert len(params), f"{name} in anchor generator cannot be empty!" if not isinstance(params[0], collections.abc.Sequence): # params is list[float] return [params] * num_features if len(params) == 1: return list(params) * num_features assert len(params) == num_features, ( f"Got {name} of length {len(params)} in anchor generator, " f"but the number of input features is {num_features}!" ) return params @ANCHOR_GENERATOR_REGISTRY.register() class DefaultAnchorGenerator(nn.Module): """ Compute anchors in the standard ways described in "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks". """ box_dim: torch.jit.Final[int] = 4 """ the dimension of each anchor box. """ @configurable def __init__(self, *, sizes, aspect_ratios, strides, offset=0.5): """ This interface is experimental. Args: sizes (list[list[float]] or list[float]): If ``sizes`` is list[list[float]], ``sizes[i]`` is the list of anchor sizes (i.e. sqrt of anchor area) to use for the i-th feature map. If ``sizes`` is list[float], ``sizes`` is used for all feature maps. Anchor sizes are given in absolute lengths in units of the input image; they do not dynamically scale if the input image size changes. aspect_ratios (list[list[float]] or list[float]): list of aspect ratios (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies. strides (list[int]): stride of each input feature. offset (float): Relative offset between the center of the first anchor and the top-left corner of the image. Value has to be in [0, 1). Recommend to use 0.5, which means half stride. """ super().__init__() self.strides = strides self.num_features = len(self.strides) sizes = _broadcast_params(sizes, self.num_features, "sizes") aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios") self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios) self.offset = offset assert 0.0 <= self.offset < 1.0, self.offset @classmethod def from_config(cls, cfg, input_shape: List[ShapeSpec]): return { "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES, "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS, "strides": [x.stride for x in input_shape], "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET, } def _calculate_anchors(self, sizes, aspect_ratios): cell_anchors = [ self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios) ] return BufferList(cell_anchors) @property @torch.jit.unused def num_cell_anchors(self): """ Alias of `num_anchors`. """ return self.num_anchors @property @torch.jit.unused def num_anchors(self): """ Returns: list[int]: Each int is the number of anchors at every pixel location, on that feature map. For example, if at every pixel we use anchors of 3 aspect ratios and 5 sizes, the number of anchors is 15. (See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config) In standard RPN models, `num_anchors` on every feature map is the same. """ return [len(cell_anchors) for cell_anchors in self.cell_anchors] def _grid_anchors(self, grid_sizes: List[List[int]]): """ Returns: list[Tensor]: #featuremap tensors, each is (#locations x #cell_anchors) x 4 """ anchors = [] # buffers() not supported by torchscript. use named_buffers() instead buffers: List[torch.Tensor] = [x[1] for x in self.cell_anchors.named_buffers()] for size, stride, base_anchors in zip(grid_sizes, self.strides, buffers): shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors) shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)) return anchors def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)): """ Generate a tensor storing canonical anchor boxes, which are all anchor boxes of different sizes and aspect_ratios centered at (0, 0). We can later build the set of anchors for a full feature map by shifting and tiling these tensors (see `meth:_grid_anchors`). Args: sizes (tuple[float]): aspect_ratios (tuple[float]]): Returns: Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes in XYXY format. """ # This is different from the anchor generator defined in the original Faster R-CNN # code or Detectron. They yield the same AP, however the old version defines cell # anchors in a less natural way with a shift relative to the feature grid and # quantization that results in slightly different sizes for different aspect ratios. # See also https://github.com/facebookresearch/Detectron/issues/227 anchors = [] for size in sizes: area = size**2.0 for aspect_ratio in aspect_ratios: # s * s = w * h # a = h / w # ... some algebra ... # w = sqrt(s * s / a) # h = a * w w = math.sqrt(area / aspect_ratio) h = aspect_ratio * w x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0 anchors.append([x0, y0, x1, y1]) return torch.tensor(anchors) def forward(self, features: List[torch.Tensor]): """ Args: features (list[Tensor]): list of backbone feature maps on which to generate anchors. Returns: list[Boxes]: a list of Boxes containing all the anchors for each feature map (i.e. the cell anchors repeated over all locations in the feature map). The number of anchors of each feature map is Hi x Wi x num_cell_anchors, where Hi, Wi are resolution of the feature map divided by anchor stride. """ grid_sizes = [feature_map.shape[-2:] for feature_map in features] anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) return [Boxes(x) for x in anchors_over_all_feature_maps] @ANCHOR_GENERATOR_REGISTRY.register() class RotatedAnchorGenerator(nn.Module): """ Compute rotated anchors used by Rotated RPN (RRPN), described in "Arbitrary-Oriented Scene Text Detection via Rotation Proposals". """ box_dim: int = 5 """ the dimension of each anchor box. """ @configurable def __init__(self, *, sizes, aspect_ratios, strides, angles, offset=0.5): """ This interface is experimental. Args: sizes (list[list[float]] or list[float]): If sizes is list[list[float]], sizes[i] is the list of anchor sizes (i.e. sqrt of anchor area) to use for the i-th feature map. If sizes is list[float], the sizes are used for all feature maps. Anchor sizes are given in absolute lengths in units of the input image; they do not dynamically scale if the input image size changes. aspect_ratios (list[list[float]] or list[float]): list of aspect ratios (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies. strides (list[int]): stride of each input feature. angles (list[list[float]] or list[float]): list of angles (in degrees CCW) to use for anchors. Same "broadcast" rule for `sizes` applies. offset (float): Relative offset between the center of the first anchor and the top-left corner of the image. Value has to be in [0, 1). Recommend to use 0.5, which means half stride. """ super().__init__() self.strides = strides self.num_features = len(self.strides) sizes = _broadcast_params(sizes, self.num_features, "sizes") aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios") angles = _broadcast_params(angles, self.num_features, "angles") self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles) self.offset = offset assert 0.0 <= self.offset < 1.0, self.offset @classmethod def from_config(cls, cfg, input_shape: List[ShapeSpec]): return { "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES, "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS, "strides": [x.stride for x in input_shape], "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET, "angles": cfg.MODEL.ANCHOR_GENERATOR.ANGLES, } def _calculate_anchors(self, sizes, aspect_ratios, angles): cell_anchors = [ self.generate_cell_anchors(size, aspect_ratio, angle).float() for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles) ] return BufferList(cell_anchors) @property def num_cell_anchors(self): """ Alias of `num_anchors`. """ return self.num_anchors @property def num_anchors(self): """ Returns: list[int]: Each int is the number of anchors at every pixel location, on that feature map. For example, if at every pixel we use anchors of 3 aspect ratios, 2 sizes and 5 angles, the number of anchors is 30. (See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS and ANCHOR_GENERATOR.ANGLES in config) In standard RRPN models, `num_anchors` on every feature map is the same. """ return [len(cell_anchors) for cell_anchors in self.cell_anchors] def _grid_anchors(self, grid_sizes): anchors = [] for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors): shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors) zeros = torch.zeros_like(shift_x) shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1) anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5)) return anchors def generate_cell_anchors( self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2), angles=(-90, -60, -30, 0, 30, 60, 90), ): """ Generate a tensor storing canonical anchor boxes, which are all anchor boxes of different sizes, aspect_ratios, angles centered at (0, 0). We can later build the set of anchors for a full feature map by shifting and tiling these tensors (see `meth:_grid_anchors`). Args: sizes (tuple[float]): aspect_ratios (tuple[float]]): angles (tuple[float]]): Returns: Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5) storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format. """ anchors = [] for size in sizes: area = size**2.0 for aspect_ratio in aspect_ratios: # s * s = w * h # a = h / w # ... some algebra ... # w = sqrt(s * s / a) # h = a * w w = math.sqrt(area / aspect_ratio) h = aspect_ratio * w anchors.extend([0, 0, w, h, a] for a in angles) return torch.tensor(anchors) def forward(self, features): """ Args: features (list[Tensor]): list of backbone feature maps on which to generate anchors. Returns: list[RotatedBoxes]: a list of Boxes containing all the anchors for each feature map (i.e. the cell anchors repeated over all locations in the feature map). The number of anchors of each feature map is Hi x Wi x num_cell_anchors, where Hi, Wi are resolution of the feature map divided by anchor stride. """ grid_sizes = [feature_map.shape[-2:] for feature_map in features] anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) return [RotatedBoxes(x) for x in anchors_over_all_feature_maps] def build_anchor_generator(cfg, input_shape): """ Built an anchor generator from `cfg.MODEL.ANCHOR_GENERATOR.NAME`. """ anchor_generator = cfg.MODEL.ANCHOR_GENERATOR.NAME return ANCHOR_GENERATOR_REGISTRY.get(anchor_generator)(cfg, input_shape) ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .build import build_backbone, BACKBONE_REGISTRY # noqa F401 isort:skip from .backbone import Backbone from .fpn import FPN from .regnet import RegNet from .resnet import ( BasicStem, ResNet, ResNetBlockBase, build_resnet_backbone, make_stage, BottleneckBlock, ) from .vit import ViT, SimpleFeaturePyramid, get_vit_lr_decay_rate from .mvit import MViT from .swin import SwinTransformer __all__ = [k for k in globals().keys() if not k.startswith("_")] # TODO can expose more resnet blocks after careful consideration ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/backbone.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from abc import ABCMeta, abstractmethod from typing import Dict import torch.nn as nn from annotator.oneformer.detectron2.layers import ShapeSpec __all__ = ["Backbone"] class Backbone(nn.Module, metaclass=ABCMeta): """ Abstract base class for network backbones. """ def __init__(self): """ The `__init__` method of any subclass can specify its own set of arguments. """ super().__init__() @abstractmethod def forward(self): """ Subclasses must override this method, but adhere to the same return type. Returns: dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor """ pass @property def size_divisibility(self) -> int: """ Some backbones require the input height and width to be divisible by a specific integer. This is typically true for encoder / decoder type networks with lateral connection (e.g., FPN) for which feature maps need to match dimension in the "bottom up" and "top down" paths. Set to 0 if no specific input size divisibility is required. """ return 0 @property def padding_constraints(self) -> Dict[str, int]: """ This property is a generalization of size_divisibility. Some backbones and training recipes require specific padding constraints, such as enforcing divisibility by a specific integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale jitter in :paper:vitdet). `padding_constraints` contains these optional items like: { "size_divisibility": int, "square_size": int, # Future options are possible } `size_divisibility` will read from here if presented and `square_size` indicates the square padding size if `square_size` > 0. TODO: use type of Dict[str, int] to avoid torchscipt issues. The type of padding_constraints could be generalized as TypedDict (Python 3.8+) to support more types in the future. """ return {} def output_shape(self): """ Returns: dict[str->ShapeSpec] """ # this is a backward-compatible default return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from annotator.oneformer.detectron2.layers import ShapeSpec from annotator.oneformer.detectron2.utils.registry import Registry from .backbone import Backbone BACKBONE_REGISTRY = Registry("BACKBONE") BACKBONE_REGISTRY.__doc__ = """ Registry for backbones, which extract feature maps from images The registered object must be a callable that accepts two arguments: 1. A :class:`detectron2.config.CfgNode` 2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification. Registered object must return instance of :class:`Backbone`. """ def build_backbone(cfg, input_shape=None): """ Build a backbone from `cfg.MODEL.BACKBONE.NAME`. Returns: an instance of :class:`Backbone` """ if input_shape is None: input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) backbone_name = cfg.MODEL.BACKBONE.NAME backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape) assert isinstance(backbone, Backbone) return backbone ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/fpn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import math import fvcore.nn.weight_init as weight_init import torch import torch.nn.functional as F from torch import nn from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm from .backbone import Backbone from .build import BACKBONE_REGISTRY from .resnet import build_resnet_backbone __all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"] class FPN(Backbone): """ This module implements :paper:`FPN`. It creates pyramid features built on top of some input feature maps. """ _fuse_type: torch.jit.Final[str] def __init__( self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum", square_pad=0, ): """ Args: bottom_up (Backbone): module representing the bottom up subnetwork. Must be a subclass of :class:`Backbone`. The multi-scale feature maps generated by the bottom up network, and listed in `in_features`, are used to generate FPN levels. in_features (list[str]): names of the input feature maps coming from the backbone to which FPN is attached. For example, if the backbone produces ["res2", "res3", "res4"], any *contiguous* sublist of these may be used; order must be from high to low resolution. out_channels (int): number of channels in the output feature maps. norm (str): the normalization to use. top_block (nn.Module or None): if provided, an extra operation will be performed on the output of the last (smallest resolution) FPN output, and the result will extend the result list. The top_block further downsamples the feature map. It must have an attribute "num_levels", meaning the number of extra FPN levels added by this block, and "in_feature", which is a string representing its input feature (e.g., p5). fuse_type (str): types for fusing the top down features and the lateral ones. It can be "sum" (default), which sums up element-wise; or "avg", which takes the element-wise mean of the two. square_pad (int): If > 0, require input images to be padded to specific square size. """ super(FPN, self).__init__() assert isinstance(bottom_up, Backbone) assert in_features, in_features # Feature map strides and channels from the bottom up network (e.g. ResNet) input_shapes = bottom_up.output_shape() strides = [input_shapes[f].stride for f in in_features] in_channels_per_feature = [input_shapes[f].channels for f in in_features] _assert_strides_are_log2_contiguous(strides) lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(in_channels_per_feature): lateral_norm = get_norm(norm, out_channels) output_norm = get_norm(norm, out_channels) lateral_conv = Conv2d( in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) stage = int(math.log2(strides[idx])) self.add_module("fpn_lateral{}".format(stage), lateral_conv) self.add_module("fpn_output{}".format(stage), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] self.top_block = top_block self.in_features = tuple(in_features) self.bottom_up = bottom_up # Return feature names are "p", like ["p2", "p3", ..., "p6"] self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} # top block output feature maps. if self.top_block is not None: for s in range(stage, stage + self.top_block.num_levels): self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) self._out_features = list(self._out_feature_strides.keys()) self._out_feature_channels = {k: out_channels for k in self._out_features} self._size_divisibility = strides[-1] self._square_pad = square_pad assert fuse_type in {"avg", "sum"} self._fuse_type = fuse_type @property def size_divisibility(self): return self._size_divisibility @property def padding_constraints(self): return {"square_size": self._square_pad} def forward(self, x): """ Args: input (dict[str->Tensor]): mapping feature map name (e.g., "res5") to feature map tensor for each feature level in high to low resolution order. Returns: dict[str->Tensor]: mapping from feature map name to FPN feature map tensor in high to low resolution order. Returned feature names follow the FPN paper convention: "p", where stage has stride = 2 ** stage e.g., ["p2", "p3", ..., "p6"]. """ bottom_up_features = self.bottom_up(x) results = [] prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]]) results.append(self.output_convs[0](prev_features)) # Reverse feature maps into top-down order (from low to high resolution) for idx, (lateral_conv, output_conv) in enumerate( zip(self.lateral_convs, self.output_convs) ): # Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336 # Therefore we loop over all modules but skip the first one if idx > 0: features = self.in_features[-idx - 1] features = bottom_up_features[features] top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest") lateral_features = lateral_conv(features) prev_features = lateral_features + top_down_features if self._fuse_type == "avg": prev_features /= 2 results.insert(0, output_conv(prev_features)) if self.top_block is not None: if self.top_block.in_feature in bottom_up_features: top_block_in_feature = bottom_up_features[self.top_block.in_feature] else: top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] results.extend(self.top_block(top_block_in_feature)) assert len(self._out_features) == len(results) return {f: res for f, res in zip(self._out_features, results)} def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } def _assert_strides_are_log2_contiguous(strides): """ Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2". """ for i, stride in enumerate(strides[1:], 1): assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format( stride, strides[i - 1] ) class LastLevelMaxPool(nn.Module): """ This module is used in the original FPN to generate a downsampled P6 feature from P5. """ def __init__(self): super().__init__() self.num_levels = 1 self.in_feature = "p5" def forward(self, x): return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)] class LastLevelP6P7(nn.Module): """ This module is used in RetinaNet to generate extra layers, P6 and P7 from C5 feature. """ def __init__(self, in_channels, out_channels, in_feature="res5"): super().__init__() self.num_levels = 2 self.in_feature = in_feature self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) for module in [self.p6, self.p7]: weight_init.c2_xavier_fill(module) def forward(self, c5): p6 = self.p6(c5) p7 = self.p7(F.relu(p6)) return [p6, p7] @BACKBONE_REGISTRY.register() def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): """ Args: cfg: a detectron2 CfgNode Returns: backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. """ bottom_up = build_resnet_backbone(cfg, input_shape) in_features = cfg.MODEL.FPN.IN_FEATURES out_channels = cfg.MODEL.FPN.OUT_CHANNELS backbone = FPN( bottom_up=bottom_up, in_features=in_features, out_channels=out_channels, norm=cfg.MODEL.FPN.NORM, top_block=LastLevelMaxPool(), fuse_type=cfg.MODEL.FPN.FUSE_TYPE, ) return backbone @BACKBONE_REGISTRY.register() def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): """ Args: cfg: a detectron2 CfgNode Returns: backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. """ bottom_up = build_resnet_backbone(cfg, input_shape) in_features = cfg.MODEL.FPN.IN_FEATURES out_channels = cfg.MODEL.FPN.OUT_CHANNELS in_channels_p6p7 = bottom_up.output_shape()["res5"].channels backbone = FPN( bottom_up=bottom_up, in_features=in_features, out_channels=out_channels, norm=cfg.MODEL.FPN.NORM, top_block=LastLevelP6P7(in_channels_p6p7, out_channels), fuse_type=cfg.MODEL.FPN.FUSE_TYPE, ) return backbone ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/mvit.py ================================================ import logging import numpy as np import torch import torch.nn as nn from .backbone import Backbone from .utils import ( PatchEmbed, add_decomposed_rel_pos, get_abs_pos, window_partition, window_unpartition, ) logger = logging.getLogger(__name__) __all__ = ["MViT"] def attention_pool(x, pool, norm=None): # (B, H, W, C) -> (B, C, H, W) x = x.permute(0, 3, 1, 2) x = pool(x) # (B, C, H1, W1) -> (B, H1, W1, C) x = x.permute(0, 2, 3, 1) if norm: x = norm(x) return x class MultiScaleAttention(nn.Module): """Multiscale Multi-head Attention block.""" def __init__( self, dim, dim_out, num_heads, qkv_bias=True, norm_layer=nn.LayerNorm, pool_kernel=(3, 3), stride_q=1, stride_kv=1, residual_pooling=True, window_size=0, use_rel_pos=False, rel_pos_zero_init=True, input_size=None, ): """ Args: dim (int): Number of input channels. dim_out (int): Number of output channels. num_heads (int): Number of attention heads. qkv_bias (bool: If True, add a learnable bias to query, key, value. norm_layer (nn.Module): Normalization layer. pool_kernel (tuple): kernel size for qkv pooling layers. stride_q (int): stride size for q pooling layer. stride_kv (int): stride size for kv pooling layer. residual_pooling (bool): If true, enable residual pooling. use_rel_pos (bool): If True, add relative postional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. input_size (int or None): Input resolution. """ super().__init__() self.num_heads = num_heads head_dim = dim_out // num_heads self.scale = head_dim**-0.5 self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias) self.proj = nn.Linear(dim_out, dim_out) # qkv pooling pool_padding = [k // 2 for k in pool_kernel] dim_conv = dim_out // num_heads self.pool_q = nn.Conv2d( dim_conv, dim_conv, pool_kernel, stride=stride_q, padding=pool_padding, groups=dim_conv, bias=False, ) self.norm_q = norm_layer(dim_conv) self.pool_k = nn.Conv2d( dim_conv, dim_conv, pool_kernel, stride=stride_kv, padding=pool_padding, groups=dim_conv, bias=False, ) self.norm_k = norm_layer(dim_conv) self.pool_v = nn.Conv2d( dim_conv, dim_conv, pool_kernel, stride=stride_kv, padding=pool_padding, groups=dim_conv, bias=False, ) self.norm_v = norm_layer(dim_conv) self.window_size = window_size if window_size: self.q_win_size = window_size // stride_q self.kv_win_size = window_size // stride_kv self.residual_pooling = residual_pooling self.use_rel_pos = use_rel_pos if self.use_rel_pos: # initialize relative positional embeddings assert input_size[0] == input_size[1] size = input_size[0] rel_dim = 2 * max(size // stride_q, size // stride_kv) - 1 self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim)) self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim)) if not rel_pos_zero_init: nn.init.trunc_normal_(self.rel_pos_h, std=0.02) nn.init.trunc_normal_(self.rel_pos_w, std=0.02) def forward(self, x): B, H, W, _ = x.shape # qkv with shape (3, B, nHead, H, W, C) qkv = self.qkv(x).reshape(B, H, W, 3, self.num_heads, -1).permute(3, 0, 4, 1, 2, 5) # q, k, v with shape (B * nHead, H, W, C) q, k, v = qkv.reshape(3, B * self.num_heads, H, W, -1).unbind(0) q = attention_pool(q, self.pool_q, self.norm_q) k = attention_pool(k, self.pool_k, self.norm_k) v = attention_pool(v, self.pool_v, self.norm_v) ori_q = q if self.window_size: q, q_hw_pad = window_partition(q, self.q_win_size) k, kv_hw_pad = window_partition(k, self.kv_win_size) v, _ = window_partition(v, self.kv_win_size) q_hw = (self.q_win_size, self.q_win_size) kv_hw = (self.kv_win_size, self.kv_win_size) else: q_hw = q.shape[1:3] kv_hw = k.shape[1:3] q = q.view(q.shape[0], np.prod(q_hw), -1) k = k.view(k.shape[0], np.prod(kv_hw), -1) v = v.view(v.shape[0], np.prod(kv_hw), -1) attn = (q * self.scale) @ k.transpose(-2, -1) if self.use_rel_pos: attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, q_hw, kv_hw) attn = attn.softmax(dim=-1) x = attn @ v x = x.view(x.shape[0], q_hw[0], q_hw[1], -1) if self.window_size: x = window_unpartition(x, self.q_win_size, q_hw_pad, ori_q.shape[1:3]) if self.residual_pooling: x += ori_q H, W = x.shape[1], x.shape[2] x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) x = self.proj(x) return x class MultiScaleBlock(nn.Module): """Multiscale Transformer blocks""" def __init__( self, dim, dim_out, num_heads, mlp_ratio=4.0, qkv_bias=True, drop_path=0.0, norm_layer=nn.LayerNorm, act_layer=nn.GELU, qkv_pool_kernel=(3, 3), stride_q=1, stride_kv=1, residual_pooling=True, window_size=0, use_rel_pos=False, rel_pos_zero_init=True, input_size=None, ): """ Args: dim (int): Number of input channels. dim_out (int): Number of output channels. num_heads (int): Number of attention heads in the MViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. qkv_pool_kernel (tuple): kernel size for qkv pooling layers. stride_q (int): stride size for q pooling layer. stride_kv (int): stride size for kv pooling layer. residual_pooling (bool): If true, enable residual pooling. window_size (int): Window size for window attention blocks. If it equals 0, then not use window attention. use_rel_pos (bool): If True, add relative postional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. input_size (int or None): Input resolution. """ super().__init__() self.norm1 = norm_layer(dim) self.attn = MultiScaleAttention( dim, dim_out, num_heads=num_heads, qkv_bias=qkv_bias, norm_layer=norm_layer, pool_kernel=qkv_pool_kernel, stride_q=stride_q, stride_kv=stride_kv, residual_pooling=residual_pooling, window_size=window_size, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, input_size=input_size, ) from timm.models.layers import DropPath, Mlp self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim_out) self.mlp = Mlp( in_features=dim_out, hidden_features=int(dim_out * mlp_ratio), out_features=dim_out, act_layer=act_layer, ) if dim != dim_out: self.proj = nn.Linear(dim, dim_out) if stride_q > 1: kernel_skip = stride_q + 1 padding_skip = int(kernel_skip // 2) self.pool_skip = nn.MaxPool2d(kernel_skip, stride_q, padding_skip, ceil_mode=False) def forward(self, x): x_norm = self.norm1(x) x_block = self.attn(x_norm) if hasattr(self, "proj"): x = self.proj(x_norm) if hasattr(self, "pool_skip"): x = attention_pool(x, self.pool_skip) x = x + self.drop_path(x_block) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class MViT(Backbone): """ This module implements Multiscale Vision Transformer (MViT) backbone in :paper:'mvitv2'. """ def __init__( self, img_size=224, patch_kernel=(7, 7), patch_stride=(4, 4), patch_padding=(3, 3), in_chans=3, embed_dim=96, depth=16, num_heads=1, last_block_indexes=(0, 2, 11, 15), qkv_pool_kernel=(3, 3), adaptive_kv_stride=4, adaptive_window_size=56, residual_pooling=True, mlp_ratio=4.0, qkv_bias=True, drop_path_rate=0.0, norm_layer=nn.LayerNorm, act_layer=nn.GELU, use_abs_pos=False, use_rel_pos=True, rel_pos_zero_init=True, use_act_checkpoint=False, pretrain_img_size=224, pretrain_use_cls_token=True, out_features=("scale2", "scale3", "scale4", "scale5"), ): """ Args: img_size (int): Input image size. patch_kernel (tuple): kernel size for patch embedding. patch_stride (tuple): stride size for patch embedding. patch_padding (tuple): padding size for patch embedding. in_chans (int): Number of input image channels. embed_dim (int): Patch embedding dimension. depth (int): Depth of MViT. num_heads (int): Number of base attention heads in each MViT block. last_block_indexes (tuple): Block indexes for last blocks in each stage. qkv_pool_kernel (tuple): kernel size for qkv pooling layers. adaptive_kv_stride (int): adaptive stride size for kv pooling. adaptive_window_size (int): adaptive window size for window attention blocks. residual_pooling (bool): If true, enable residual pooling. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path_rate (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. use_rel_pos (bool): If True, add relative postional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. use_act_checkpoint (bool): If True, use activation checkpointing. pretrain_img_size (int): input image size for pretraining models. pretrain_use_cls_token (bool): If True, pretrainig models use class token. out_features (tuple): name of the feature maps from each stage. """ super().__init__() self.pretrain_use_cls_token = pretrain_use_cls_token self.patch_embed = PatchEmbed( kernel_size=patch_kernel, stride=patch_stride, padding=patch_padding, in_chans=in_chans, embed_dim=embed_dim, ) if use_abs_pos: # Initialize absoluate positional embedding with pretrain image size. num_patches = (pretrain_img_size // patch_stride[0]) * ( pretrain_img_size // patch_stride[1] ) num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim)) else: self.pos_embed = None # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] dim_out = embed_dim stride_kv = adaptive_kv_stride window_size = adaptive_window_size input_size = (img_size // patch_stride[0], img_size // patch_stride[1]) stage = 2 stride = patch_stride[0] self._out_feature_strides = {} self._out_feature_channels = {} self.blocks = nn.ModuleList() for i in range(depth): # Multiply stride_kv by 2 if it's the last block of stage2 and stage3. if i == last_block_indexes[1] or i == last_block_indexes[2]: stride_kv_ = stride_kv * 2 else: stride_kv_ = stride_kv # hybrid window attention: global attention in last three stages. window_size_ = 0 if i in last_block_indexes[1:] else window_size block = MultiScaleBlock( dim=embed_dim, dim_out=dim_out, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop_path=dpr[i], norm_layer=norm_layer, qkv_pool_kernel=qkv_pool_kernel, stride_q=2 if i - 1 in last_block_indexes else 1, stride_kv=stride_kv_, residual_pooling=residual_pooling, window_size=window_size_, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, input_size=input_size, ) if use_act_checkpoint: # TODO: use torch.utils.checkpoint from fairscale.nn.checkpoint import checkpoint_wrapper block = checkpoint_wrapper(block) self.blocks.append(block) embed_dim = dim_out if i in last_block_indexes: name = f"scale{stage}" if name in out_features: self._out_feature_channels[name] = dim_out self._out_feature_strides[name] = stride self.add_module(f"{name}_norm", norm_layer(dim_out)) dim_out *= 2 num_heads *= 2 stride_kv = max(stride_kv // 2, 1) stride *= 2 stage += 1 if i - 1 in last_block_indexes: window_size = window_size // 2 input_size = [s // 2 for s in input_size] self._out_features = out_features self._last_block_indexes = last_block_indexes if self.pos_embed is not None: nn.init.trunc_normal_(self.pos_embed, std=0.02) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): nn.init.trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, x): x = self.patch_embed(x) if self.pos_embed is not None: x = x + get_abs_pos(self.pos_embed, self.pretrain_use_cls_token, x.shape[1:3]) outputs = {} stage = 2 for i, blk in enumerate(self.blocks): x = blk(x) if i in self._last_block_indexes: name = f"scale{stage}" if name in self._out_features: x_out = getattr(self, f"{name}_norm")(x) outputs[name] = x_out.permute(0, 3, 1, 2) stage += 1 return outputs ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/regnet.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved """ Implementation of RegNet models from :paper:`dds` and :paper:`scaling`. This code is adapted from https://github.com/facebookresearch/pycls with minimal modifications. Some code duplication exists between RegNet and ResNets (e.g., ResStem) in order to simplify model loading. """ import numpy as np from torch import nn from annotator.oneformer.detectron2.layers import CNNBlockBase, ShapeSpec, get_norm from .backbone import Backbone __all__ = [ "AnyNet", "RegNet", "ResStem", "SimpleStem", "VanillaBlock", "ResBasicBlock", "ResBottleneckBlock", ] def conv2d(w_in, w_out, k, *, stride=1, groups=1, bias=False): """Helper for building a conv2d layer.""" assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues." s, p, g, b = stride, (k - 1) // 2, groups, bias return nn.Conv2d(w_in, w_out, k, stride=s, padding=p, groups=g, bias=b) def gap2d(): """Helper for building a global average pooling layer.""" return nn.AdaptiveAvgPool2d((1, 1)) def pool2d(k, *, stride=1): """Helper for building a pool2d layer.""" assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues." return nn.MaxPool2d(k, stride=stride, padding=(k - 1) // 2) def init_weights(m): """Performs ResNet-style weight initialization.""" if isinstance(m, nn.Conv2d): # Note that there is no bias due to BN fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(mean=0.0, std=np.sqrt(2.0 / fan_out)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1.0) m.bias.data.zero_() elif isinstance(m, nn.Linear): m.weight.data.normal_(mean=0.0, std=0.01) m.bias.data.zero_() class ResStem(CNNBlockBase): """ResNet stem for ImageNet: 7x7, BN, AF, MaxPool.""" def __init__(self, w_in, w_out, norm, activation_class): super().__init__(w_in, w_out, 4) self.conv = conv2d(w_in, w_out, 7, stride=2) self.bn = get_norm(norm, w_out) self.af = activation_class() self.pool = pool2d(3, stride=2) def forward(self, x): for layer in self.children(): x = layer(x) return x class SimpleStem(CNNBlockBase): """Simple stem for ImageNet: 3x3, BN, AF.""" def __init__(self, w_in, w_out, norm, activation_class): super().__init__(w_in, w_out, 2) self.conv = conv2d(w_in, w_out, 3, stride=2) self.bn = get_norm(norm, w_out) self.af = activation_class() def forward(self, x): for layer in self.children(): x = layer(x) return x class SE(nn.Module): """Squeeze-and-Excitation (SE) block: AvgPool, FC, Act, FC, Sigmoid.""" def __init__(self, w_in, w_se, activation_class): super().__init__() self.avg_pool = gap2d() self.f_ex = nn.Sequential( conv2d(w_in, w_se, 1, bias=True), activation_class(), conv2d(w_se, w_in, 1, bias=True), nn.Sigmoid(), ) def forward(self, x): return x * self.f_ex(self.avg_pool(x)) class VanillaBlock(CNNBlockBase): """Vanilla block: [3x3 conv, BN, Relu] x2.""" def __init__(self, w_in, w_out, stride, norm, activation_class, _params): super().__init__(w_in, w_out, stride) self.a = conv2d(w_in, w_out, 3, stride=stride) self.a_bn = get_norm(norm, w_out) self.a_af = activation_class() self.b = conv2d(w_out, w_out, 3) self.b_bn = get_norm(norm, w_out) self.b_af = activation_class() def forward(self, x): for layer in self.children(): x = layer(x) return x class BasicTransform(nn.Module): """Basic transformation: [3x3 conv, BN, Relu] x2.""" def __init__(self, w_in, w_out, stride, norm, activation_class, _params): super().__init__() self.a = conv2d(w_in, w_out, 3, stride=stride) self.a_bn = get_norm(norm, w_out) self.a_af = activation_class() self.b = conv2d(w_out, w_out, 3) self.b_bn = get_norm(norm, w_out) self.b_bn.final_bn = True def forward(self, x): for layer in self.children(): x = layer(x) return x class ResBasicBlock(CNNBlockBase): """Residual basic block: x + f(x), f = basic transform.""" def __init__(self, w_in, w_out, stride, norm, activation_class, params): super().__init__(w_in, w_out, stride) self.proj, self.bn = None, None if (w_in != w_out) or (stride != 1): self.proj = conv2d(w_in, w_out, 1, stride=stride) self.bn = get_norm(norm, w_out) self.f = BasicTransform(w_in, w_out, stride, norm, activation_class, params) self.af = activation_class() def forward(self, x): x_p = self.bn(self.proj(x)) if self.proj else x return self.af(x_p + self.f(x)) class BottleneckTransform(nn.Module): """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" def __init__(self, w_in, w_out, stride, norm, activation_class, params): super().__init__() w_b = int(round(w_out * params["bot_mul"])) w_se = int(round(w_in * params["se_r"])) groups = w_b // params["group_w"] self.a = conv2d(w_in, w_b, 1) self.a_bn = get_norm(norm, w_b) self.a_af = activation_class() self.b = conv2d(w_b, w_b, 3, stride=stride, groups=groups) self.b_bn = get_norm(norm, w_b) self.b_af = activation_class() self.se = SE(w_b, w_se, activation_class) if w_se else None self.c = conv2d(w_b, w_out, 1) self.c_bn = get_norm(norm, w_out) self.c_bn.final_bn = True def forward(self, x): for layer in self.children(): x = layer(x) return x class ResBottleneckBlock(CNNBlockBase): """Residual bottleneck block: x + f(x), f = bottleneck transform.""" def __init__(self, w_in, w_out, stride, norm, activation_class, params): super().__init__(w_in, w_out, stride) self.proj, self.bn = None, None if (w_in != w_out) or (stride != 1): self.proj = conv2d(w_in, w_out, 1, stride=stride) self.bn = get_norm(norm, w_out) self.f = BottleneckTransform(w_in, w_out, stride, norm, activation_class, params) self.af = activation_class() def forward(self, x): x_p = self.bn(self.proj(x)) if self.proj else x return self.af(x_p + self.f(x)) class AnyStage(nn.Module): """AnyNet stage (sequence of blocks w/ the same output shape).""" def __init__(self, w_in, w_out, stride, d, block_class, norm, activation_class, params): super().__init__() for i in range(d): block = block_class(w_in, w_out, stride, norm, activation_class, params) self.add_module("b{}".format(i + 1), block) stride, w_in = 1, w_out def forward(self, x): for block in self.children(): x = block(x) return x class AnyNet(Backbone): """AnyNet model. See :paper:`dds`.""" def __init__( self, *, stem_class, stem_width, block_class, depths, widths, group_widths, strides, bottleneck_ratios, se_ratio, activation_class, freeze_at=0, norm="BN", out_features=None, ): """ Args: stem_class (callable): A callable taking 4 arguments (channels in, channels out, normalization, callable returning an activation function) that returns another callable implementing the stem module. stem_width (int): The number of output channels that the stem produces. block_class (callable): A callable taking 6 arguments (channels in, channels out, stride, normalization, callable returning an activation function, a dict of block-specific parameters) that returns another callable implementing the repeated block module. depths (list[int]): Number of blocks in each stage. widths (list[int]): For each stage, the number of output channels of each block. group_widths (list[int]): For each stage, the number of channels per group in group convolution, if the block uses group convolution. strides (list[int]): The stride that each network stage applies to its input. bottleneck_ratios (list[float]): For each stage, the ratio of the number of bottleneck channels to the number of block input channels (or, equivalently, output channels), if the block uses a bottleneck. se_ratio (float): The ratio of the number of channels used inside the squeeze-excitation (SE) module to it number of input channels, if SE the block uses SE. activation_class (callable): A callable taking no arguments that returns another callable implementing an activation function. freeze_at (int): The number of stages at the beginning to freeze. see :meth:`freeze` for detailed explanation. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. out_features (list[str]): name of the layers whose outputs should be returned in forward. RegNet's use "stem" and "s1", "s2", etc for the stages after the stem. If None, will return the output of the last layer. """ super().__init__() self.stem = stem_class(3, stem_width, norm, activation_class) current_stride = self.stem.stride self._out_feature_strides = {"stem": current_stride} self._out_feature_channels = {"stem": self.stem.out_channels} self.stages_and_names = [] prev_w = stem_width for i, (d, w, s, b, g) in enumerate( zip(depths, widths, strides, bottleneck_ratios, group_widths) ): params = {"bot_mul": b, "group_w": g, "se_r": se_ratio} stage = AnyStage(prev_w, w, s, d, block_class, norm, activation_class, params) name = "s{}".format(i + 1) self.add_module(name, stage) self.stages_and_names.append((stage, name)) self._out_feature_strides[name] = current_stride = int( current_stride * np.prod([k.stride for k in stage.children()]) ) self._out_feature_channels[name] = list(stage.children())[-1].out_channels prev_w = w self.apply(init_weights) if out_features is None: out_features = [name] self._out_features = out_features assert len(self._out_features) children = [x[0] for x in self.named_children()] for out_feature in self._out_features: assert out_feature in children, "Available children: {} does not include {}".format( ", ".join(children), out_feature ) self.freeze(freeze_at) def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert x.dim() == 4, f"Model takes an input of shape (N, C, H, W). Got {x.shape} instead!" outputs = {} x = self.stem(x) if "stem" in self._out_features: outputs["stem"] = x for stage, name in self.stages_and_names: x = stage(x) if name in self._out_features: outputs[name] = x return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } def freeze(self, freeze_at=0): """ Freeze the first several stages of the model. Commonly used in fine-tuning. Layers that produce the same feature map spatial size are defined as one "stage" by :paper:`FPN`. Args: freeze_at (int): number of stages to freeze. `1` means freezing the stem. `2` means freezing the stem and one residual stage, etc. Returns: nn.Module: this model itself """ if freeze_at >= 1: self.stem.freeze() for idx, (stage, _) in enumerate(self.stages_and_names, start=2): if freeze_at >= idx: for block in stage.children(): block.freeze() return self def adjust_block_compatibility(ws, bs, gs): """Adjusts the compatibility of widths, bottlenecks, and groups.""" assert len(ws) == len(bs) == len(gs) assert all(w > 0 and b > 0 and g > 0 for w, b, g in zip(ws, bs, gs)) vs = [int(max(1, w * b)) for w, b in zip(ws, bs)] gs = [int(min(g, v)) for g, v in zip(gs, vs)] ms = [np.lcm(g, b) if b > 1 else g for g, b in zip(gs, bs)] vs = [max(m, int(round(v / m) * m)) for v, m in zip(vs, ms)] ws = [int(v / b) for v, b in zip(vs, bs)] assert all(w * b % g == 0 for w, b, g in zip(ws, bs, gs)) return ws, bs, gs def generate_regnet_parameters(w_a, w_0, w_m, d, q=8): """Generates per stage widths and depths from RegNet parameters.""" assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0 # Generate continuous per-block ws ws_cont = np.arange(d) * w_a + w_0 # Generate quantized per-block ws ks = np.round(np.log(ws_cont / w_0) / np.log(w_m)) ws_all = w_0 * np.power(w_m, ks) ws_all = np.round(np.divide(ws_all, q)).astype(int) * q # Generate per stage ws and ds (assumes ws_all are sorted) ws, ds = np.unique(ws_all, return_counts=True) # Compute number of actual stages and total possible stages num_stages, total_stages = len(ws), ks.max() + 1 # Convert numpy arrays to lists and return ws, ds, ws_all, ws_cont = (x.tolist() for x in (ws, ds, ws_all, ws_cont)) return ws, ds, num_stages, total_stages, ws_all, ws_cont class RegNet(AnyNet): """RegNet model. See :paper:`dds`.""" def __init__( self, *, stem_class, stem_width, block_class, depth, w_a, w_0, w_m, group_width, stride=2, bottleneck_ratio=1.0, se_ratio=0.0, activation_class=None, freeze_at=0, norm="BN", out_features=None, ): """ Build a RegNet from the parameterization described in :paper:`dds` Section 3.3. Args: See :class:`AnyNet` for arguments that are not listed here. depth (int): Total number of blocks in the RegNet. w_a (float): Factor by which block width would increase prior to quantizing block widths by stage. See :paper:`dds` Section 3.3. w_0 (int): Initial block width. See :paper:`dds` Section 3.3. w_m (float): Parameter controlling block width quantization. See :paper:`dds` Section 3.3. group_width (int): Number of channels per group in group convolution, if the block uses group convolution. bottleneck_ratio (float): The ratio of the number of bottleneck channels to the number of block input channels (or, equivalently, output channels), if the block uses a bottleneck. stride (int): The stride that each network stage applies to its input. """ ws, ds = generate_regnet_parameters(w_a, w_0, w_m, depth)[0:2] ss = [stride for _ in ws] bs = [bottleneck_ratio for _ in ws] gs = [group_width for _ in ws] ws, bs, gs = adjust_block_compatibility(ws, bs, gs) def default_activation_class(): return nn.ReLU(inplace=True) super().__init__( stem_class=stem_class, stem_width=stem_width, block_class=block_class, depths=ds, widths=ws, strides=ss, group_widths=gs, bottleneck_ratios=bs, se_ratio=se_ratio, activation_class=default_activation_class if activation_class is None else activation_class, freeze_at=freeze_at, norm=norm, out_features=out_features, ) ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/resnet.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import numpy as np import fvcore.nn.weight_init as weight_init import torch import torch.nn.functional as F from torch import nn from annotator.oneformer.detectron2.layers import ( CNNBlockBase, Conv2d, DeformConv, ModulatedDeformConv, ShapeSpec, get_norm, ) from .backbone import Backbone from .build import BACKBONE_REGISTRY __all__ = [ "ResNetBlockBase", "BasicBlock", "BottleneckBlock", "DeformBottleneckBlock", "BasicStem", "ResNet", "make_stage", "build_resnet_backbone", ] class BasicBlock(CNNBlockBase): """ The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`, with two 3x3 conv layers and a projection shortcut if needed. """ def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): """ Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. stride (int): Stride for the first conv. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. """ super().__init__(in_channels, out_channels, stride) if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None self.conv1 = Conv2d( in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False, norm=get_norm(norm, out_channels), ) self.conv2 = Conv2d( out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) def forward(self, x): out = self.conv1(x) out = F.relu_(out) out = self.conv2(out) if self.shortcut is not None: shortcut = self.shortcut(x) else: shortcut = x out += shortcut out = F.relu_(out) return out class BottleneckBlock(CNNBlockBase): """ The standard bottleneck residual block used by ResNet-50, 101 and 152 defined in :paper:`ResNet`. It contains 3 conv layers with kernels 1x1, 3x3, 1x1, and a projection shortcut if needed. """ def __init__( self, in_channels, out_channels, *, bottleneck_channels, stride=1, num_groups=1, norm="BN", stride_in_1x1=False, dilation=1, ): """ Args: bottleneck_channels (int): number of output channels for the 3x3 "bottleneck" conv layers. num_groups (int): number of groups for the 3x3 conv layer. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. stride_in_1x1 (bool): when stride>1, whether to put stride in the first 1x1 convolution or the bottleneck 3x3 convolution. dilation (int): the dilation rate of the 3x3 conv layer. """ super().__init__(in_channels, out_channels, stride) if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None # The original MSRA ResNet models have stride in the first 1x1 conv # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have # stride in the 3x3 conv stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, norm=get_norm(norm, bottleneck_channels), ) self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, norm=get_norm(norm, bottleneck_channels), ) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) # Zero-initialize the last normalization in each residual branch, # so that at the beginning, the residual branch starts with zeros, # and each residual block behaves like an identity. # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": # "For BN layers, the learnable scaling coefficient γ is initialized # to be 1, except for each residual block's last BN # where γ is initialized to be 0." # nn.init.constant_(self.conv3.norm.weight, 0) # TODO this somehow hurts performance when training GN models from scratch. # Add it as an option when we need to use this code to train a backbone. def forward(self, x): out = self.conv1(x) out = F.relu_(out) out = self.conv2(out) out = F.relu_(out) out = self.conv3(out) if self.shortcut is not None: shortcut = self.shortcut(x) else: shortcut = x out += shortcut out = F.relu_(out) return out class DeformBottleneckBlock(CNNBlockBase): """ Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv ` in the 3x3 convolution. """ def __init__( self, in_channels, out_channels, *, bottleneck_channels, stride=1, num_groups=1, norm="BN", stride_in_1x1=False, dilation=1, deform_modulated=False, deform_num_groups=1, ): super().__init__(in_channels, out_channels, stride) self.deform_modulated = deform_modulated if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, norm=get_norm(norm, bottleneck_channels), ) if deform_modulated: deform_conv_op = ModulatedDeformConv # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size offset_channels = 27 else: deform_conv_op = DeformConv offset_channels = 18 self.conv2_offset = Conv2d( bottleneck_channels, offset_channels * deform_num_groups, kernel_size=3, stride=stride_3x3, padding=1 * dilation, dilation=dilation, ) self.conv2 = deform_conv_op( bottleneck_channels, bottleneck_channels, kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, deformable_groups=deform_num_groups, norm=get_norm(norm, bottleneck_channels), ) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) nn.init.constant_(self.conv2_offset.weight, 0) nn.init.constant_(self.conv2_offset.bias, 0) def forward(self, x): out = self.conv1(x) out = F.relu_(out) if self.deform_modulated: offset_mask = self.conv2_offset(out) offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1) offset = torch.cat((offset_x, offset_y), dim=1) mask = mask.sigmoid() out = self.conv2(out, offset, mask) else: offset = self.conv2_offset(out) out = self.conv2(out, offset) out = F.relu_(out) out = self.conv3(out) if self.shortcut is not None: shortcut = self.shortcut(x) else: shortcut = x out += shortcut out = F.relu_(out) return out class BasicStem(CNNBlockBase): """ The standard ResNet stem (layers before the first residual block), with a conv, relu and max_pool. """ def __init__(self, in_channels=3, out_channels=64, norm="BN"): """ Args: norm (str or callable): norm after the first conv layer. See :func:`layers.get_norm` for supported format. """ super().__init__(in_channels, out_channels, 4) self.in_channels = in_channels self.conv1 = Conv2d( in_channels, out_channels, kernel_size=7, stride=2, padding=3, bias=False, norm=get_norm(norm, out_channels), ) weight_init.c2_msra_fill(self.conv1) def forward(self, x): x = self.conv1(x) x = F.relu_(x) x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) return x class ResNet(Backbone): """ Implement :paper:`ResNet`. """ def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0): """ Args: stem (nn.Module): a stem module stages (list[list[CNNBlockBase]]): several (typically 4) stages, each contains multiple :class:`CNNBlockBase`. num_classes (None or int): if None, will not perform classification. Otherwise, will create a linear layer. out_features (list[str]): name of the layers whose outputs should be returned in forward. Can be anything in "stem", "linear", or "res2" ... If None, will return the output of the last layer. freeze_at (int): The number of stages at the beginning to freeze. see :meth:`freeze` for detailed explanation. """ super().__init__() self.stem = stem self.num_classes = num_classes current_stride = self.stem.stride self._out_feature_strides = {"stem": current_stride} self._out_feature_channels = {"stem": self.stem.out_channels} self.stage_names, self.stages = [], [] if out_features is not None: # Avoid keeping unused layers in this module. They consume extra memory # and may cause allreduce to fail num_stages = max( [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features] ) stages = stages[:num_stages] for i, blocks in enumerate(stages): assert len(blocks) > 0, len(blocks) for block in blocks: assert isinstance(block, CNNBlockBase), block name = "res" + str(i + 2) stage = nn.Sequential(*blocks) self.add_module(name, stage) self.stage_names.append(name) self.stages.append(stage) self._out_feature_strides[name] = current_stride = int( current_stride * np.prod([k.stride for k in blocks]) ) self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels self.stage_names = tuple(self.stage_names) # Make it static for scripting if num_classes is not None: self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.linear = nn.Linear(curr_channels, num_classes) # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": # "The 1000-way fully-connected layer is initialized by # drawing weights from a zero-mean Gaussian with standard deviation of 0.01." nn.init.normal_(self.linear.weight, std=0.01) name = "linear" if out_features is None: out_features = [name] self._out_features = out_features assert len(self._out_features) children = [x[0] for x in self.named_children()] for out_feature in self._out_features: assert out_feature in children, "Available children: {}".format(", ".join(children)) self.freeze(freeze_at) def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!" outputs = {} x = self.stem(x) if "stem" in self._out_features: outputs["stem"] = x for name, stage in zip(self.stage_names, self.stages): x = stage(x) if name in self._out_features: outputs[name] = x if self.num_classes is not None: x = self.avgpool(x) x = torch.flatten(x, 1) x = self.linear(x) if "linear" in self._out_features: outputs["linear"] = x return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } def freeze(self, freeze_at=0): """ Freeze the first several stages of the ResNet. Commonly used in fine-tuning. Layers that produce the same feature map spatial size are defined as one "stage" by :paper:`FPN`. Args: freeze_at (int): number of stages to freeze. `1` means freezing the stem. `2` means freezing the stem and one residual stage, etc. Returns: nn.Module: this ResNet itself """ if freeze_at >= 1: self.stem.freeze() for idx, stage in enumerate(self.stages, start=2): if freeze_at >= idx: for block in stage.children(): block.freeze() return self @staticmethod def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs): """ Create a list of blocks of the same type that forms one ResNet stage. Args: block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this stage. A module of this type must not change spatial resolution of inputs unless its stride != 1. num_blocks (int): number of blocks in this stage in_channels (int): input channels of the entire stage. out_channels (int): output channels of **every block** in the stage. kwargs: other arguments passed to the constructor of `block_class`. If the argument name is "xx_per_block", the argument is a list of values to be passed to each block in the stage. Otherwise, the same argument is passed to every block in the stage. Returns: list[CNNBlockBase]: a list of block module. Examples: :: stage = ResNet.make_stage( BottleneckBlock, 3, in_channels=16, out_channels=64, bottleneck_channels=16, num_groups=1, stride_per_block=[2, 1, 1], dilations_per_block=[1, 1, 2] ) Usually, layers that produce the same feature map spatial size are defined as one "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should all be 1. """ blocks = [] for i in range(num_blocks): curr_kwargs = {} for k, v in kwargs.items(): if k.endswith("_per_block"): assert len(v) == num_blocks, ( f"Argument '{k}' of make_stage should have the " f"same length as num_blocks={num_blocks}." ) newk = k[: -len("_per_block")] assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!" curr_kwargs[newk] = v[i] else: curr_kwargs[k] = v blocks.append( block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs) ) in_channels = out_channels return blocks @staticmethod def make_default_stages(depth, block_class=None, **kwargs): """ Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152). If it doesn't create the ResNet variant you need, please use :meth:`make_stage` instead for fine-grained customization. Args: depth (int): depth of ResNet block_class (type): the CNN block class. Has to accept `bottleneck_channels` argument for depth > 50. By default it is BasicBlock or BottleneckBlock, based on the depth. kwargs: other arguments to pass to `make_stage`. Should not contain stride and channels, as they are predefined for each depth. Returns: list[list[CNNBlockBase]]: modules in all stages; see arguments of :class:`ResNet.__init__`. """ num_blocks_per_stage = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3], }[depth] if block_class is None: block_class = BasicBlock if depth < 50 else BottleneckBlock if depth < 50: in_channels = [64, 64, 128, 256] out_channels = [64, 128, 256, 512] else: in_channels = [64, 256, 512, 1024] out_channels = [256, 512, 1024, 2048] ret = [] for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels): if depth >= 50: kwargs["bottleneck_channels"] = o // 4 ret.append( ResNet.make_stage( block_class=block_class, num_blocks=n, stride_per_block=[s] + [1] * (n - 1), in_channels=i, out_channels=o, **kwargs, ) ) return ret ResNetBlockBase = CNNBlockBase """ Alias for backward compatibiltiy. """ def make_stage(*args, **kwargs): """ Deprecated alias for backward compatibiltiy. """ return ResNet.make_stage(*args, **kwargs) @BACKBONE_REGISTRY.register() def build_resnet_backbone(cfg, input_shape): """ Create a ResNet instance from config. Returns: ResNet: a :class:`ResNet` instance. """ # need registration of new blocks/stems? norm = cfg.MODEL.RESNETS.NORM stem = BasicStem( in_channels=input_shape.channels, out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, norm=norm, ) # fmt: off freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT out_features = cfg.MODEL.RESNETS.OUT_FEATURES depth = cfg.MODEL.RESNETS.DEPTH num_groups = cfg.MODEL.RESNETS.NUM_GROUPS width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP bottleneck_channels = num_groups * width_per_group in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS # fmt: on assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) num_blocks_per_stage = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3], }[depth] if depth in [18, 34]: assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34" assert not any( deform_on_per_stage ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34" assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34" assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34" stages = [] for idx, stage_idx in enumerate(range(2, 6)): # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper dilation = res5_dilation if stage_idx == 5 else 1 first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 stage_kargs = { "num_blocks": num_blocks_per_stage[idx], "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1), "in_channels": in_channels, "out_channels": out_channels, "norm": norm, } # Use BasicBlock for R18 and R34. if depth in [18, 34]: stage_kargs["block_class"] = BasicBlock else: stage_kargs["bottleneck_channels"] = bottleneck_channels stage_kargs["stride_in_1x1"] = stride_in_1x1 stage_kargs["dilation"] = dilation stage_kargs["num_groups"] = num_groups if deform_on_per_stage[idx]: stage_kargs["block_class"] = DeformBottleneckBlock stage_kargs["deform_modulated"] = deform_modulated stage_kargs["deform_num_groups"] = deform_num_groups else: stage_kargs["block_class"] = BottleneckBlock blocks = ResNet.make_stage(**stage_kargs) in_channels = out_channels out_channels *= 2 bottleneck_channels *= 2 stages.append(blocks) return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at) ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/swin.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved """ Implementation of Swin models from :paper:`swin`. This code is adapted from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py with minimal modifications. # noqa -------------------------------------------------------- Swin Transformer Copyright (c) 2021 Microsoft Licensed under The MIT License [see LICENSE for details] Written by Ze Liu, Yutong Lin, Yixuan Wei -------------------------------------------------------- LICENSE: https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/461e003166a8083d0b620beacd4662a2df306bd6/LICENSE """ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint from annotator.oneformer.detectron2.modeling.backbone.backbone import Backbone _to_2tuple = nn.modules.utils._ntuple(2) class Mlp(nn.Module): """Multilayer perceptron.""" def __init__( self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x def window_partition(x, window_size): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows def window_reverse(windows, window_size, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x class WindowAttention(nn.Module): """Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. Args: dim (int): Number of input channels. window_size (tuple[int]): The height and width of the window. num_heads (int): Number of attention heads. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop (float, optional): Dropout ratio of output. Default: 0.0 """ def __init__( self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0.0, proj_drop=0.0, ): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer("relative_position_index", relative_position_index) self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02) self.softmax = nn.Softmax(dim=-1) def forward(self, x, mask=None): """Forward function. Args: x: input features with shape of (num_windows*B, N, C) mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape qkv = ( self.qkv(x) .reshape(B_, N, 3, self.num_heads, C // self.num_heads) .permute(2, 0, 3, 1, 4) ) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = q @ k.transpose(-2, -1) relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1) ].view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 ) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute( 2, 0, 1 ).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) attn = attn.view(-1, self.num_heads, N, N) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B_, N, C) x = self.proj(x) x = self.proj_drop(x) return x class SwinTransformerBlock(nn.Module): """Swin Transformer Block. Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): Window size. shift_size (int): Shift size for SW-MSA. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Module, optional): Activation layer. Default: nn.GELU norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__( self, dim, num_heads, window_size=7, shift_size=0, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, act_layer=nn.GELU, norm_layer=nn.LayerNorm, ): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=_to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, ) if drop_path > 0.0: from timm.models.layers import DropPath self.drop_path = DropPath(drop_path) else: self.drop_path = nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop ) self.H = None self.W = None def forward(self, x, mask_matrix): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. mask_matrix: Attention mask for cyclic shift. """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.view(B, H, W, C) # pad feature maps to multiples of window size pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) _, Hp, Wp, _ = x.shape # cyclic shift if self.shift_size > 0: shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows x_windows = window_partition( shifted_x, self.window_size ) # nW*B, window_size, window_size, C x_windows = x_windows.view( -1, self.window_size * self.window_size, C ) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :].contiguous() x = x.view(B, H * W, C) # FFN x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class PatchMerging(nn.Module): """Patch Merging Layer Args: dim (int): Number of input channels. norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) self.norm = norm_layer(4 * dim) def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. """ B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.view(B, H, W, C) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Module): """A basic Swin Transformer layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. num_heads (int): Number of attention head. window_size (int): Local window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, dim, depth, num_heads, window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, ): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.ModuleList( [ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer, ) for i in range(depth) ] ) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. """ # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 h_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) w_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition( img_mask, self.window_size ) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( attn_mask == 0, float(0.0) ) for blk in self.blocks: blk.H, blk.W = H, W if self.use_checkpoint: x = checkpoint.checkpoint(blk, x, attn_mask) else: x = blk(x, attn_mask) if self.downsample is not None: x_down = self.downsample(x, H, W) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Module): """Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Module, optional): Normalization layer. Default: None """ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): super().__init__() patch_size = _to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): """Forward function.""" # padding _, _, H, W = x.size() if W % self.patch_size[1] != 0: x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) if H % self.patch_size[0] != 0: x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) x = self.proj(x) # B C Wh Ww if self.norm is not None: Wh, Ww = x.size(2), x.size(3) x = x.flatten(2).transpose(1, 2) x = self.norm(x) x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) return x class SwinTransformer(Backbone): """Swin Transformer backbone. A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - https://arxiv.org/pdf/2103.14030 Args: pretrain_img_size (int): Input image size for training the pretrained model, used in absolute postion embedding. Default 224. patch_size (int | tuple(int)): Patch size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. depths (tuple[int]): Depths of each Swin Transformer stage. num_heads (tuple[int]): Number of attention head of each stage. window_size (int): Window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. drop_rate (float): Dropout rate. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Default: 0.2. norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. patch_norm (bool): If True, add normalization after patch embedding. Default: True. out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, pretrain_img_size=224, patch_size=4, in_chans=3, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, out_indices=(0, 1, 2, 3), frozen_stages=-1, use_checkpoint=False, ): super().__init__() self.pretrain_img_size = pretrain_img_size self.num_layers = len(depths) self.embed_dim = embed_dim self.ape = ape self.patch_norm = patch_norm self.out_indices = out_indices self.frozen_stages = frozen_stages # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None, ) # absolute position embedding if self.ape: pretrain_img_size = _to_2tuple(pretrain_img_size) patch_size = _to_2tuple(patch_size) patches_resolution = [ pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1], ] self.absolute_pos_embed = nn.Parameter( torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) ) nn.init.trunc_normal_(self.absolute_pos_embed, std=0.02) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) ] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2**i_layer), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint, ) self.layers.append(layer) num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f"norm{i_layer}" self.add_module(layer_name, layer) self._freeze_stages() self._out_features = ["p{}".format(i) for i in self.out_indices] self._out_feature_channels = { "p{}".format(i): self.embed_dim * 2**i for i in self.out_indices } self._out_feature_strides = {"p{}".format(i): 2 ** (i + 2) for i in self.out_indices} self._size_devisibility = 32 self.apply(self._init_weights) def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.frozen_stages >= 1 and self.ape: self.absolute_pos_embed.requires_grad = False if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.requires_grad = False def _init_weights(self, m): if isinstance(m, nn.Linear): nn.init.trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) @property def size_divisibility(self): return self._size_divisibility def forward(self, x): """Forward function.""" x = self.patch_embed(x) Wh, Ww = x.size(2), x.size(3) if self.ape: # interpolate the position embedding to the corresponding size absolute_pos_embed = F.interpolate( self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" ) x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C else: x = x.flatten(2).transpose(1, 2) x = self.pos_drop(x) outs = {} for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs["p{}".format(i)] = out return outs ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved import math import torch import torch.nn as nn import torch.nn.functional as F __all__ = [ "window_partition", "window_unpartition", "add_decomposed_rel_pos", "get_abs_pos", "PatchEmbed", ] def window_partition(x, window_size): """ Partition into non-overlapping windows with padding if needed. Args: x (tensor): input tokens with [B, H, W, C]. window_size (int): window size. Returns: windows: windows after partition with [B * num_windows, window_size, window_size, C]. (Hp, Wp): padded height and width before partition """ B, H, W, C = x.shape pad_h = (window_size - H % window_size) % window_size pad_w = (window_size - W % window_size) % window_size if pad_h > 0 or pad_w > 0: x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) Hp, Wp = H + pad_h, W + pad_w x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows, (Hp, Wp) def window_unpartition(windows, window_size, pad_hw, hw): """ Window unpartition into original sequences and removing padding. Args: x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. window_size (int): window size. pad_hw (Tuple): padded height and width (Hp, Wp). hw (Tuple): original height and width (H, W) before padding. Returns: x: unpartitioned sequences with [B, H, W, C]. """ Hp, Wp = pad_hw H, W = hw B = windows.shape[0] // (Hp * Wp // window_size // window_size) x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) if Hp > H or Wp > W: x = x[:, :H, :W, :].contiguous() return x def get_rel_pos(q_size, k_size, rel_pos): """ Get relative positional embeddings according to the relative positions of query and key sizes. Args: q_size (int): size of query q. k_size (int): size of key k. rel_pos (Tensor): relative position embeddings (L, C). Returns: Extracted positional embeddings according to relative positions. """ max_rel_dist = int(2 * max(q_size, k_size) - 1) # Interpolate rel pos if needed. if rel_pos.shape[0] != max_rel_dist: # Interpolate rel pos. rel_pos_resized = F.interpolate( rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), size=max_rel_dist, mode="linear", ) rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) else: rel_pos_resized = rel_pos # Scale the coords with short length if shapes for q and k are different. q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) return rel_pos_resized[relative_coords.long()] def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size): """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 Args: attn (Tensor): attention map. q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. q_size (Tuple): spatial sequence size of query q with (q_h, q_w). k_size (Tuple): spatial sequence size of key k with (k_h, k_w). Returns: attn (Tensor): attention map with added relative positional embeddings. """ q_h, q_w = q_size k_h, k_w = k_size Rh = get_rel_pos(q_h, k_h, rel_pos_h) Rw = get_rel_pos(q_w, k_w, rel_pos_w) B, _, dim = q.shape r_q = q.reshape(B, q_h, q_w, dim) rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) attn = ( attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] ).view(B, q_h * q_w, k_h * k_w) return attn def get_abs_pos(abs_pos, has_cls_token, hw): """ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the original embeddings. Args: abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. hw (Tuple): size of input image tokens. Returns: Absolute positional embeddings after processing with shape (1, H, W, C) """ h, w = hw if has_cls_token: abs_pos = abs_pos[:, 1:] xy_num = abs_pos.shape[1] size = int(math.sqrt(xy_num)) assert size * size == xy_num if size != h or size != w: new_abs_pos = F.interpolate( abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2), size=(h, w), mode="bicubic", align_corners=False, ) return new_abs_pos.permute(0, 2, 3, 1) else: return abs_pos.reshape(1, h, w, -1) class PatchEmbed(nn.Module): """ Image to Patch Embedding. """ def __init__( self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768 ): """ Args: kernel_size (Tuple): kernel size of the projection layer. stride (Tuple): stride of the projection layer. padding (Tuple): padding size of the projection layer. in_chans (int): Number of input image channels. embed_dim (int): embed_dim (int): Patch embedding dimension. """ super().__init__() self.proj = nn.Conv2d( in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding ) def forward(self, x): x = self.proj(x) # B C H W -> B H W C x = x.permute(0, 2, 3, 1) return x ================================================ FILE: annotator/oneformer/detectron2/modeling/backbone/vit.py ================================================ import logging import math import fvcore.nn.weight_init as weight_init import torch import torch.nn as nn from annotator.oneformer.detectron2.layers import CNNBlockBase, Conv2d, get_norm from annotator.oneformer.detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous from .backbone import Backbone from .utils import ( PatchEmbed, add_decomposed_rel_pos, get_abs_pos, window_partition, window_unpartition, ) logger = logging.getLogger(__name__) __all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"] class Attention(nn.Module): """Multi-head Attention block with relative position embeddings.""" def __init__( self, dim, num_heads=8, qkv_bias=True, use_rel_pos=False, rel_pos_zero_init=True, input_size=None, ): """ Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. qkv_bias (bool: If True, add a learnable bias to query, key, value. rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. input_size (int or None): Input resolution for calculating the relative positional parameter size. """ super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = head_dim**-0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.proj = nn.Linear(dim, dim) self.use_rel_pos = use_rel_pos if self.use_rel_pos: # initialize relative positional embeddings self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) if not rel_pos_zero_init: nn.init.trunc_normal_(self.rel_pos_h, std=0.02) nn.init.trunc_normal_(self.rel_pos_w, std=0.02) def forward(self, x): B, H, W, _ = x.shape # qkv with shape (3, B, nHead, H * W, C) qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) # q, k, v with shape (B * nHead, H * W, C) q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) attn = (q * self.scale) @ k.transpose(-2, -1) if self.use_rel_pos: attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) attn = attn.softmax(dim=-1) x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) x = self.proj(x) return x class ResBottleneckBlock(CNNBlockBase): """ The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels 1x1, 3x3, 1x1. """ def __init__( self, in_channels, out_channels, bottleneck_channels, norm="LN", act_layer=nn.GELU, ): """ Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. bottleneck_channels (int): number of output channels for the 3x3 "bottleneck" conv layers. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. act_layer (callable): activation for all conv layers. """ super().__init__(in_channels, out_channels, 1) self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False) self.norm1 = get_norm(norm, bottleneck_channels) self.act1 = act_layer() self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False, ) self.norm2 = get_norm(norm, bottleneck_channels) self.act2 = act_layer() self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False) self.norm3 = get_norm(norm, out_channels) for layer in [self.conv1, self.conv2, self.conv3]: weight_init.c2_msra_fill(layer) for layer in [self.norm1, self.norm2]: layer.weight.data.fill_(1.0) layer.bias.data.zero_() # zero init last norm layer. self.norm3.weight.data.zero_() self.norm3.bias.data.zero_() def forward(self, x): out = x for layer in self.children(): out = layer(out) out = x + out return out class Block(nn.Module): """Transformer blocks with support of window attention and residual propagation blocks""" def __init__( self, dim, num_heads, mlp_ratio=4.0, qkv_bias=True, drop_path=0.0, norm_layer=nn.LayerNorm, act_layer=nn.GELU, use_rel_pos=False, rel_pos_zero_init=True, window_size=0, use_residual_block=False, input_size=None, ): """ Args: dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. If it equals 0, then not use window attention. use_residual_block (bool): If True, use a residual block after the MLP block. input_size (int or None): Input resolution for calculating the relative positional parameter size. """ super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, input_size=input_size if window_size == 0 else (window_size, window_size), ) from timm.models.layers import DropPath, Mlp self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer) self.window_size = window_size self.use_residual_block = use_residual_block if use_residual_block: # Use a residual block with bottleneck channel as dim // 2 self.residual = ResBottleneckBlock( in_channels=dim, out_channels=dim, bottleneck_channels=dim // 2, norm="LN", act_layer=act_layer, ) def forward(self, x): shortcut = x x = self.norm1(x) # Window partition if self.window_size > 0: H, W = x.shape[1], x.shape[2] x, pad_hw = window_partition(x, self.window_size) x = self.attn(x) # Reverse window partition if self.window_size > 0: x = window_unpartition(x, self.window_size, pad_hw, (H, W)) x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) if self.use_residual_block: x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) return x class ViT(Backbone): """ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. "Exploring Plain Vision Transformer Backbones for Object Detection", https://arxiv.org/abs/2203.16527 """ def __init__( self, img_size=1024, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, qkv_bias=True, drop_path_rate=0.0, norm_layer=nn.LayerNorm, act_layer=nn.GELU, use_abs_pos=True, use_rel_pos=False, rel_pos_zero_init=True, window_size=0, window_block_indexes=(), residual_block_indexes=(), use_act_checkpoint=False, pretrain_img_size=224, pretrain_use_cls_token=True, out_feature="last_feat", ): """ Args: img_size (int): Input image size. patch_size (int): Patch size. in_chans (int): Number of input image channels. embed_dim (int): Patch embedding dimension. depth (int): Depth of ViT. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path_rate (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. window_block_indexes (list): Indexes for blocks using window attention. residual_block_indexes (list): Indexes for blocks using conv propagation. use_act_checkpoint (bool): If True, use activation checkpointing. pretrain_img_size (int): input image size for pretraining models. pretrain_use_cls_token (bool): If True, pretrainig models use class token. out_feature (str): name of the feature from the last block. """ super().__init__() self.pretrain_use_cls_token = pretrain_use_cls_token self.patch_embed = PatchEmbed( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, embed_dim=embed_dim, ) if use_abs_pos: # Initialize absolute positional embedding with pretrain image size. num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size) num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim)) else: self.pos_embed = None # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] self.blocks = nn.ModuleList() for i in range(depth): block = Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=window_size if i in window_block_indexes else 0, use_residual_block=i in residual_block_indexes, input_size=(img_size // patch_size, img_size // patch_size), ) if use_act_checkpoint: # TODO: use torch.utils.checkpoint from fairscale.nn.checkpoint import checkpoint_wrapper block = checkpoint_wrapper(block) self.blocks.append(block) self._out_feature_channels = {out_feature: embed_dim} self._out_feature_strides = {out_feature: patch_size} self._out_features = [out_feature] if self.pos_embed is not None: nn.init.trunc_normal_(self.pos_embed, std=0.02) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): nn.init.trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, x): x = self.patch_embed(x) if self.pos_embed is not None: x = x + get_abs_pos( self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2]) ) for blk in self.blocks: x = blk(x) outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)} return outputs class SimpleFeaturePyramid(Backbone): """ This module implements SimpleFeaturePyramid in :paper:`vitdet`. It creates pyramid features built on top of the input feature map. """ def __init__( self, net, in_feature, out_channels, scale_factors, top_block=None, norm="LN", square_pad=0, ): """ Args: net (Backbone): module representing the subnetwork backbone. Must be a subclass of :class:`Backbone`. in_feature (str): names of the input feature maps coming from the net. out_channels (int): number of channels in the output feature maps. scale_factors (list[float]): list of scaling factors to upsample or downsample the input features for creating pyramid features. top_block (nn.Module or None): if provided, an extra operation will be performed on the output of the last (smallest resolution) pyramid output, and the result will extend the result list. The top_block further downsamples the feature map. It must have an attribute "num_levels", meaning the number of extra pyramid levels added by this block, and "in_feature", which is a string representing its input feature (e.g., p5). norm (str): the normalization to use. square_pad (int): If > 0, require input images to be padded to specific square size. """ super(SimpleFeaturePyramid, self).__init__() assert isinstance(net, Backbone) self.scale_factors = scale_factors input_shapes = net.output_shape() strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors] _assert_strides_are_log2_contiguous(strides) dim = input_shapes[in_feature].channels self.stages = [] use_bias = norm == "" for idx, scale in enumerate(scale_factors): out_dim = dim if scale == 4.0: layers = [ nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2), get_norm(norm, dim // 2), nn.GELU(), nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2), ] out_dim = dim // 4 elif scale == 2.0: layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)] out_dim = dim // 2 elif scale == 1.0: layers = [] elif scale == 0.5: layers = [nn.MaxPool2d(kernel_size=2, stride=2)] else: raise NotImplementedError(f"scale_factor={scale} is not supported yet.") layers.extend( [ Conv2d( out_dim, out_channels, kernel_size=1, bias=use_bias, norm=get_norm(norm, out_channels), ), Conv2d( out_channels, out_channels, kernel_size=3, padding=1, bias=use_bias, norm=get_norm(norm, out_channels), ), ] ) layers = nn.Sequential(*layers) stage = int(math.log2(strides[idx])) self.add_module(f"simfp_{stage}", layers) self.stages.append(layers) self.net = net self.in_feature = in_feature self.top_block = top_block # Return feature names are "p", like ["p2", "p3", ..., "p6"] self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} # top block output feature maps. if self.top_block is not None: for s in range(stage, stage + self.top_block.num_levels): self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) self._out_features = list(self._out_feature_strides.keys()) self._out_feature_channels = {k: out_channels for k in self._out_features} self._size_divisibility = strides[-1] self._square_pad = square_pad @property def padding_constraints(self): return { "size_divisiblity": self._size_divisibility, "square_size": self._square_pad, } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: mapping from feature map name to pyramid feature map tensor in high to low resolution order. Returned feature names follow the FPN convention: "p", where stage has stride = 2 ** stage e.g., ["p2", "p3", ..., "p6"]. """ bottom_up_features = self.net(x) features = bottom_up_features[self.in_feature] results = [] for stage in self.stages: results.append(stage(features)) if self.top_block is not None: if self.top_block.in_feature in bottom_up_features: top_block_in_feature = bottom_up_features[self.top_block.in_feature] else: top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] results.extend(self.top_block(top_block_in_feature)) assert len(self._out_features) == len(results) return {f: res for f, res in zip(self._out_features, results)} def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): """ Calculate lr decay rate for different ViT blocks. Args: name (string): parameter name. lr_decay_rate (float): base lr decay rate. num_layers (int): number of ViT blocks. Returns: lr decay rate for the given parameter. """ layer_id = num_layers + 1 if name.startswith("backbone"): if ".pos_embed" in name or ".patch_embed" in name: layer_id = 0 elif ".blocks." in name and ".residual." not in name: layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 return lr_decay_rate ** (num_layers + 1 - layer_id) ================================================ FILE: annotator/oneformer/detectron2/modeling/box_regression.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import math from typing import List, Tuple, Union import torch from fvcore.nn import giou_loss, smooth_l1_loss from torch.nn import functional as F from annotator.oneformer.detectron2.layers import cat, ciou_loss, diou_loss from annotator.oneformer.detectron2.structures import Boxes # Value for clamping large dw and dh predictions. The heuristic is that we clamp # such that dw and dh are no larger than what would transform a 16px box into a # 1000px box (based on a small anchor, 16px, and a typical image size, 1000px). _DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16) __all__ = ["Box2BoxTransform", "Box2BoxTransformRotated", "Box2BoxTransformLinear"] @torch.jit.script class Box2BoxTransform(object): """ The box-to-box transform defined in R-CNN. The transformation is parameterized by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height). """ def __init__( self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP ): """ Args: weights (4-element tuple): Scaling factors that are applied to the (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set such that the deltas have unit variance; now they are treated as hyperparameters of the system. scale_clamp (float): When predicting deltas, the predicted box scaling factors (dw and dh) are clamped such that they are <= scale_clamp. """ self.weights = weights self.scale_clamp = scale_clamp def get_deltas(self, src_boxes, target_boxes): """ Get box regression transformation deltas (dx, dy, dw, dh) that can be used to transform the `src_boxes` into the `target_boxes`. That is, the relation ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless any delta is too large and is clamped). Args: src_boxes (Tensor): source boxes, e.g., object proposals target_boxes (Tensor): target of the transformation, e.g., ground-truth boxes. """ assert isinstance(src_boxes, torch.Tensor), type(src_boxes) assert isinstance(target_boxes, torch.Tensor), type(target_boxes) src_widths = src_boxes[:, 2] - src_boxes[:, 0] src_heights = src_boxes[:, 3] - src_boxes[:, 1] src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights target_widths = target_boxes[:, 2] - target_boxes[:, 0] target_heights = target_boxes[:, 3] - target_boxes[:, 1] target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights wx, wy, ww, wh = self.weights dx = wx * (target_ctr_x - src_ctr_x) / src_widths dy = wy * (target_ctr_y - src_ctr_y) / src_heights dw = ww * torch.log(target_widths / src_widths) dh = wh * torch.log(target_heights / src_heights) deltas = torch.stack((dx, dy, dw, dh), dim=1) assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!" return deltas def apply_deltas(self, deltas, boxes): """ Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`. Args: deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1. deltas[i] represents k potentially different class-specific box transformations for the single box boxes[i]. boxes (Tensor): boxes to transform, of shape (N, 4) """ deltas = deltas.float() # ensure fp32 for decoding precision boxes = boxes.to(deltas.dtype) widths = boxes[:, 2] - boxes[:, 0] heights = boxes[:, 3] - boxes[:, 1] ctr_x = boxes[:, 0] + 0.5 * widths ctr_y = boxes[:, 1] + 0.5 * heights wx, wy, ww, wh = self.weights dx = deltas[:, 0::4] / wx dy = deltas[:, 1::4] / wy dw = deltas[:, 2::4] / ww dh = deltas[:, 3::4] / wh # Prevent sending too large values into torch.exp() dw = torch.clamp(dw, max=self.scale_clamp) dh = torch.clamp(dh, max=self.scale_clamp) pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] pred_w = torch.exp(dw) * widths[:, None] pred_h = torch.exp(dh) * heights[:, None] x1 = pred_ctr_x - 0.5 * pred_w y1 = pred_ctr_y - 0.5 * pred_h x2 = pred_ctr_x + 0.5 * pred_w y2 = pred_ctr_y + 0.5 * pred_h pred_boxes = torch.stack((x1, y1, x2, y2), dim=-1) return pred_boxes.reshape(deltas.shape) @torch.jit.script class Box2BoxTransformRotated(object): """ The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height), and rotate a box's angle by da (radians). Note: angles of deltas are in radians while angles of boxes are in degrees. """ def __init__( self, weights: Tuple[float, float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP, ): """ Args: weights (5-element tuple): Scaling factors that are applied to the (dx, dy, dw, dh, da) deltas. These are treated as hyperparameters of the system. scale_clamp (float): When predicting deltas, the predicted box scaling factors (dw and dh) are clamped such that they are <= scale_clamp. """ self.weights = weights self.scale_clamp = scale_clamp def get_deltas(self, src_boxes, target_boxes): """ Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used to transform the `src_boxes` into the `target_boxes`. That is, the relation ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless any delta is too large and is clamped). Args: src_boxes (Tensor): Nx5 source boxes, e.g., object proposals target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth boxes. """ assert isinstance(src_boxes, torch.Tensor), type(src_boxes) assert isinstance(target_boxes, torch.Tensor), type(target_boxes) src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1) target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind( target_boxes, dim=1 ) wx, wy, ww, wh, wa = self.weights dx = wx * (target_ctr_x - src_ctr_x) / src_widths dy = wy * (target_ctr_y - src_ctr_y) / src_heights dw = ww * torch.log(target_widths / src_widths) dh = wh * torch.log(target_heights / src_heights) # Angles of deltas are in radians while angles of boxes are in degrees. # the conversion to radians serve as a way to normalize the values da = target_angles - src_angles da = (da + 180.0) % 360.0 - 180.0 # make it in [-180, 180) da *= wa * math.pi / 180.0 deltas = torch.stack((dx, dy, dw, dh, da), dim=1) assert ( (src_widths > 0).all().item() ), "Input boxes to Box2BoxTransformRotated are not valid!" return deltas def apply_deltas(self, deltas, boxes): """ Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`. Args: deltas (Tensor): transformation deltas of shape (N, k*5). deltas[i] represents box transformation for the single box boxes[i]. boxes (Tensor): boxes to transform, of shape (N, 5) """ assert deltas.shape[1] % 5 == 0 and boxes.shape[1] == 5 boxes = boxes.to(deltas.dtype).unsqueeze(2) ctr_x = boxes[:, 0] ctr_y = boxes[:, 1] widths = boxes[:, 2] heights = boxes[:, 3] angles = boxes[:, 4] wx, wy, ww, wh, wa = self.weights dx = deltas[:, 0::5] / wx dy = deltas[:, 1::5] / wy dw = deltas[:, 2::5] / ww dh = deltas[:, 3::5] / wh da = deltas[:, 4::5] / wa # Prevent sending too large values into torch.exp() dw = torch.clamp(dw, max=self.scale_clamp) dh = torch.clamp(dh, max=self.scale_clamp) pred_boxes = torch.zeros_like(deltas) pred_boxes[:, 0::5] = dx * widths + ctr_x # x_ctr pred_boxes[:, 1::5] = dy * heights + ctr_y # y_ctr pred_boxes[:, 2::5] = torch.exp(dw) * widths # width pred_boxes[:, 3::5] = torch.exp(dh) * heights # height # Following original RRPN implementation, # angles of deltas are in radians while angles of boxes are in degrees. pred_angle = da * 180.0 / math.pi + angles pred_angle = (pred_angle + 180.0) % 360.0 - 180.0 # make it in [-180, 180) pred_boxes[:, 4::5] = pred_angle return pred_boxes class Box2BoxTransformLinear(object): """ The linear box-to-box transform defined in FCOS. The transformation is parameterized by the distance from the center of (square) src box to 4 edges of the target box. """ def __init__(self, normalize_by_size=True): """ Args: normalize_by_size: normalize deltas by the size of src (anchor) boxes. """ self.normalize_by_size = normalize_by_size def get_deltas(self, src_boxes, target_boxes): """ Get box regression transformation deltas (dx1, dy1, dx2, dy2) that can be used to transform the `src_boxes` into the `target_boxes`. That is, the relation ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true. The center of src must be inside target boxes. Args: src_boxes (Tensor): square source boxes, e.g., anchors target_boxes (Tensor): target of the transformation, e.g., ground-truth boxes. """ assert isinstance(src_boxes, torch.Tensor), type(src_boxes) assert isinstance(target_boxes, torch.Tensor), type(target_boxes) src_ctr_x = 0.5 * (src_boxes[:, 0] + src_boxes[:, 2]) src_ctr_y = 0.5 * (src_boxes[:, 1] + src_boxes[:, 3]) target_l = src_ctr_x - target_boxes[:, 0] target_t = src_ctr_y - target_boxes[:, 1] target_r = target_boxes[:, 2] - src_ctr_x target_b = target_boxes[:, 3] - src_ctr_y deltas = torch.stack((target_l, target_t, target_r, target_b), dim=1) if self.normalize_by_size: stride_w = src_boxes[:, 2] - src_boxes[:, 0] stride_h = src_boxes[:, 3] - src_boxes[:, 1] strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1) deltas = deltas / strides return deltas def apply_deltas(self, deltas, boxes): """ Apply transformation `deltas` (dx1, dy1, dx2, dy2) to `boxes`. Args: deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1. deltas[i] represents k potentially different class-specific box transformations for the single box boxes[i]. boxes (Tensor): boxes to transform, of shape (N, 4) """ # Ensure the output is a valid box. See Sec 2.1 of https://arxiv.org/abs/2006.09214 deltas = F.relu(deltas) boxes = boxes.to(deltas.dtype) ctr_x = 0.5 * (boxes[:, 0] + boxes[:, 2]) ctr_y = 0.5 * (boxes[:, 1] + boxes[:, 3]) if self.normalize_by_size: stride_w = boxes[:, 2] - boxes[:, 0] stride_h = boxes[:, 3] - boxes[:, 1] strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1) deltas = deltas * strides l = deltas[:, 0::4] t = deltas[:, 1::4] r = deltas[:, 2::4] b = deltas[:, 3::4] pred_boxes = torch.zeros_like(deltas) pred_boxes[:, 0::4] = ctr_x[:, None] - l # x1 pred_boxes[:, 1::4] = ctr_y[:, None] - t # y1 pred_boxes[:, 2::4] = ctr_x[:, None] + r # x2 pred_boxes[:, 3::4] = ctr_y[:, None] + b # y2 return pred_boxes def _dense_box_regression_loss( anchors: List[Union[Boxes, torch.Tensor]], box2box_transform: Box2BoxTransform, pred_anchor_deltas: List[torch.Tensor], gt_boxes: List[torch.Tensor], fg_mask: torch.Tensor, box_reg_loss_type="smooth_l1", smooth_l1_beta=0.0, ): """ Compute loss for dense multi-level box regression. Loss is accumulated over ``fg_mask``. Args: anchors: #lvl anchor boxes, each is (HixWixA, 4) pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4) gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A)) fg_mask: the foreground boolean mask of shape (N, R) to compute loss on box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou", "diou", "ciou". smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1" """ if isinstance(anchors[0], Boxes): anchors = type(anchors[0]).cat(anchors).tensor # (R, 4) else: anchors = cat(anchors) if box_reg_loss_type == "smooth_l1": gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes] gt_anchor_deltas = torch.stack(gt_anchor_deltas) # (N, R, 4) loss_box_reg = smooth_l1_loss( cat(pred_anchor_deltas, dim=1)[fg_mask], gt_anchor_deltas[fg_mask], beta=smooth_l1_beta, reduction="sum", ) elif box_reg_loss_type == "giou": pred_boxes = [ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1) ] loss_box_reg = giou_loss( torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum" ) elif box_reg_loss_type == "diou": pred_boxes = [ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1) ] loss_box_reg = diou_loss( torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum" ) elif box_reg_loss_type == "ciou": pred_boxes = [ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1) ] loss_box_reg = ciou_loss( torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum" ) else: raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'") return loss_box_reg ================================================ FILE: annotator/oneformer/detectron2/modeling/matcher.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from typing import List import torch from annotator.oneformer.detectron2.layers import nonzero_tuple # TODO: the name is too general class Matcher(object): """ This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements. The matching is determined by the MxN match_quality_matrix, that characterizes how well each (ground-truth, prediction)-pair match each other. For example, if the elements are boxes, this matrix may contain box intersection-over-union overlap values. The matcher returns (a) a vector of length N containing the index of the ground-truth element m in [0, M) that matches to prediction n in [0, N). (b) a vector of length N containing the labels for each prediction. """ def __init__( self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False ): """ Args: thresholds (list): a list of thresholds used to stratify predictions into levels. labels (list): a list of values to label predictions belonging at each level. A label can be one of {-1, 0, 1} signifying {ignore, negative class, positive class}, respectively. allow_low_quality_matches (bool): if True, produce additional matches for predictions with maximum match quality lower than high_threshold. See set_low_quality_matches_ for more details. For example, thresholds = [0.3, 0.5] labels = [0, -1, 1] All predictions with iou < 0.3 will be marked with 0 and thus will be considered as false positives while training. All predictions with 0.3 <= iou < 0.5 will be marked with -1 and thus will be ignored. All predictions with 0.5 <= iou will be marked with 1 and thus will be considered as true positives. """ # Add -inf and +inf to first and last position in thresholds thresholds = thresholds[:] assert thresholds[0] > 0 thresholds.insert(0, -float("inf")) thresholds.append(float("inf")) # Currently torchscript does not support all + generator assert all([low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])]) assert all([l in [-1, 0, 1] for l in labels]) assert len(labels) == len(thresholds) - 1 self.thresholds = thresholds self.labels = labels self.allow_low_quality_matches = allow_low_quality_matches def __call__(self, match_quality_matrix): """ Args: match_quality_matrix (Tensor[float]): an MxN tensor, containing the pairwise quality between M ground-truth elements and N predicted elements. All elements must be >= 0 (due to the us of `torch.nonzero` for selecting indices in :meth:`set_low_quality_matches_`). Returns: matches (Tensor[int64]): a vector of length N, where matches[i] is a matched ground-truth index in [0, M) match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates whether a prediction is a true or false positive or ignored """ assert match_quality_matrix.dim() == 2 if match_quality_matrix.numel() == 0: default_matches = match_quality_matrix.new_full( (match_quality_matrix.size(1),), 0, dtype=torch.int64 ) # When no gt boxes exist, we define IOU = 0 and therefore set labels # to `self.labels[0]`, which usually defaults to background class 0 # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds default_match_labels = match_quality_matrix.new_full( (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8 ) return default_matches, default_match_labels assert torch.all(match_quality_matrix >= 0) # match_quality_matrix is M (gt) x N (predicted) # Max over gt elements (dim 0) to find best gt candidate for each prediction matched_vals, matches = match_quality_matrix.max(dim=0) match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8) for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]): low_high = (matched_vals >= low) & (matched_vals < high) match_labels[low_high] = l if self.allow_low_quality_matches: self.set_low_quality_matches_(match_labels, match_quality_matrix) return matches, match_labels def set_low_quality_matches_(self, match_labels, match_quality_matrix): """ Produce additional matches for predictions that have only low-quality matches. Specifically, for each ground-truth G find the set of predictions that have maximum overlap with it (including ties); for each prediction in that set, if it is unmatched, then match it to the ground-truth G. This function implements the RPN assignment case (i) in Sec. 3.1.2 of :paper:`Faster R-CNN`. """ # For each gt, find the prediction with which it has highest quality highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) # Find the highest quality match available, even if it is low, including ties. # Note that the matches qualities must be positive due to the use of # `torch.nonzero`. _, pred_inds_with_highest_quality = nonzero_tuple( match_quality_matrix == highest_quality_foreach_gt[:, None] ) # If an anchor was labeled positive only due to a low-quality match # with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B. # This follows the implementation in Detectron, and is found to have no significant impact. match_labels[pred_inds_with_highest_quality] = 1 ================================================ FILE: annotator/oneformer/detectron2/modeling/meta_arch/__init__.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. from .build import META_ARCH_REGISTRY, build_model # isort:skip from .panoptic_fpn import PanopticFPN # import all the meta_arch, so they will be registered from .rcnn import GeneralizedRCNN, ProposalNetwork from .dense_detector import DenseDetector from .retinanet import RetinaNet from .fcos import FCOS from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head __all__ = list(globals().keys()) ================================================ FILE: annotator/oneformer/detectron2/modeling/meta_arch/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch from annotator.oneformer.detectron2.utils.logger import _log_api_usage from annotator.oneformer.detectron2.utils.registry import Registry META_ARCH_REGISTRY = Registry("META_ARCH") # noqa F401 isort:skip META_ARCH_REGISTRY.__doc__ = """ Registry for meta-architectures, i.e. the whole model. The registered object will be called with `obj(cfg)` and expected to return a `nn.Module` object. """ def build_model(cfg): """ Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``. Note that it does not load any weights from ``cfg``. """ meta_arch = cfg.MODEL.META_ARCHITECTURE model = META_ARCH_REGISTRY.get(meta_arch)(cfg) _log_api_usage("modeling.meta_arch." + meta_arch) return model ================================================ FILE: annotator/oneformer/detectron2/modeling/meta_arch/dense_detector.py ================================================ import numpy as np from typing import Dict, List, Optional, Tuple import torch from torch import Tensor, nn from annotator.oneformer.detectron2.data.detection_utils import convert_image_to_rgb from annotator.oneformer.detectron2.layers import move_device_like from annotator.oneformer.detectron2.modeling import Backbone from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances from annotator.oneformer.detectron2.utils.events import get_event_storage from ..postprocessing import detector_postprocess def permute_to_N_HWA_K(tensor, K: int): """ Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K) """ assert tensor.dim() == 4, tensor.shape N, _, H, W = tensor.shape tensor = tensor.view(N, -1, K, H, W) tensor = tensor.permute(0, 3, 4, 1, 2) tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K) return tensor class DenseDetector(nn.Module): """ Base class for dense detector. We define a dense detector as a fully-convolutional model that makes per-pixel (i.e. dense) predictions. """ def __init__( self, backbone: Backbone, head: nn.Module, head_in_features: Optional[List[str]] = None, *, pixel_mean, pixel_std, ): """ Args: backbone: backbone module head: head module head_in_features: backbone features to use in head. Default to all backbone features. pixel_mean (Tuple[float]): Values to be used for image normalization (BGR order). To train on images of different number of channels, set different mean & std. Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675] pixel_std (Tuple[float]): When using pre-trained models in Detectron1 or any MSRA models, std has been absorbed into its conv1 weights, so the std needs to be set 1. Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std) """ super().__init__() self.backbone = backbone self.head = head if head_in_features is None: shapes = self.backbone.output_shape() self.head_in_features = sorted(shapes.keys(), key=lambda x: shapes[x].stride) else: self.head_in_features = head_in_features self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) @property def device(self): return self.pixel_mean.device def _move_to_current_device(self, x): return move_device_like(x, self.pixel_mean) def forward(self, batched_inputs: List[Dict[str, Tensor]]): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances: Instances Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the loss. Used during training only. In inference, the standard output format, described in :doc:`/tutorials/models`. """ images = self.preprocess_image(batched_inputs) features = self.backbone(images.tensor) features = [features[f] for f in self.head_in_features] predictions = self.head(features) if self.training: assert not torch.jit.is_scripting(), "Not supported" assert "instances" in batched_inputs[0], "Instance annotations are missing in training!" gt_instances = [x["instances"].to(self.device) for x in batched_inputs] return self.forward_training(images, features, predictions, gt_instances) else: results = self.forward_inference(images, features, predictions) if torch.jit.is_scripting(): return results processed_results = [] for results_per_image, input_per_image, image_size in zip( results, batched_inputs, images.image_sizes ): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"instances": r}) return processed_results def forward_training(self, images, features, predictions, gt_instances): raise NotImplementedError() def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]): """ Normalize, pad and batch the input images. """ images = [self._move_to_current_device(x["image"]) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors( images, self.backbone.size_divisibility, padding_constraints=self.backbone.padding_constraints, ) return images def _transpose_dense_predictions( self, predictions: List[List[Tensor]], dims_per_anchor: List[int] ) -> List[List[Tensor]]: """ Transpose the dense per-level predictions. Args: predictions: a list of outputs, each is a list of per-level predictions with shape (N, Ai x K, Hi, Wi), where N is the number of images, Ai is the number of anchors per location on level i, K is the dimension of predictions per anchor. dims_per_anchor: the value of K for each predictions. e.g. 4 for box prediction, #classes for classification prediction. Returns: List[List[Tensor]]: each prediction is transposed to (N, Hi x Wi x Ai, K). """ assert len(predictions) == len(dims_per_anchor) res: List[List[Tensor]] = [] for pred, dim_per_anchor in zip(predictions, dims_per_anchor): pred = [permute_to_N_HWA_K(x, dim_per_anchor) for x in pred] res.append(pred) return res def _ema_update(self, name: str, value: float, initial_value: float, momentum: float = 0.9): """ Apply EMA update to `self.name` using `value`. This is mainly used for loss normalizer. In Detectron1, loss is normalized by number of foreground samples in the batch. When batch size is 1 per GPU, #foreground has a large variance and using it lead to lower performance. Therefore we maintain an EMA of #foreground to stabilize the normalizer. Args: name: name of the normalizer value: the new value to update initial_value: the initial value to start with momentum: momentum of EMA Returns: float: the updated EMA value """ if hasattr(self, name): old = getattr(self, name) else: old = initial_value new = old * momentum + value * (1 - momentum) setattr(self, name, new) return new def _decode_per_level_predictions( self, anchors: Boxes, pred_scores: Tensor, pred_deltas: Tensor, score_thresh: float, topk_candidates: int, image_size: Tuple[int, int], ) -> Instances: """ Decode boxes and classification predictions of one featuer level, by the following steps: 1. filter the predictions based on score threshold and top K scores. 2. transform the box regression outputs 3. return the predicted scores, classes and boxes Args: anchors: Boxes, anchor for this feature level pred_scores: HxWxA,K pred_deltas: HxWxA,4 Returns: Instances: with field "scores", "pred_boxes", "pred_classes". """ # Apply two filtering to make NMS faster. # 1. Keep boxes with confidence score higher than threshold keep_idxs = pred_scores > score_thresh pred_scores = pred_scores[keep_idxs] topk_idxs = torch.nonzero(keep_idxs) # Kx2 # 2. Keep top k top scoring boxes only topk_idxs_size = topk_idxs.shape[0] if isinstance(topk_idxs_size, Tensor): # It's a tensor in tracing num_topk = torch.clamp(topk_idxs_size, max=topk_candidates) else: num_topk = min(topk_idxs_size, topk_candidates) pred_scores, idxs = pred_scores.topk(num_topk) topk_idxs = topk_idxs[idxs] anchor_idxs, classes_idxs = topk_idxs.unbind(dim=1) pred_boxes = self.box2box_transform.apply_deltas( pred_deltas[anchor_idxs], anchors.tensor[anchor_idxs] ) return Instances( image_size, pred_boxes=Boxes(pred_boxes), scores=pred_scores, pred_classes=classes_idxs ) def _decode_multi_level_predictions( self, anchors: List[Boxes], pred_scores: List[Tensor], pred_deltas: List[Tensor], score_thresh: float, topk_candidates: int, image_size: Tuple[int, int], ) -> Instances: """ Run `_decode_per_level_predictions` for all feature levels and concat the results. """ predictions = [ self._decode_per_level_predictions( anchors_i, box_cls_i, box_reg_i, self.test_score_thresh, self.test_topk_candidates, image_size, ) # Iterate over every feature level for box_cls_i, box_reg_i, anchors_i in zip(pred_scores, pred_deltas, anchors) ] return predictions[0].cat(predictions) # 'Instances.cat' is not scriptale but this is def visualize_training(self, batched_inputs, results): """ A function used to visualize ground truth images and final network predictions. It shows ground truth bounding boxes on the original image and up to 20 predicted object bounding boxes on the original image. Args: batched_inputs (list): a list that contains input to the model. results (List[Instances]): a list of #images elements returned by forward_inference(). """ from annotator.oneformer.detectron2.utils.visualizer import Visualizer assert len(batched_inputs) == len( results ), "Cannot visualize inputs and results of different sizes" storage = get_event_storage() max_boxes = 20 image_index = 0 # only visualize a single image img = batched_inputs[image_index]["image"] img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format) v_gt = Visualizer(img, None) v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes) anno_img = v_gt.get_image() processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1]) predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy() v_pred = Visualizer(img, None) v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes]) prop_img = v_pred.get_image() vis_img = np.vstack((anno_img, prop_img)) vis_img = vis_img.transpose(2, 0, 1) vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results" storage.put_image(vis_name, vis_img) ================================================ FILE: annotator/oneformer/detectron2/modeling/meta_arch/fcos.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging from typing import List, Optional, Tuple import torch from fvcore.nn import sigmoid_focal_loss_jit from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_point_box_distance from annotator.oneformer.detectron2.utils.events import get_event_storage from ..anchor_generator import DefaultAnchorGenerator from ..backbone import Backbone from ..box_regression import Box2BoxTransformLinear, _dense_box_regression_loss from .dense_detector import DenseDetector from .retinanet import RetinaNetHead __all__ = ["FCOS"] logger = logging.getLogger(__name__) class FCOS(DenseDetector): """ Implement FCOS in :paper:`fcos`. """ def __init__( self, *, backbone: Backbone, head: nn.Module, head_in_features: Optional[List[str]] = None, box2box_transform=None, num_classes, center_sampling_radius: float = 1.5, focal_loss_alpha=0.25, focal_loss_gamma=2.0, test_score_thresh=0.2, test_topk_candidates=1000, test_nms_thresh=0.6, max_detections_per_image=100, pixel_mean, pixel_std, ): """ Args: center_sampling_radius: radius of the "center" of a groundtruth box, within which all anchor points are labeled positive. Other arguments mean the same as in :class:`RetinaNet`. """ super().__init__( backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std ) self.num_classes = num_classes # FCOS uses one anchor point per location. # We represent the anchor point by a box whose size equals the anchor stride. feature_shapes = backbone.output_shape() fpn_strides = [feature_shapes[k].stride for k in self.head_in_features] self.anchor_generator = DefaultAnchorGenerator( sizes=[[k] for k in fpn_strides], aspect_ratios=[1.0], strides=fpn_strides ) # FCOS parameterizes box regression by a linear transform, # where predictions are normalized by anchor stride (equal to anchor size). if box2box_transform is None: box2box_transform = Box2BoxTransformLinear(normalize_by_size=True) self.box2box_transform = box2box_transform self.center_sampling_radius = float(center_sampling_radius) # Loss parameters: self.focal_loss_alpha = focal_loss_alpha self.focal_loss_gamma = focal_loss_gamma # Inference parameters: self.test_score_thresh = test_score_thresh self.test_topk_candidates = test_topk_candidates self.test_nms_thresh = test_nms_thresh self.max_detections_per_image = max_detections_per_image def forward_training(self, images, features, predictions, gt_instances): # Transpose the Hi*Wi*A dimension to the middle: pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions( predictions, [self.num_classes, 4, 1] ) anchors = self.anchor_generator(features) gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances) return self.losses( anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness ) @torch.no_grad() def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]): """ Match ground-truth boxes to a set of multi-level anchors. Args: gt_boxes: Ground-truth boxes from instances of an image. anchors: List of anchors for each feature map (of different scales). Returns: torch.Tensor A tensor of shape `(M, R)`, given `M` ground-truth boxes and total `R` anchor points from all feature levels, indicating the quality of match between m-th box and r-th anchor. Higher value indicates better match. """ # Naming convention: (M = ground-truth boxes, R = anchor points) # Anchor points are represented as square boxes of size = stride. num_anchors_per_level = [len(x) for x in anchors] anchors = Boxes.cat(anchors) # (R, 4) anchor_centers = anchors.get_centers() # (R, 2) anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0] # (R, ) lower_bound = anchor_sizes * 4 lower_bound[: num_anchors_per_level[0]] = 0 upper_bound = anchor_sizes * 8 upper_bound[-num_anchors_per_level[-1] :] = float("inf") gt_centers = gt_boxes.get_centers() # FCOS with center sampling: anchor point must be close enough to # ground-truth box center. center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_() sampling_regions = self.center_sampling_radius * anchor_sizes[None, :] match_quality_matrix = center_dists.max(dim=2).values < sampling_regions pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes) pairwise_dist = pairwise_dist.permute(1, 0, 2) # (M, R, 4) # The original FCOS anchor matching rule: anchor point must be inside GT. match_quality_matrix &= pairwise_dist.min(dim=2).values > 0 # Multilevel anchor matching in FCOS: each anchor is only responsible # for certain scale range. pairwise_dist = pairwise_dist.max(dim=2).values match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & ( pairwise_dist < upper_bound[None, :] ) # Match the GT box with minimum area, if there are multiple GT matches. gt_areas = gt_boxes.area() # (M, ) match_quality_matrix = match_quality_matrix.to(torch.float32) match_quality_matrix *= 1e8 - gt_areas[:, None] return match_quality_matrix # (M, R) @torch.no_grad() def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]): """ Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS anchor matching rule. Unlike RetinaNet, there are no ignored anchors. """ gt_labels, matched_gt_boxes = [], [] for inst in gt_instances: if len(inst) > 0: match_quality_matrix = self._match_anchors(inst.gt_boxes, anchors) # Find matched ground-truth box per anchor. Un-matched anchors are # assigned -1. This is equivalent to using an anchor matcher as used # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])` match_quality, matched_idxs = match_quality_matrix.max(dim=0) matched_idxs[match_quality < 1e-5] = -1 matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(min=0)] gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)] # Anchors with matched_idxs = -1 are labeled background. gt_labels_i[matched_idxs < 0] = self.num_classes else: matched_gt_boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor) gt_labels_i = torch.full( (len(matched_gt_boxes_i),), fill_value=self.num_classes, dtype=torch.long, device=matched_gt_boxes_i.device, ) gt_labels.append(gt_labels_i) matched_gt_boxes.append(matched_gt_boxes_i) return gt_labels, matched_gt_boxes def losses( self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness ): """ This method is almost identical to :meth:`RetinaNet.losses`, with an extra "loss_centerness" in the returned dict. """ num_images = len(gt_labels) gt_labels = torch.stack(gt_labels) # (M, R) pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes) num_pos_anchors = pos_mask.sum().item() get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images) normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 300) # classification and regression loss gt_labels_target = F.one_hot(gt_labels, num_classes=self.num_classes + 1)[ :, :, :-1 ] # no loss for the last (background) class loss_cls = sigmoid_focal_loss_jit( torch.cat(pred_logits, dim=1), gt_labels_target.to(pred_logits[0].dtype), alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction="sum", ) loss_box_reg = _dense_box_regression_loss( anchors, self.box2box_transform, pred_anchor_deltas, gt_boxes, pos_mask, box_reg_loss_type="giou", ) ctrness_targets = self.compute_ctrness_targets(anchors, gt_boxes) # (M, R) pred_centerness = torch.cat(pred_centerness, dim=1).squeeze(dim=2) # (M, R) ctrness_loss = F.binary_cross_entropy_with_logits( pred_centerness[pos_mask], ctrness_targets[pos_mask], reduction="sum" ) return { "loss_fcos_cls": loss_cls / normalizer, "loss_fcos_loc": loss_box_reg / normalizer, "loss_fcos_ctr": ctrness_loss / normalizer, } def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]): anchors = Boxes.cat(anchors).tensor # Rx4 reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes] reg_targets = torch.stack(reg_targets, dim=0) # NxRx4 if len(reg_targets) == 0: return reg_targets.new_zeros(len(reg_targets)) left_right = reg_targets[:, :, [0, 2]] top_bottom = reg_targets[:, :, [1, 3]] ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * ( top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0] ) return torch.sqrt(ctrness) def forward_inference( self, images: ImageList, features: List[torch.Tensor], predictions: List[List[torch.Tensor]], ): pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions( predictions, [self.num_classes, 4, 1] ) anchors = self.anchor_generator(features) results: List[Instances] = [] for img_idx, image_size in enumerate(images.image_sizes): scores_per_image = [ # Multiply and sqrt centerness & classification scores # (See eqn. 4 in https://arxiv.org/abs/2006.09214) torch.sqrt(x[img_idx].sigmoid_() * y[img_idx].sigmoid_()) for x, y in zip(pred_logits, pred_centerness) ] deltas_per_image = [x[img_idx] for x in pred_anchor_deltas] results_per_image = self.inference_single_image( anchors, scores_per_image, deltas_per_image, image_size ) results.append(results_per_image) return results def inference_single_image( self, anchors: List[Boxes], box_cls: List[torch.Tensor], box_delta: List[torch.Tensor], image_size: Tuple[int, int], ): """ Identical to :meth:`RetinaNet.inference_single_image. """ pred = self._decode_multi_level_predictions( anchors, box_cls, box_delta, self.test_score_thresh, self.test_topk_candidates, image_size, ) keep = batched_nms( pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh ) return pred[keep[: self.max_detections_per_image]] class FCOSHead(RetinaNetHead): """ The head used in :paper:`fcos`. It adds an additional centerness prediction branch on top of :class:`RetinaNetHead`. """ def __init__(self, *, input_shape: List[ShapeSpec], conv_dims: List[int], **kwargs): super().__init__(input_shape=input_shape, conv_dims=conv_dims, num_anchors=1, **kwargs) # Unlike original FCOS, we do not add an additional learnable scale layer # because it's found to have no benefits after normalizing regression targets by stride. self._num_features = len(input_shape) self.ctrness = nn.Conv2d(conv_dims[-1], 1, kernel_size=3, stride=1, padding=1) torch.nn.init.normal_(self.ctrness.weight, std=0.01) torch.nn.init.constant_(self.ctrness.bias, 0) def forward(self, features): assert len(features) == self._num_features logits = [] bbox_reg = [] ctrness = [] for feature in features: logits.append(self.cls_score(self.cls_subnet(feature))) bbox_feature = self.bbox_subnet(feature) bbox_reg.append(self.bbox_pred(bbox_feature)) ctrness.append(self.ctrness(bbox_feature)) return logits, bbox_reg, ctrness ================================================ FILE: annotator/oneformer/detectron2/modeling/meta_arch/panoptic_fpn.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. import logging from typing import Dict, List import torch from torch import nn from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.structures import ImageList from ..postprocessing import detector_postprocess, sem_seg_postprocess from .build import META_ARCH_REGISTRY from .rcnn import GeneralizedRCNN from .semantic_seg import build_sem_seg_head __all__ = ["PanopticFPN"] @META_ARCH_REGISTRY.register() class PanopticFPN(GeneralizedRCNN): """ Implement the paper :paper:`PanopticFPN`. """ @configurable def __init__( self, *, sem_seg_head: nn.Module, combine_overlap_thresh: float = 0.5, combine_stuff_area_thresh: float = 4096, combine_instances_score_thresh: float = 0.5, **kwargs, ): """ NOTE: this interface is experimental. Args: sem_seg_head: a module for the semantic segmentation head. combine_overlap_thresh: combine masks into one instances if they have enough overlap combine_stuff_area_thresh: ignore stuff areas smaller than this threshold combine_instances_score_thresh: ignore instances whose score is smaller than this threshold Other arguments are the same as :class:`GeneralizedRCNN`. """ super().__init__(**kwargs) self.sem_seg_head = sem_seg_head # options when combining instance & semantic outputs self.combine_overlap_thresh = combine_overlap_thresh self.combine_stuff_area_thresh = combine_stuff_area_thresh self.combine_instances_score_thresh = combine_instances_score_thresh @classmethod def from_config(cls, cfg): ret = super().from_config(cfg) ret.update( { "combine_overlap_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH, "combine_stuff_area_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT, "combine_instances_score_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH, # noqa } ) ret["sem_seg_head"] = build_sem_seg_head(cfg, ret["backbone"].output_shape()) logger = logging.getLogger(__name__) if not cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED: logger.warning( "PANOPTIC_FPN.COMBINED.ENABLED is no longer used. " " model.inference(do_postprocess=) should be used to toggle postprocessing." ) if cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT != 1.0: w = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT logger.warning( "PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT should be replaced by weights on each ROI head." ) def update_weight(x): if isinstance(x, dict): return {k: v * w for k, v in x.items()} else: return x * w roi_heads = ret["roi_heads"] roi_heads.box_predictor.loss_weight = update_weight(roi_heads.box_predictor.loss_weight) roi_heads.mask_head.loss_weight = update_weight(roi_heads.mask_head.loss_weight) return ret def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "instances": Instances * "sem_seg": semantic segmentation ground truth. * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: each dict has the results for one image. The dict contains the following keys: * "instances": see :meth:`GeneralizedRCNN.forward` for its format. * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format. * "panoptic_seg": See the return value of :func:`combine_semantic_and_instance_outputs` for its format. """ if not self.training: return self.inference(batched_inputs) images = self.preprocess_image(batched_inputs) features = self.backbone(images.tensor) assert "sem_seg" in batched_inputs[0] gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs] gt_sem_seg = ImageList.from_tensors( gt_sem_seg, self.backbone.size_divisibility, self.sem_seg_head.ignore_value, self.backbone.padding_constraints, ).tensor sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg) gt_instances = [x["instances"].to(self.device) for x in batched_inputs] proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) detector_results, detector_losses = self.roi_heads( images, features, proposals, gt_instances ) losses = sem_seg_losses losses.update(proposal_losses) losses.update(detector_losses) return losses def inference(self, batched_inputs: List[Dict[str, torch.Tensor]], do_postprocess: bool = True): """ Run inference on the given inputs. Args: batched_inputs (list[dict]): same as in :meth:`forward` do_postprocess (bool): whether to apply post-processing on the outputs. Returns: When do_postprocess=True, see docs in :meth:`forward`. Otherwise, returns a (list[Instances], list[Tensor]) that contains the raw detector outputs, and raw semantic segmentation outputs. """ images = self.preprocess_image(batched_inputs) features = self.backbone(images.tensor) sem_seg_results, sem_seg_losses = self.sem_seg_head(features, None) proposals, _ = self.proposal_generator(images, features, None) detector_results, _ = self.roi_heads(images, features, proposals, None) if do_postprocess: processed_results = [] for sem_seg_result, detector_result, input_per_image, image_size in zip( sem_seg_results, detector_results, batched_inputs, images.image_sizes ): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width) detector_r = detector_postprocess(detector_result, height, width) processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r}) panoptic_r = combine_semantic_and_instance_outputs( detector_r, sem_seg_r.argmax(dim=0), self.combine_overlap_thresh, self.combine_stuff_area_thresh, self.combine_instances_score_thresh, ) processed_results[-1]["panoptic_seg"] = panoptic_r return processed_results else: return detector_results, sem_seg_results def combine_semantic_and_instance_outputs( instance_results, semantic_results, overlap_threshold, stuff_area_thresh, instances_score_thresh, ): """ Implement a simple combining logic following "combine_semantic_and_instance_predictions.py" in panopticapi to produce panoptic segmentation outputs. Args: instance_results: output of :func:`detector_postprocess`. semantic_results: an (H, W) tensor, each element is the contiguous semantic category id Returns: panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. segments_info (list[dict]): Describe each segment in `panoptic_seg`. Each dict contains keys "id", "category_id", "isthing". """ panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32) # sort instance outputs by scores sorted_inds = torch.argsort(-instance_results.scores) current_segment_id = 0 segments_info = [] instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device) # Add instances one-by-one, check for overlaps with existing ones for inst_id in sorted_inds: score = instance_results.scores[inst_id].item() if score < instances_score_thresh: break mask = instance_masks[inst_id] # H,W mask_area = mask.sum().item() if mask_area == 0: continue intersect = (mask > 0) & (panoptic_seg > 0) intersect_area = intersect.sum().item() if intersect_area * 1.0 / mask_area > overlap_threshold: continue if intersect_area > 0: mask = mask & (panoptic_seg == 0) current_segment_id += 1 panoptic_seg[mask] = current_segment_id segments_info.append( { "id": current_segment_id, "isthing": True, "score": score, "category_id": instance_results.pred_classes[inst_id].item(), "instance_id": inst_id.item(), } ) # Add semantic results to remaining empty areas semantic_labels = torch.unique(semantic_results).cpu().tolist() for semantic_label in semantic_labels: if semantic_label == 0: # 0 is a special "thing" class continue mask = (semantic_results == semantic_label) & (panoptic_seg == 0) mask_area = mask.sum().item() if mask_area < stuff_area_thresh: continue current_segment_id += 1 panoptic_seg[mask] = current_segment_id segments_info.append( { "id": current_segment_id, "isthing": False, "category_id": semantic_label, "area": mask_area, } ) return panoptic_seg, segments_info ================================================ FILE: annotator/oneformer/detectron2/modeling/meta_arch/rcnn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np from typing import Dict, List, Optional, Tuple import torch from torch import nn from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.data.detection_utils import convert_image_to_rgb from annotator.oneformer.detectron2.layers import move_device_like from annotator.oneformer.detectron2.structures import ImageList, Instances from annotator.oneformer.detectron2.utils.events import get_event_storage from annotator.oneformer.detectron2.utils.logger import log_first_n from ..backbone import Backbone, build_backbone from ..postprocessing import detector_postprocess from ..proposal_generator import build_proposal_generator from ..roi_heads import build_roi_heads from .build import META_ARCH_REGISTRY __all__ = ["GeneralizedRCNN", "ProposalNetwork"] @META_ARCH_REGISTRY.register() class GeneralizedRCNN(nn.Module): """ Generalized R-CNN. Any models that contains the following three components: 1. Per-image feature extraction (aka backbone) 2. Region proposal generation 3. Per-region feature extraction and prediction """ @configurable def __init__( self, *, backbone: Backbone, proposal_generator: nn.Module, roi_heads: nn.Module, pixel_mean: Tuple[float], pixel_std: Tuple[float], input_format: Optional[str] = None, vis_period: int = 0, ): """ Args: backbone: a backbone module, must follow detectron2's backbone interface proposal_generator: a module that generates proposals using backbone features roi_heads: a ROI head that performs per-region computation pixel_mean, pixel_std: list or tuple with #channels element, representing the per-channel mean and std to be used to normalize the input image input_format: describe the meaning of channels of input. Needed by visualization vis_period: the period to run visualization. Set to 0 to disable. """ super().__init__() self.backbone = backbone self.proposal_generator = proposal_generator self.roi_heads = roi_heads self.input_format = input_format self.vis_period = vis_period if vis_period > 0: assert input_format is not None, "input_format is required for visualization!" self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) assert ( self.pixel_mean.shape == self.pixel_std.shape ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!" @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) return { "backbone": backbone, "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), "roi_heads": build_roi_heads(cfg, backbone.output_shape()), "input_format": cfg.INPUT.FORMAT, "vis_period": cfg.VIS_PERIOD, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, } @property def device(self): return self.pixel_mean.device def _move_to_current_device(self, x): return move_device_like(x, self.pixel_mean) def visualize_training(self, batched_inputs, proposals): """ A function used to visualize images and proposals. It shows ground truth bounding boxes on the original image and up to 20 top-scoring predicted object proposals on the original image. Users can implement different visualization functions for different models. Args: batched_inputs (list): a list that contains input to the model. proposals (list): a list that contains predicted proposals. Both batched_inputs and proposals should have the same length. """ from annotator.oneformer.detectron2.utils.visualizer import Visualizer storage = get_event_storage() max_vis_prop = 20 for input, prop in zip(batched_inputs, proposals): img = input["image"] img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format) v_gt = Visualizer(img, None) v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes) anno_img = v_gt.get_image() box_size = min(len(prop.proposal_boxes), max_vis_prop) v_pred = Visualizer(img, None) v_pred = v_pred.overlay_instances( boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy() ) prop_img = v_pred.get_image() vis_img = np.concatenate((anno_img, prop_img), axis=1) vis_img = vis_img.transpose(2, 0, 1) vis_name = "Left: GT bounding boxes; Right: Predicted proposals" storage.put_image(vis_name, vis_img) break # only visualize one image in a batch def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * image: Tensor, image in (C, H, W) format. * instances (optional): groundtruth :class:`Instances` * proposals (optional): :class:`Instances`, precomputed proposals. Other information that's included in the original dicts, such as: * "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "instances" whose value is a :class:`Instances`. The :class:`Instances` object has the following keys: "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" """ if not self.training: return self.inference(batched_inputs) images = self.preprocess_image(batched_inputs) if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] else: gt_instances = None features = self.backbone(images.tensor) if self.proposal_generator is not None: proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) else: assert "proposals" in batched_inputs[0] proposals = [x["proposals"].to(self.device) for x in batched_inputs] proposal_losses = {} _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) if self.vis_period > 0: storage = get_event_storage() if storage.iter % self.vis_period == 0: self.visualize_training(batched_inputs, proposals) losses = {} losses.update(detector_losses) losses.update(proposal_losses) return losses def inference( self, batched_inputs: List[Dict[str, torch.Tensor]], detected_instances: Optional[List[Instances]] = None, do_postprocess: bool = True, ): """ Run inference on the given inputs. Args: batched_inputs (list[dict]): same as in :meth:`forward` detected_instances (None or list[Instances]): if not None, it contains an `Instances` object per image. The `Instances` object contains "pred_boxes" and "pred_classes" which are known boxes in the image. The inference will then skip the detection of bounding boxes, and only predict other per-ROI outputs. do_postprocess (bool): whether to apply post-processing on the outputs. Returns: When do_postprocess=True, same as in :meth:`forward`. Otherwise, a list[Instances] containing raw network outputs. """ assert not self.training images = self.preprocess_image(batched_inputs) features = self.backbone(images.tensor) if detected_instances is None: if self.proposal_generator is not None: proposals, _ = self.proposal_generator(images, features, None) else: assert "proposals" in batched_inputs[0] proposals = [x["proposals"].to(self.device) for x in batched_inputs] results, _ = self.roi_heads(images, features, proposals, None) else: detected_instances = [x.to(self.device) for x in detected_instances] results = self.roi_heads.forward_with_given_boxes(features, detected_instances) if do_postprocess: assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess." return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) return results def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]): """ Normalize, pad and batch the input images. """ images = [self._move_to_current_device(x["image"]) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors( images, self.backbone.size_divisibility, padding_constraints=self.backbone.padding_constraints, ) return images @staticmethod def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes): """ Rescale the output instances to the target size. """ # note: private function; subject to changes processed_results = [] for results_per_image, input_per_image, image_size in zip( instances, batched_inputs, image_sizes ): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"instances": r}) return processed_results @META_ARCH_REGISTRY.register() class ProposalNetwork(nn.Module): """ A meta architecture that only predicts object proposals. """ @configurable def __init__( self, *, backbone: Backbone, proposal_generator: nn.Module, pixel_mean: Tuple[float], pixel_std: Tuple[float], ): """ Args: backbone: a backbone module, must follow detectron2's backbone interface proposal_generator: a module that generates proposals using backbone features pixel_mean, pixel_std: list or tuple with #channels element, representing the per-channel mean and std to be used to normalize the input image """ super().__init__() self.backbone = backbone self.proposal_generator = proposal_generator self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) return { "backbone": backbone, "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, } @property def device(self): return self.pixel_mean.device def _move_to_current_device(self, x): return move_device_like(x, self.pixel_mean) def forward(self, batched_inputs): """ Args: Same as in :class:`GeneralizedRCNN.forward` Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "proposals" whose value is a :class:`Instances` with keys "proposal_boxes" and "objectness_logits". """ images = [self._move_to_current_device(x["image"]) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors( images, self.backbone.size_divisibility, padding_constraints=self.backbone.padding_constraints, ) features = self.backbone(images.tensor) if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 ) gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) # In training, the proposals are not useful at all but we generate them anyway. # This makes RPN-only models about 5% slower. if self.training: return proposal_losses processed_results = [] for results_per_image, input_per_image, image_size in zip( proposals, batched_inputs, images.image_sizes ): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"proposals": r}) return processed_results ================================================ FILE: annotator/oneformer/detectron2/modeling/meta_arch/retinanet.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import math from typing import List, Tuple import torch from fvcore.nn import sigmoid_focal_loss_jit from torch import Tensor, nn from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import CycleBatchNormList, ShapeSpec, batched_nms, cat, get_norm from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou from annotator.oneformer.detectron2.utils.events import get_event_storage from ..anchor_generator import build_anchor_generator from ..backbone import Backbone, build_backbone from ..box_regression import Box2BoxTransform, _dense_box_regression_loss from ..matcher import Matcher from .build import META_ARCH_REGISTRY from .dense_detector import DenseDetector, permute_to_N_HWA_K # noqa __all__ = ["RetinaNet"] logger = logging.getLogger(__name__) @META_ARCH_REGISTRY.register() class RetinaNet(DenseDetector): """ Implement RetinaNet in :paper:`RetinaNet`. """ @configurable def __init__( self, *, backbone: Backbone, head: nn.Module, head_in_features, anchor_generator, box2box_transform, anchor_matcher, num_classes, focal_loss_alpha=0.25, focal_loss_gamma=2.0, smooth_l1_beta=0.0, box_reg_loss_type="smooth_l1", test_score_thresh=0.05, test_topk_candidates=1000, test_nms_thresh=0.5, max_detections_per_image=100, pixel_mean, pixel_std, vis_period=0, input_format="BGR", ): """ NOTE: this interface is experimental. Args: backbone: a backbone module, must follow detectron2's backbone interface head (nn.Module): a module that predicts logits and regression deltas for each level from a list of per-level features head_in_features (Tuple[str]): Names of the input feature maps to be used in head anchor_generator (nn.Module): a module that creates anchors from a list of features. Usually an instance of :class:`AnchorGenerator` box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to instance boxes anchor_matcher (Matcher): label the anchors by matching them with ground truth. num_classes (int): number of classes. Used to label background proposals. # Loss parameters: focal_loss_alpha (float): focal_loss_alpha focal_loss_gamma (float): focal_loss_gamma smooth_l1_beta (float): smooth_l1_beta box_reg_loss_type (str): Options are "smooth_l1", "giou", "diou", "ciou" # Inference parameters: test_score_thresh (float): Inference cls score threshold, only anchors with score > INFERENCE_TH are considered for inference (to improve speed) test_topk_candidates (int): Select topk candidates before NMS test_nms_thresh (float): Overlap threshold used for non-maximum suppression (suppress boxes with IoU >= this threshold) max_detections_per_image (int): Maximum number of detections to return per image during inference (100 is based on the limit established for the COCO dataset). pixel_mean, pixel_std: see :class:`DenseDetector`. """ super().__init__( backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std ) self.num_classes = num_classes # Anchors self.anchor_generator = anchor_generator self.box2box_transform = box2box_transform self.anchor_matcher = anchor_matcher # Loss parameters: self.focal_loss_alpha = focal_loss_alpha self.focal_loss_gamma = focal_loss_gamma self.smooth_l1_beta = smooth_l1_beta self.box_reg_loss_type = box_reg_loss_type # Inference parameters: self.test_score_thresh = test_score_thresh self.test_topk_candidates = test_topk_candidates self.test_nms_thresh = test_nms_thresh self.max_detections_per_image = max_detections_per_image # Vis parameters self.vis_period = vis_period self.input_format = input_format @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) backbone_shape = backbone.output_shape() feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES] head = RetinaNetHead(cfg, feature_shapes) anchor_generator = build_anchor_generator(cfg, feature_shapes) return { "backbone": backbone, "head": head, "anchor_generator": anchor_generator, "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS), "anchor_matcher": Matcher( cfg.MODEL.RETINANET.IOU_THRESHOLDS, cfg.MODEL.RETINANET.IOU_LABELS, allow_low_quality_matches=True, ), "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES, "head_in_features": cfg.MODEL.RETINANET.IN_FEATURES, # Loss parameters: "focal_loss_alpha": cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA, "focal_loss_gamma": cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA, "smooth_l1_beta": cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA, "box_reg_loss_type": cfg.MODEL.RETINANET.BBOX_REG_LOSS_TYPE, # Inference parameters: "test_score_thresh": cfg.MODEL.RETINANET.SCORE_THRESH_TEST, "test_topk_candidates": cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST, "test_nms_thresh": cfg.MODEL.RETINANET.NMS_THRESH_TEST, "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE, # Vis parameters "vis_period": cfg.VIS_PERIOD, "input_format": cfg.INPUT.FORMAT, } def forward_training(self, images, features, predictions, gt_instances): # Transpose the Hi*Wi*A dimension to the middle: pred_logits, pred_anchor_deltas = self._transpose_dense_predictions( predictions, [self.num_classes, 4] ) anchors = self.anchor_generator(features) gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances) return self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes) def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes): """ Args: anchors (list[Boxes]): a list of #feature level Boxes gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`. Their shapes are (N, R) and (N, R, 4), respectively, where R is the total number of anchors across levels, i.e. sum(Hi x Wi x Ai) pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4). Where K is the number of classes used in `pred_logits`. Returns: dict[str, Tensor]: mapping from a named loss to a scalar tensor storing the loss. Used during training only. The dict keys are: "loss_cls" and "loss_box_reg" """ num_images = len(gt_labels) gt_labels = torch.stack(gt_labels) # (N, R) valid_mask = gt_labels >= 0 pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes) num_pos_anchors = pos_mask.sum().item() get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images) normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 100) # classification and regression loss gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[ :, :-1 ] # no loss for the last (background) class loss_cls = sigmoid_focal_loss_jit( cat(pred_logits, dim=1)[valid_mask], gt_labels_target.to(pred_logits[0].dtype), alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction="sum", ) loss_box_reg = _dense_box_regression_loss( anchors, self.box2box_transform, pred_anchor_deltas, gt_boxes, pos_mask, box_reg_loss_type=self.box_reg_loss_type, smooth_l1_beta=self.smooth_l1_beta, ) return { "loss_cls": loss_cls / normalizer, "loss_box_reg": loss_box_reg / normalizer, } @torch.no_grad() def label_anchors(self, anchors, gt_instances): """ Args: anchors (list[Boxes]): A list of #feature level Boxes. The Boxes contains anchors of this image on the specific feature level. gt_instances (list[Instances]): a list of N `Instances`s. The i-th `Instances` contains the ground-truth per-instance annotations for the i-th input image. Returns: list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is the total number of anchors across all feature maps (sum(Hi * Wi * A)). Label values are in {-1, 0, ..., K}, with -1 means ignore, and K means background. list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of anchors across feature maps. The values are the matched gt boxes for each anchor. Values are undefined for those anchors not labeled as foreground. """ anchors = Boxes.cat(anchors) # Rx4 gt_labels = [] matched_gt_boxes = [] for gt_per_image in gt_instances: match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors) matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix) del match_quality_matrix if len(gt_per_image) > 0: matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs] gt_labels_i = gt_per_image.gt_classes[matched_idxs] # Anchors with label 0 are treated as background. gt_labels_i[anchor_labels == 0] = self.num_classes # Anchors with label -1 are ignored. gt_labels_i[anchor_labels == -1] = -1 else: matched_gt_boxes_i = torch.zeros_like(anchors.tensor) gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes gt_labels.append(gt_labels_i) matched_gt_boxes.append(matched_gt_boxes_i) return gt_labels, matched_gt_boxes def forward_inference( self, images: ImageList, features: List[Tensor], predictions: List[List[Tensor]] ): pred_logits, pred_anchor_deltas = self._transpose_dense_predictions( predictions, [self.num_classes, 4] ) anchors = self.anchor_generator(features) results: List[Instances] = [] for img_idx, image_size in enumerate(images.image_sizes): scores_per_image = [x[img_idx].sigmoid_() for x in pred_logits] deltas_per_image = [x[img_idx] for x in pred_anchor_deltas] results_per_image = self.inference_single_image( anchors, scores_per_image, deltas_per_image, image_size ) results.append(results_per_image) return results def inference_single_image( self, anchors: List[Boxes], box_cls: List[Tensor], box_delta: List[Tensor], image_size: Tuple[int, int], ): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Arguments: anchors (list[Boxes]): list of #feature levels. Each entry contains a Boxes object, which contains all the anchors in that feature level. box_cls (list[Tensor]): list of #feature levels. Each entry contains tensor of size (H x W x A, K) box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. image_size (tuple(H, W)): a tuple of the image height and width. Returns: Same as `inference`, but for only one image. """ pred = self._decode_multi_level_predictions( anchors, box_cls, box_delta, self.test_score_thresh, self.test_topk_candidates, image_size, ) keep = batched_nms( # per-class NMS pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh ) return pred[keep[: self.max_detections_per_image]] class RetinaNetHead(nn.Module): """ The head used in RetinaNet for object classification and box regression. It has two subnets for the two tasks, with a common structure but separate parameters. """ @configurable def __init__( self, *, input_shape: List[ShapeSpec], num_classes, num_anchors, conv_dims: List[int], norm="", prior_prob=0.01, ): """ NOTE: this interface is experimental. Args: input_shape (List[ShapeSpec]): input shape num_classes (int): number of classes. Used to label background proposals. num_anchors (int): number of generated anchors conv_dims (List[int]): dimensions for each convolution layer norm (str or callable): Normalization for conv layers except for the two output layers. See :func:`detectron2.layers.get_norm` for supported types. prior_prob (float): Prior weight for computing bias """ super().__init__() self._num_features = len(input_shape) if norm == "BN" or norm == "SyncBN": logger.info( f"Using domain-specific {norm} in RetinaNetHead with len={self._num_features}." ) bn_class = nn.BatchNorm2d if norm == "BN" else nn.SyncBatchNorm def norm(c): return CycleBatchNormList( length=self._num_features, bn_class=bn_class, num_features=c ) else: norm_name = str(type(get_norm(norm, 32))) if "BN" in norm_name: logger.warning( f"Shared BatchNorm (type={norm_name}) may not work well in RetinaNetHead." ) cls_subnet = [] bbox_subnet = [] for in_channels, out_channels in zip( [input_shape[0].channels] + list(conv_dims), conv_dims ): cls_subnet.append( nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) ) if norm: cls_subnet.append(get_norm(norm, out_channels)) cls_subnet.append(nn.ReLU()) bbox_subnet.append( nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) ) if norm: bbox_subnet.append(get_norm(norm, out_channels)) bbox_subnet.append(nn.ReLU()) self.cls_subnet = nn.Sequential(*cls_subnet) self.bbox_subnet = nn.Sequential(*bbox_subnet) self.cls_score = nn.Conv2d( conv_dims[-1], num_anchors * num_classes, kernel_size=3, stride=1, padding=1 ) self.bbox_pred = nn.Conv2d( conv_dims[-1], num_anchors * 4, kernel_size=3, stride=1, padding=1 ) # Initialization for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]: for layer in modules.modules(): if isinstance(layer, nn.Conv2d): torch.nn.init.normal_(layer.weight, mean=0, std=0.01) torch.nn.init.constant_(layer.bias, 0) # Use prior in model initialization to improve stability bias_value = -(math.log((1 - prior_prob) / prior_prob)) torch.nn.init.constant_(self.cls_score.bias, bias_value) @classmethod def from_config(cls, cfg, input_shape: List[ShapeSpec]): num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors assert ( len(set(num_anchors)) == 1 ), "Using different number of anchors between levels is not currently supported!" num_anchors = num_anchors[0] return { "input_shape": input_shape, "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES, "conv_dims": [input_shape[0].channels] * cfg.MODEL.RETINANET.NUM_CONVS, "prior_prob": cfg.MODEL.RETINANET.PRIOR_PROB, "norm": cfg.MODEL.RETINANET.NORM, "num_anchors": num_anchors, } def forward(self, features: List[Tensor]): """ Arguments: features (list[Tensor]): FPN feature map tensors in high to low resolution. Each tensor in the list correspond to different feature levels. Returns: logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi). The tensor predicts the classification probability at each spatial position for each of the A anchors and K object classes. bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi). The tensor predicts 4-vector (dx,dy,dw,dh) box regression values for every anchor. These values are the relative offset between the anchor and the ground truth box. """ assert len(features) == self._num_features logits = [] bbox_reg = [] for feature in features: logits.append(self.cls_score(self.cls_subnet(feature))) bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature))) return logits, bbox_reg ================================================ FILE: annotator/oneformer/detectron2/modeling/meta_arch/semantic_seg.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import numpy as np from typing import Callable, Dict, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm from annotator.oneformer.detectron2.structures import ImageList from annotator.oneformer.detectron2.utils.registry import Registry from ..backbone import Backbone, build_backbone from ..postprocessing import sem_seg_postprocess from .build import META_ARCH_REGISTRY __all__ = [ "SemanticSegmentor", "SEM_SEG_HEADS_REGISTRY", "SemSegFPNHead", "build_sem_seg_head", ] SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS") SEM_SEG_HEADS_REGISTRY.__doc__ = """ Registry for semantic segmentation heads, which make semantic segmentation predictions from feature maps. """ @META_ARCH_REGISTRY.register() class SemanticSegmentor(nn.Module): """ Main class for semantic segmentation architectures. """ @configurable def __init__( self, *, backbone: Backbone, sem_seg_head: nn.Module, pixel_mean: Tuple[float], pixel_std: Tuple[float], ): """ Args: backbone: a backbone module, must follow detectron2's backbone interface sem_seg_head: a module that predicts semantic segmentation from backbone features pixel_mean, pixel_std: list or tuple with #channels element, representing the per-channel mean and std to be used to normalize the input image """ super().__init__() self.backbone = backbone self.sem_seg_head = sem_seg_head self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape()) return { "backbone": backbone, "sem_seg_head": sem_seg_head, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, } @property def device(self): return self.pixel_mean.device def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "sem_seg": semantic segmentation ground truth * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model (may be different from input resolution), used in inference. Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "sem_seg" whose value is a Tensor that represents the per-pixel segmentation prediced by the head. The prediction has shape KxHxW that represents the logits of each class for each pixel. """ images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors( images, self.backbone.size_divisibility, padding_constraints=self.backbone.padding_constraints, ) features = self.backbone(images.tensor) if "sem_seg" in batched_inputs[0]: targets = [x["sem_seg"].to(self.device) for x in batched_inputs] targets = ImageList.from_tensors( targets, self.backbone.size_divisibility, self.sem_seg_head.ignore_value, self.backbone.padding_constraints, ).tensor else: targets = None results, losses = self.sem_seg_head(features, targets) if self.training: return losses processed_results = [] for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = sem_seg_postprocess(result, image_size, height, width) processed_results.append({"sem_seg": r}) return processed_results def build_sem_seg_head(cfg, input_shape): """ Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`. """ name = cfg.MODEL.SEM_SEG_HEAD.NAME return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) @SEM_SEG_HEADS_REGISTRY.register() class SemSegFPNHead(nn.Module): """ A semantic segmentation head described in :paper:`PanopticFPN`. It takes a list of FPN features as input, and applies a sequence of 3x3 convs and upsampling to scale all of them to the stride defined by ``common_stride``. Then these features are added and used to make final predictions by another 1x1 conv layer. """ @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, num_classes: int, conv_dims: int, common_stride: int, loss_weight: float = 1.0, norm: Optional[Union[str, Callable]] = None, ignore_value: int = -1, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features num_classes: number of classes to predict conv_dims: number of output channels for the intermediate conv layers. common_stride: the common stride that all features will be upscaled to loss_weight: loss weight norm (str or callable): normalization for all conv layers ignore_value: category id to be ignored during training. """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) if not len(input_shape): raise ValueError("SemSegFPNHead(input_shape=) cannot be empty!") self.in_features = [k for k, v in input_shape] feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] self.ignore_value = ignore_value self.common_stride = common_stride self.loss_weight = loss_weight self.scale_heads = [] for in_feature, stride, channels in zip( self.in_features, feature_strides, feature_channels ): head_ops = [] head_length = max(1, int(np.log2(stride) - np.log2(self.common_stride))) for k in range(head_length): norm_module = get_norm(norm, conv_dims) conv = Conv2d( channels if k == 0 else conv_dims, conv_dims, kernel_size=3, stride=1, padding=1, bias=not norm, norm=norm_module, activation=F.relu, ) weight_init.c2_msra_fill(conv) head_ops.append(conv) if stride != self.common_stride: head_ops.append( nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) ) self.scale_heads.append(nn.Sequential(*head_ops)) self.add_module(in_feature, self.scale_heads[-1]) self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0) weight_init.c2_msra_fill(self.predictor) @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): return { "input_shape": { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES }, "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, "conv_dims": cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM, "common_stride": cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE, "norm": cfg.MODEL.SEM_SEG_HEAD.NORM, "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, } def forward(self, features, targets=None): """ Returns: In training, returns (None, dict of losses) In inference, returns (CxHxW logits, {}) """ x = self.layers(features) if self.training: return None, self.losses(x, targets) else: x = F.interpolate( x, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) return x, {} def layers(self, features): for i, f in enumerate(self.in_features): if i == 0: x = self.scale_heads[i](features[f]) else: x = x + self.scale_heads[i](features[f]) x = self.predictor(x) return x def losses(self, predictions, targets): predictions = predictions.float() # https://github.com/pytorch/pytorch/issues/48163 predictions = F.interpolate( predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False, ) loss = F.cross_entropy( predictions, targets, reduction="mean", ignore_index=self.ignore_value ) losses = {"loss_sem_seg": loss * self.loss_weight} return losses ================================================ FILE: annotator/oneformer/detectron2/modeling/mmdet_wrapper.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import itertools import logging import numpy as np from collections import OrderedDict from collections.abc import Mapping from typing import Dict, List, Optional, Tuple, Union import torch from omegaconf import DictConfig, OmegaConf from torch import Tensor, nn from annotator.oneformer.detectron2.layers import ShapeSpec from annotator.oneformer.detectron2.structures import BitMasks, Boxes, ImageList, Instances from annotator.oneformer.detectron2.utils.events import get_event_storage from .backbone import Backbone logger = logging.getLogger(__name__) def _to_container(cfg): """ mmdet will assert the type of dict/list. So convert omegaconf objects to dict/list. """ if isinstance(cfg, DictConfig): cfg = OmegaConf.to_container(cfg, resolve=True) from mmcv.utils import ConfigDict return ConfigDict(cfg) class MMDetBackbone(Backbone): """ Wrapper of mmdetection backbones to use in detectron2. mmdet backbones produce list/tuple of tensors, while detectron2 backbones produce a dict of tensors. This class wraps the given backbone to produce output in detectron2's convention, so it can be used in place of detectron2 backbones. """ def __init__( self, backbone: Union[nn.Module, Mapping], neck: Union[nn.Module, Mapping, None] = None, *, output_shapes: List[ShapeSpec], output_names: Optional[List[str]] = None, ): """ Args: backbone: either a backbone module or a mmdet config dict that defines a backbone. The backbone takes a 4D image tensor and returns a sequence of tensors. neck: either a backbone module or a mmdet config dict that defines a neck. The neck takes outputs of backbone and returns a sequence of tensors. If None, no neck is used. output_shapes: shape for every output of the backbone (or neck, if given). stride and channels are often needed. output_names: names for every output of the backbone (or neck, if given). By default, will use "out0", "out1", ... """ super().__init__() if isinstance(backbone, Mapping): from mmdet.models import build_backbone backbone = build_backbone(_to_container(backbone)) self.backbone = backbone if isinstance(neck, Mapping): from mmdet.models import build_neck neck = build_neck(_to_container(neck)) self.neck = neck # "Neck" weights, if any, are part of neck itself. This is the interface # of mmdet so we follow it. Reference: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py logger.info("Initializing mmdet backbone weights...") self.backbone.init_weights() # train() in mmdet modules is non-trivial, and has to be explicitly # called. Reference: # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py self.backbone.train() if self.neck is not None: logger.info("Initializing mmdet neck weights ...") if isinstance(self.neck, nn.Sequential): for m in self.neck: m.init_weights() else: self.neck.init_weights() self.neck.train() self._output_shapes = output_shapes if not output_names: output_names = [f"out{i}" for i in range(len(output_shapes))] self._output_names = output_names def forward(self, x) -> Dict[str, Tensor]: outs = self.backbone(x) if self.neck is not None: outs = self.neck(outs) assert isinstance( outs, (list, tuple) ), "mmdet backbone should return a list/tuple of tensors!" if len(outs) != len(self._output_shapes): raise ValueError( "Length of output_shapes does not match outputs from the mmdet backbone: " f"{len(outs)} != {len(self._output_shapes)}" ) return {k: v for k, v in zip(self._output_names, outs)} def output_shape(self) -> Dict[str, ShapeSpec]: return {k: v for k, v in zip(self._output_names, self._output_shapes)} class MMDetDetector(nn.Module): """ Wrapper of a mmdetection detector model, for detection and instance segmentation. Input/output formats of this class follow detectron2's convention, so a mmdetection model can be trained and evaluated in detectron2. """ def __init__( self, detector: Union[nn.Module, Mapping], *, # Default is 32 regardless of model: # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets size_divisibility=32, pixel_mean: Tuple[float], pixel_std: Tuple[float], ): """ Args: detector: a mmdet detector, or a mmdet config dict that defines a detector. size_divisibility: pad input images to multiple of this number pixel_mean: per-channel mean to normalize input image pixel_std: per-channel stddev to normalize input image """ super().__init__() if isinstance(detector, Mapping): from mmdet.models import build_detector detector = build_detector(_to_container(detector)) self.detector = detector self.detector.init_weights() self.size_divisibility = size_divisibility self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False) assert ( self.pixel_mean.shape == self.pixel_std.shape ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!" def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor metas = [] rescale = {"height" in x for x in batched_inputs} if len(rescale) != 1: raise ValueError("Some inputs have original height/width, but some don't!") rescale = list(rescale)[0] output_shapes = [] for input in batched_inputs: meta = {} c, h, w = input["image"].shape meta["img_shape"] = meta["ori_shape"] = (h, w, c) if rescale: scale_factor = np.array( [w / input["width"], h / input["height"]] * 2, dtype="float32" ) ori_shape = (input["height"], input["width"]) output_shapes.append(ori_shape) meta["ori_shape"] = ori_shape + (c,) else: scale_factor = 1.0 output_shapes.append((h, w)) meta["scale_factor"] = scale_factor meta["flip"] = False padh, padw = images.shape[-2:] meta["pad_shape"] = (padh, padw, c) metas.append(meta) if self.training: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] if gt_instances[0].has("gt_masks"): from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks def convert_mask(m, shape): # mmdet mask format if isinstance(m, BitMasks): return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1]) else: return mm_PolygonMasks(m.polygons, shape[0], shape[1]) gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances] losses_and_metrics = self.detector.forward_train( images, metas, [x.gt_boxes.tensor for x in gt_instances], [x.gt_classes for x in gt_instances], gt_masks=gt_masks, ) else: losses_and_metrics = self.detector.forward_train( images, metas, [x.gt_boxes.tensor for x in gt_instances], [x.gt_classes for x in gt_instances], ) return _parse_losses(losses_and_metrics) else: results = self.detector.simple_test(images, metas, rescale=rescale) results = [ {"instances": _convert_mmdet_result(r, shape)} for r, shape in zip(results, output_shapes) ] return results @property def device(self): return self.pixel_mean.device # Reference: show_result() in # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances: if isinstance(result, tuple): bbox_result, segm_result = result if isinstance(segm_result, tuple): segm_result = segm_result[0] else: bbox_result, segm_result = result, None bboxes = torch.from_numpy(np.vstack(bbox_result)) # Nx5 bboxes, scores = bboxes[:, :4], bboxes[:, -1] labels = [ torch.full((bbox.shape[0],), i, dtype=torch.int32) for i, bbox in enumerate(bbox_result) ] labels = torch.cat(labels) inst = Instances(shape) inst.pred_boxes = Boxes(bboxes) inst.scores = scores inst.pred_classes = labels if segm_result is not None and len(labels) > 0: segm_result = list(itertools.chain(*segm_result)) segm_result = [torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in segm_result] segm_result = torch.stack(segm_result, dim=0) inst.pred_masks = segm_result return inst # reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]: log_vars = OrderedDict() for loss_name, loss_value in losses.items(): if isinstance(loss_value, torch.Tensor): log_vars[loss_name] = loss_value.mean() elif isinstance(loss_value, list): log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) else: raise TypeError(f"{loss_name} is not a tensor or list of tensors") if "loss" not in loss_name: # put metrics to storage; don't return them storage = get_event_storage() value = log_vars.pop(loss_name).cpu().item() storage.put_scalar(loss_name, value) return log_vars ================================================ FILE: annotator/oneformer/detectron2/modeling/poolers.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import math from typing import List, Optional import torch from torch import nn from torchvision.ops import RoIPool from annotator.oneformer.detectron2.layers import ROIAlign, ROIAlignRotated, cat, nonzero_tuple, shapes_to_tensor from annotator.oneformer.detectron2.structures import Boxes from annotator.oneformer.detectron2.utils.tracing import assert_fx_safe, is_fx_tracing """ To export ROIPooler to torchscript, in this file, variables that should be annotated with `Union[List[Boxes], List[RotatedBoxes]]` are only annotated with `List[Boxes]`. TODO: Correct these annotations when torchscript support `Union`. https://github.com/pytorch/pytorch/issues/41412 """ __all__ = ["ROIPooler"] def assign_boxes_to_levels( box_lists: List[Boxes], min_level: int, max_level: int, canonical_box_size: int, canonical_level: int, ): """ Map each box in `box_lists` to a feature map level index and return the assignment vector. Args: box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch. min_level (int): Smallest feature map level index. The input is considered index 0, the output of stage 1 is index 1, and so. max_level (int): Largest feature map level index. canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). canonical_level (int): The feature map level index on which a canonically-sized box should be placed. Returns: A tensor of length M, where M is the total number of boxes aggregated over all N batch images. The memory layout corresponds to the concatenation of boxes from all images. Each element is the feature map index, as an offset from `self.min_level`, for the corresponding box (so value i means the box is at `self.min_level + i`). """ box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists])) # Eqn.(1) in FPN paper level_assignments = torch.floor( canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8) ) # clamp level to (min, max), in case the box size is too large or too small # for the available feature maps level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level) return level_assignments.to(torch.int64) - min_level # script the module to avoid hardcoded device type @torch.jit.script_if_tracing def _convert_boxes_to_pooler_format(boxes: torch.Tensor, sizes: torch.Tensor) -> torch.Tensor: sizes = sizes.to(device=boxes.device) indices = torch.repeat_interleave( torch.arange(len(sizes), dtype=boxes.dtype, device=boxes.device), sizes ) return cat([indices[:, None], boxes], dim=1) def convert_boxes_to_pooler_format(box_lists: List[Boxes]): """ Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops (see description under Returns). Args: box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch. Returns: When input is list[Boxes]: A tensor of shape (M, 5), where M is the total number of boxes aggregated over all N batch images. The 5 columns are (batch index, x0, y0, x1, y1), where batch index is the index in [0, N) identifying which batch image the box with corners at (x0, y0, x1, y1) comes from. When input is list[RotatedBoxes]: A tensor of shape (M, 6), where M is the total number of boxes aggregated over all N batch images. The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees), where batch index is the index in [0, N) identifying which batch image the rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from. """ boxes = torch.cat([x.tensor for x in box_lists], dim=0) # __len__ returns Tensor in tracing. sizes = shapes_to_tensor([x.__len__() for x in box_lists]) return _convert_boxes_to_pooler_format(boxes, sizes) @torch.jit.script_if_tracing def _create_zeros( batch_target: Optional[torch.Tensor], channels: int, height: int, width: int, like_tensor: torch.Tensor, ) -> torch.Tensor: batches = batch_target.shape[0] if batch_target is not None else 0 sizes = (batches, channels, height, width) return torch.zeros(sizes, dtype=like_tensor.dtype, device=like_tensor.device) class ROIPooler(nn.Module): """ Region of interest feature map pooler that supports pooling from one or more feature maps. """ def __init__( self, output_size, scales, sampling_ratio, pooler_type, canonical_box_size=224, canonical_level=4, ): """ Args: output_size (int, tuple[int] or list[int]): output size of the pooled region, e.g., 14 x 14. If tuple or list is given, the length must be 2. scales (list[float]): The scale for each low-level pooling op relative to the input image. For a feature map with stride s relative to the input image, scale is defined as 1/s. The stride must be power of 2. When there are multiple scales, they must form a pyramid, i.e. they must be a monotically decreasing geometric sequence with a factor of 1/2. sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op. pooler_type (string): Name of the type of pooling operation that should be applied. For instance, "ROIPool" or "ROIAlignV2". canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default is heuristically defined as 224 pixels in the FPN paper (based on ImageNet pre-training). canonical_level (int): The feature map level index from which a canonically-sized box should be placed. The default is defined as level 4 (stride=16) in the FPN paper, i.e., a box of size 224x224 will be placed on the feature with stride=16. The box placement for all boxes will be determined from their sizes w.r.t canonical_box_size. For example, a box whose area is 4x that of a canonical box should be used to pool features from feature level ``canonical_level+1``. Note that the actual input feature maps given to this module may not have sufficiently many levels for the input boxes. If the boxes are too large or too small for the input feature maps, the closest level will be used. """ super().__init__() if isinstance(output_size, int): output_size = (output_size, output_size) assert len(output_size) == 2 assert isinstance(output_size[0], int) and isinstance(output_size[1], int) self.output_size = output_size if pooler_type == "ROIAlign": self.level_poolers = nn.ModuleList( ROIAlign( output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False ) for scale in scales ) elif pooler_type == "ROIAlignV2": self.level_poolers = nn.ModuleList( ROIAlign( output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True ) for scale in scales ) elif pooler_type == "ROIPool": self.level_poolers = nn.ModuleList( RoIPool(output_size, spatial_scale=scale) for scale in scales ) elif pooler_type == "ROIAlignRotated": self.level_poolers = nn.ModuleList( ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio) for scale in scales ) else: raise ValueError("Unknown pooler type: {}".format(pooler_type)) # Map scale (defined as 1 / stride) to its feature map level under the # assumption that stride is a power of 2. min_level = -(math.log2(scales[0])) max_level = -(math.log2(scales[-1])) assert math.isclose(min_level, int(min_level)) and math.isclose( max_level, int(max_level) ), "Featuremap stride is not power of 2!" self.min_level = int(min_level) self.max_level = int(max_level) assert ( len(scales) == self.max_level - self.min_level + 1 ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!" assert 0 <= self.min_level and self.min_level <= self.max_level self.canonical_level = canonical_level assert canonical_box_size > 0 self.canonical_box_size = canonical_box_size def forward(self, x: List[torch.Tensor], box_lists: List[Boxes]): """ Args: x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those used to construct this module. box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch. The box coordinates are defined on the original image and will be scaled by the `scales` argument of :class:`ROIPooler`. Returns: Tensor: A tensor of shape (M, C, output_size, output_size) where M is the total number of boxes aggregated over all N batch images and C is the number of channels in `x`. """ num_level_assignments = len(self.level_poolers) if not is_fx_tracing(): torch._assert( isinstance(x, list) and isinstance(box_lists, list), "Arguments to pooler must be lists", ) assert_fx_safe( len(x) == num_level_assignments, "unequal value, num_level_assignments={}, but x is list of {} Tensors".format( num_level_assignments, len(x) ), ) assert_fx_safe( len(box_lists) == x[0].size(0), "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format( x[0].size(0), len(box_lists) ), ) if len(box_lists) == 0: return _create_zeros(None, x[0].shape[1], *self.output_size, x[0]) pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists) if num_level_assignments == 1: return self.level_poolers[0](x[0], pooler_fmt_boxes) level_assignments = assign_boxes_to_levels( box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level ) num_channels = x[0].shape[1] output_size = self.output_size[0] output = _create_zeros(pooler_fmt_boxes, num_channels, output_size, output_size, x[0]) for level, pooler in enumerate(self.level_poolers): inds = nonzero_tuple(level_assignments == level)[0] pooler_fmt_boxes_level = pooler_fmt_boxes[inds] # Use index_put_ instead of advance indexing, to avoid pytorch/issues/49852 output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level)) return output ================================================ FILE: annotator/oneformer/detectron2/modeling/postprocessing.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch from torch.nn import functional as F from annotator.oneformer.detectron2.structures import Instances, ROIMasks # perhaps should rename to "resize_instance" def detector_postprocess( results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5 ): """ Resize the output instances. The input images are often resized when entering an object detector. As a result, we often need the outputs of the detector in a different resolution from its inputs. This function will resize the raw outputs of an R-CNN detector to produce outputs according to the desired output resolution. Args: results (Instances): the raw outputs from the detector. `results.image_size` contains the input image resolution the detector sees. This object might be modified in-place. output_height, output_width: the desired output resolution. Returns: Instances: the resized output from the model, based on the output resolution """ if isinstance(output_width, torch.Tensor): # This shape might (but not necessarily) be tensors during tracing. # Converts integer tensors to float temporaries to ensure true # division is performed when computing scale_x and scale_y. output_width_tmp = output_width.float() output_height_tmp = output_height.float() new_size = torch.stack([output_height, output_width]) else: new_size = (output_height, output_width) output_width_tmp = output_width output_height_tmp = output_height scale_x, scale_y = ( output_width_tmp / results.image_size[1], output_height_tmp / results.image_size[0], ) results = Instances(new_size, **results.get_fields()) if results.has("pred_boxes"): output_boxes = results.pred_boxes elif results.has("proposal_boxes"): output_boxes = results.proposal_boxes else: output_boxes = None assert output_boxes is not None, "Predictions must contain boxes!" output_boxes.scale(scale_x, scale_y) output_boxes.clip(results.image_size) results = results[output_boxes.nonempty()] if results.has("pred_masks"): if isinstance(results.pred_masks, ROIMasks): roi_masks = results.pred_masks else: # pred_masks is a tensor of shape (N, 1, M, M) roi_masks = ROIMasks(results.pred_masks[:, 0, :, :]) results.pred_masks = roi_masks.to_bitmasks( results.pred_boxes, output_height, output_width, mask_threshold ).tensor # TODO return ROIMasks/BitMask object in the future if results.has("pred_keypoints"): results.pred_keypoints[:, :, 0] *= scale_x results.pred_keypoints[:, :, 1] *= scale_y return results def sem_seg_postprocess(result, img_size, output_height, output_width): """ Return semantic segmentation predictions in the original resolution. The input images are often resized when entering semantic segmentor. Moreover, in same cases, they also padded inside segmentor to be divisible by maximum network stride. As a result, we often need the predictions of the segmentor in a different resolution from its inputs. Args: result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W), where C is the number of classes, and H, W are the height and width of the prediction. img_size (tuple): image size that segmentor is taking as input. output_height, output_width: the desired output resolution. Returns: semantic segmentation prediction (Tensor): A tensor of the shape (C, output_height, output_width) that contains per-pixel soft predictions. """ result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1) result = F.interpolate( result, size=(output_height, output_width), mode="bilinear", align_corners=False )[0] return result ================================================ FILE: annotator/oneformer/detectron2/modeling/proposal_generator/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN, StandardRPNHead __all__ = list(globals().keys()) ================================================ FILE: annotator/oneformer/detectron2/modeling/proposal_generator/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from annotator.oneformer.detectron2.utils.registry import Registry PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR") PROPOSAL_GENERATOR_REGISTRY.__doc__ = """ Registry for proposal generator, which produces object proposals from feature maps. The registered object will be called with `obj(cfg, input_shape)`. The call should return a `nn.Module` object. """ from . import rpn, rrpn # noqa F401 isort:skip def build_proposal_generator(cfg, input_shape): """ Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`. The name can be "PrecomputedProposals" to use no proposal generator. """ name = cfg.MODEL.PROPOSAL_GENERATOR.NAME if name == "PrecomputedProposals": return None return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape) ================================================ FILE: annotator/oneformer/detectron2/modeling/proposal_generator/proposal_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import math from typing import List, Tuple, Union import torch from annotator.oneformer.detectron2.layers import batched_nms, cat, move_device_like from annotator.oneformer.detectron2.structures import Boxes, Instances logger = logging.getLogger(__name__) def _is_tracing(): # (fixed in TORCH_VERSION >= 1.9) if torch.jit.is_scripting(): # https://github.com/pytorch/pytorch/issues/47379 return False else: return torch.jit.is_tracing() def find_top_rpn_proposals( proposals: List[torch.Tensor], pred_objectness_logits: List[torch.Tensor], image_sizes: List[Tuple[int, int]], nms_thresh: float, pre_nms_topk: int, post_nms_topk: int, min_box_size: float, training: bool, ): """ For each feature map, select the `pre_nms_topk` highest scoring proposals, apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk` highest scoring proposals among all the feature maps for each image. Args: proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4). All proposal predictions on the feature maps. pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). image_sizes (list[tuple]): sizes (h, w) for each image nms_thresh (float): IoU threshold to use for NMS pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is per feature map. post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. When RPN is run on multiple feature maps (as in FPN) this number is total, over all feature maps. min_box_size (float): minimum proposal box side length in pixels (absolute units wrt input images). training (bool): True if proposals are to be used in training, otherwise False. This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." comment. Returns: list[Instances]: list of N Instances. The i-th Instances stores post_nms_topk object proposals for image i, sorted by their objectness score in descending order. """ num_images = len(image_sizes) device = ( proposals[0].device if torch.jit.is_scripting() else ("cpu" if torch.jit.is_tracing() else proposals[0].device) ) # 1. Select top-k anchor for every level and every image topk_scores = [] # #lvl Tensor, each of shape N x topk topk_proposals = [] level_ids = [] # #lvl Tensor, each of shape (topk,) batch_idx = move_device_like(torch.arange(num_images, device=device), proposals[0]) for level_id, (proposals_i, logits_i) in enumerate(zip(proposals, pred_objectness_logits)): Hi_Wi_A = logits_i.shape[1] if isinstance(Hi_Wi_A, torch.Tensor): # it's a tensor in tracing num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk) else: num_proposals_i = min(Hi_Wi_A, pre_nms_topk) topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1) # each is N x topk topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 4 topk_proposals.append(topk_proposals_i) topk_scores.append(topk_scores_i) level_ids.append( move_device_like( torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device), proposals[0], ) ) # 2. Concat all levels together topk_scores = cat(topk_scores, dim=1) topk_proposals = cat(topk_proposals, dim=1) level_ids = cat(level_ids, dim=0) # 3. For each image, run a per-level NMS, and choose topk results. results: List[Instances] = [] for n, image_size in enumerate(image_sizes): boxes = Boxes(topk_proposals[n]) scores_per_img = topk_scores[n] lvl = level_ids valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img) if not valid_mask.all(): if training: raise FloatingPointError( "Predicted boxes or scores contain Inf/NaN. Training has diverged." ) boxes = boxes[valid_mask] scores_per_img = scores_per_img[valid_mask] lvl = lvl[valid_mask] boxes.clip(image_size) # filter empty boxes keep = boxes.nonempty(threshold=min_box_size) if _is_tracing() or keep.sum().item() != len(boxes): boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep] keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh) # In Detectron1, there was different behavior during training vs. testing. # (https://github.com/facebookresearch/Detectron/issues/459) # During training, topk is over the proposals from *all* images in the training batch. # During testing, it is over the proposals for each image separately. # As a result, the training behavior becomes batch-dependent, # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size. # This bug is addressed in Detectron2 to make the behavior independent of batch size. keep = keep[:post_nms_topk] # keep is already sorted res = Instances(image_size) res.proposal_boxes = boxes[keep] res.objectness_logits = scores_per_img[keep] results.append(res) return results def add_ground_truth_to_proposals( gt: Union[List[Instances], List[Boxes]], proposals: List[Instances] ) -> List[Instances]: """ Call `add_ground_truth_to_proposals_single_image` for all images. Args: gt(Union[List[Instances], List[Boxes]): list of N elements. Element i is a Instances representing the ground-truth for image i. proposals (list[Instances]): list of N elements. Element i is a Instances representing the proposals for image i. Returns: list[Instances]: list of N Instances. Each is the proposals for the image, with field "proposal_boxes" and "objectness_logits". """ assert gt is not None if len(proposals) != len(gt): raise ValueError("proposals and gt should have the same length as the number of images!") if len(proposals) == 0: return proposals return [ add_ground_truth_to_proposals_single_image(gt_i, proposals_i) for gt_i, proposals_i in zip(gt, proposals) ] def add_ground_truth_to_proposals_single_image( gt: Union[Instances, Boxes], proposals: Instances ) -> Instances: """ Augment `proposals` with `gt`. Args: Same as `add_ground_truth_to_proposals`, but with gt and proposals per image. Returns: Same as `add_ground_truth_to_proposals`, but for only one image. """ if isinstance(gt, Boxes): # convert Boxes to Instances gt = Instances(proposals.image_size, gt_boxes=gt) gt_boxes = gt.gt_boxes device = proposals.objectness_logits.device # Assign all ground-truth boxes an objectness logit corresponding to # P(object) = sigmoid(logit) =~ 1. gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10))) gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device) # Concatenating gt_boxes with proposals requires them to have the same fields gt_proposal = Instances(proposals.image_size, **gt.get_fields()) gt_proposal.proposal_boxes = gt_boxes gt_proposal.objectness_logits = gt_logits for key in proposals.get_fields().keys(): assert gt_proposal.has( key ), "The attribute '{}' in `proposals` does not exist in `gt`".format(key) # NOTE: Instances.cat only use fields from the first item. Extra fields in latter items # will be thrown away. new_proposals = Instances.cat([proposals, gt_proposal]) return new_proposals ================================================ FILE: annotator/oneformer/detectron2/modeling/proposal_generator/rpn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F from torch import nn from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, cat from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou from annotator.oneformer.detectron2.utils.events import get_event_storage from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom from annotator.oneformer.detectron2.utils.registry import Registry from ..anchor_generator import build_anchor_generator from ..box_regression import Box2BoxTransform, _dense_box_regression_loss from ..matcher import Matcher from ..sampling import subsample_labels from .build import PROPOSAL_GENERATOR_REGISTRY from .proposal_utils import find_top_rpn_proposals RPN_HEAD_REGISTRY = Registry("RPN_HEAD") RPN_HEAD_REGISTRY.__doc__ = """ Registry for RPN heads, which take feature maps and perform objectness classification and bounding box regression for anchors. The registered object will be called with `obj(cfg, input_shape)`. The call should return a `nn.Module` object. """ """ Shape shorthand in this module: N: number of images in the minibatch L: number of feature maps per image on which RPN is run A: number of cell anchors (must be the same for all feature maps) Hi, Wi: height and width of the i-th feature map B: size of the box parameterization Naming convention: objectness: refers to the binary classification of an anchor as object vs. not object. deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box transform (see :class:`box_regression.Box2BoxTransform`), or 5d for rotated boxes. pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use sigmoid(pred_objectness_logits) to estimate P(object). gt_labels: ground-truth binary classification labels for objectness pred_anchor_deltas: predicted box2box transform deltas gt_anchor_deltas: ground-truth box2box transform deltas """ def build_rpn_head(cfg, input_shape): """ Build an RPN head defined by `cfg.MODEL.RPN.HEAD_NAME`. """ name = cfg.MODEL.RPN.HEAD_NAME return RPN_HEAD_REGISTRY.get(name)(cfg, input_shape) @RPN_HEAD_REGISTRY.register() class StandardRPNHead(nn.Module): """ Standard RPN classification and regression heads described in :paper:`Faster R-CNN`. Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas specifying how to deform each anchor into an object proposal. """ @configurable def __init__( self, *, in_channels: int, num_anchors: int, box_dim: int = 4, conv_dims: List[int] = (-1,) ): """ NOTE: this interface is experimental. Args: in_channels (int): number of input feature channels. When using multiple input features, they must have the same number of channels. num_anchors (int): number of anchors to predict for *each spatial position* on the feature map. The total number of anchors for each feature map will be `num_anchors * H * W`. box_dim (int): dimension of a box, which is also the number of box regression predictions to make for each anchor. An axis aligned box has box_dim=4, while a rotated box has box_dim=5. conv_dims (list[int]): a list of integers representing the output channels of N conv layers. Set it to -1 to use the same number of output channels as input channels. """ super().__init__() cur_channels = in_channels # Keeping the old variable names and structure for backwards compatiblity. # Otherwise the old checkpoints will fail to load. if len(conv_dims) == 1: out_channels = cur_channels if conv_dims[0] == -1 else conv_dims[0] # 3x3 conv for the hidden representation self.conv = self._get_rpn_conv(cur_channels, out_channels) cur_channels = out_channels else: self.conv = nn.Sequential() for k, conv_dim in enumerate(conv_dims): out_channels = cur_channels if conv_dim == -1 else conv_dim if out_channels <= 0: raise ValueError( f"Conv output channels should be greater than 0. Got {out_channels}" ) conv = self._get_rpn_conv(cur_channels, out_channels) self.conv.add_module(f"conv{k}", conv) cur_channels = out_channels # 1x1 conv for predicting objectness logits self.objectness_logits = nn.Conv2d(cur_channels, num_anchors, kernel_size=1, stride=1) # 1x1 conv for predicting box2box transform deltas self.anchor_deltas = nn.Conv2d(cur_channels, num_anchors * box_dim, kernel_size=1, stride=1) # Keeping the order of weights initialization same for backwards compatiblility. for layer in self.modules(): if isinstance(layer, nn.Conv2d): nn.init.normal_(layer.weight, std=0.01) nn.init.constant_(layer.bias, 0) def _get_rpn_conv(self, in_channels, out_channels): return Conv2d( in_channels, out_channels, kernel_size=3, stride=1, padding=1, activation=nn.ReLU(), ) @classmethod def from_config(cls, cfg, input_shape): # Standard RPN is shared across levels: in_channels = [s.channels for s in input_shape] assert len(set(in_channels)) == 1, "Each level must have the same channel!" in_channels = in_channels[0] # RPNHead should take the same input as anchor generator # NOTE: it assumes that creating an anchor generator does not have unwanted side effect. anchor_generator = build_anchor_generator(cfg, input_shape) num_anchors = anchor_generator.num_anchors box_dim = anchor_generator.box_dim assert ( len(set(num_anchors)) == 1 ), "Each level must have the same number of anchors per spatial position" return { "in_channels": in_channels, "num_anchors": num_anchors[0], "box_dim": box_dim, "conv_dims": cfg.MODEL.RPN.CONV_DIMS, } def forward(self, features: List[torch.Tensor]): """ Args: features (list[Tensor]): list of feature maps Returns: list[Tensor]: A list of L elements. Element i is a tensor of shape (N, A, Hi, Wi) representing the predicted objectness logits for all anchors. A is the number of cell anchors. list[Tensor]: A list of L elements. Element i is a tensor of shape (N, A*box_dim, Hi, Wi) representing the predicted "deltas" used to transform anchors to proposals. """ pred_objectness_logits = [] pred_anchor_deltas = [] for x in features: t = self.conv(x) pred_objectness_logits.append(self.objectness_logits(t)) pred_anchor_deltas.append(self.anchor_deltas(t)) return pred_objectness_logits, pred_anchor_deltas @PROPOSAL_GENERATOR_REGISTRY.register() class RPN(nn.Module): """ Region Proposal Network, introduced by :paper:`Faster R-CNN`. """ @configurable def __init__( self, *, in_features: List[str], head: nn.Module, anchor_generator: nn.Module, anchor_matcher: Matcher, box2box_transform: Box2BoxTransform, batch_size_per_image: int, positive_fraction: float, pre_nms_topk: Tuple[float, float], post_nms_topk: Tuple[float, float], nms_thresh: float = 0.7, min_box_size: float = 0.0, anchor_boundary_thresh: float = -1.0, loss_weight: Union[float, Dict[str, float]] = 1.0, box_reg_loss_type: str = "smooth_l1", smooth_l1_beta: float = 0.0, ): """ NOTE: this interface is experimental. Args: in_features (list[str]): list of names of input features to use head (nn.Module): a module that predicts logits and regression deltas for each level from a list of per-level features anchor_generator (nn.Module): a module that creates anchors from a list of features. Usually an instance of :class:`AnchorGenerator` anchor_matcher (Matcher): label the anchors by matching them with ground truth. box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to instance boxes batch_size_per_image (int): number of anchors per image to sample for training positive_fraction (float): fraction of foreground anchors to sample for training pre_nms_topk (tuple[float]): (train, test) that represents the number of top k proposals to select before NMS, in training and testing. post_nms_topk (tuple[float]): (train, test) that represents the number of top k proposals to select after NMS, in training and testing. nms_thresh (float): NMS threshold used to de-duplicate the predicted proposals min_box_size (float): remove proposal boxes with any side smaller than this threshold, in the unit of input image pixels anchor_boundary_thresh (float): legacy option loss_weight (float|dict): weights to use for losses. Can be single float for weighting all rpn losses together, or a dict of individual weightings. Valid dict keys are: "loss_rpn_cls" - applied to classification loss "loss_rpn_loc" - applied to box regression loss box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou". smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1" """ super().__init__() self.in_features = in_features self.rpn_head = head self.anchor_generator = anchor_generator self.anchor_matcher = anchor_matcher self.box2box_transform = box2box_transform self.batch_size_per_image = batch_size_per_image self.positive_fraction = positive_fraction # Map from self.training state to train/test settings self.pre_nms_topk = {True: pre_nms_topk[0], False: pre_nms_topk[1]} self.post_nms_topk = {True: post_nms_topk[0], False: post_nms_topk[1]} self.nms_thresh = nms_thresh self.min_box_size = float(min_box_size) self.anchor_boundary_thresh = anchor_boundary_thresh if isinstance(loss_weight, float): loss_weight = {"loss_rpn_cls": loss_weight, "loss_rpn_loc": loss_weight} self.loss_weight = loss_weight self.box_reg_loss_type = box_reg_loss_type self.smooth_l1_beta = smooth_l1_beta @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): in_features = cfg.MODEL.RPN.IN_FEATURES ret = { "in_features": in_features, "min_box_size": cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE, "nms_thresh": cfg.MODEL.RPN.NMS_THRESH, "batch_size_per_image": cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, "positive_fraction": cfg.MODEL.RPN.POSITIVE_FRACTION, "loss_weight": { "loss_rpn_cls": cfg.MODEL.RPN.LOSS_WEIGHT, "loss_rpn_loc": cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT, }, "anchor_boundary_thresh": cfg.MODEL.RPN.BOUNDARY_THRESH, "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS), "box_reg_loss_type": cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE, "smooth_l1_beta": cfg.MODEL.RPN.SMOOTH_L1_BETA, } ret["pre_nms_topk"] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, cfg.MODEL.RPN.PRE_NMS_TOPK_TEST) ret["post_nms_topk"] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, cfg.MODEL.RPN.POST_NMS_TOPK_TEST) ret["anchor_generator"] = build_anchor_generator(cfg, [input_shape[f] for f in in_features]) ret["anchor_matcher"] = Matcher( cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True ) ret["head"] = build_rpn_head(cfg, [input_shape[f] for f in in_features]) return ret def _subsample_labels(self, label): """ Randomly sample a subset of positive and negative examples, and overwrite the label vector to the ignore value (-1) for all elements that are not included in the sample. Args: labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned. """ pos_idx, neg_idx = subsample_labels( label, self.batch_size_per_image, self.positive_fraction, 0 ) # Fill with the ignore label (-1), then set positive and negative labels label.fill_(-1) label.scatter_(0, pos_idx, 1) label.scatter_(0, neg_idx, 0) return label @torch.jit.unused @torch.no_grad() def label_and_sample_anchors( self, anchors: List[Boxes], gt_instances: List[Instances] ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: """ Args: anchors (list[Boxes]): anchors for each feature map. gt_instances: the ground-truth instances for each image. Returns: list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is the total number of anchors across all feature maps R = sum(Hi * Wi * A). Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class. list[Tensor]: i-th element is a Rx4 tensor. The values are the matched gt boxes for each anchor. Values are undefined for those anchors not labeled as 1. """ anchors = Boxes.cat(anchors) gt_boxes = [x.gt_boxes for x in gt_instances] image_sizes = [x.image_size for x in gt_instances] del gt_instances gt_labels = [] matched_gt_boxes = [] for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes): """ image_size_i: (h, w) for the i-th image gt_boxes_i: ground-truth boxes for i-th image """ match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors) matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix) # Matching is memory-expensive and may result in CPU tensors. But the result is small gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device) del match_quality_matrix if self.anchor_boundary_thresh >= 0: # Discard anchors that go out of the boundaries of the image # NOTE: This is legacy functionality that is turned off by default in Detectron2 anchors_inside_image = anchors.inside_box(image_size_i, self.anchor_boundary_thresh) gt_labels_i[~anchors_inside_image] = -1 # A vector of labels (-1, 0, 1) for each anchor gt_labels_i = self._subsample_labels(gt_labels_i) if len(gt_boxes_i) == 0: # These values won't be used anyway since the anchor is labeled as background matched_gt_boxes_i = torch.zeros_like(anchors.tensor) else: # TODO wasted indexing computation for ignored boxes matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor gt_labels.append(gt_labels_i) # N,AHW matched_gt_boxes.append(matched_gt_boxes_i) return gt_labels, matched_gt_boxes @torch.jit.unused def losses( self, anchors: List[Boxes], pred_objectness_logits: List[torch.Tensor], gt_labels: List[torch.Tensor], pred_anchor_deltas: List[torch.Tensor], gt_boxes: List[torch.Tensor], ) -> Dict[str, torch.Tensor]: """ Return the losses from a set of RPN predictions and their associated ground-truth. Args: anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each has shape (Hi*Wi*A, B), where B is box dimension (4 or 5). pred_objectness_logits (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, Hi*Wi*A) representing the predicted objectness logits for all anchors. gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`. pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors to proposals. gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`. Returns: dict[loss name -> loss value]: A dict mapping from loss name to loss value. Loss names are: `loss_rpn_cls` for objectness classification and `loss_rpn_loc` for proposal localization. """ num_images = len(gt_labels) gt_labels = torch.stack(gt_labels) # (N, sum(Hi*Wi*Ai)) # Log the number of positive/negative anchors per-image that's used in training pos_mask = gt_labels == 1 num_pos_anchors = pos_mask.sum().item() num_neg_anchors = (gt_labels == 0).sum().item() storage = get_event_storage() storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images) storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images) localization_loss = _dense_box_regression_loss( anchors, self.box2box_transform, pred_anchor_deltas, gt_boxes, pos_mask, box_reg_loss_type=self.box_reg_loss_type, smooth_l1_beta=self.smooth_l1_beta, ) valid_mask = gt_labels >= 0 objectness_loss = F.binary_cross_entropy_with_logits( cat(pred_objectness_logits, dim=1)[valid_mask], gt_labels[valid_mask].to(torch.float32), reduction="sum", ) normalizer = self.batch_size_per_image * num_images losses = { "loss_rpn_cls": objectness_loss / normalizer, # The original Faster R-CNN paper uses a slightly different normalizer # for loc loss. But it doesn't matter in practice "loss_rpn_loc": localization_loss / normalizer, } losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()} return losses def forward( self, images: ImageList, features: Dict[str, torch.Tensor], gt_instances: Optional[List[Instances]] = None, ): """ Args: images (ImageList): input images of length `N` features (dict[str, Tensor]): input data as a mapping from feature map name to tensor. Axis 0 represents the number of images `N` in the input data; axes 1-3 are channels, height, and width, which may vary between feature maps (e.g., if a feature pyramid is used). gt_instances (list[Instances], optional): a length `N` list of `Instances`s. Each `Instances` stores ground-truth instances for the corresponding image. Returns: proposals: list[Instances]: contains fields "proposal_boxes", "objectness_logits" loss: dict[Tensor] or None """ features = [features[f] for f in self.in_features] anchors = self.anchor_generator(features) pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features) # Transpose the Hi*Wi*A dimension to the middle: pred_objectness_logits = [ # (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N, Hi*Wi*A) score.permute(0, 2, 3, 1).flatten(1) for score in pred_objectness_logits ] pred_anchor_deltas = [ # (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B) -> (N, Hi*Wi*A, B) x.view(x.shape[0], -1, self.anchor_generator.box_dim, x.shape[-2], x.shape[-1]) .permute(0, 3, 4, 1, 2) .flatten(1, -2) for x in pred_anchor_deltas ] if self.training: assert gt_instances is not None, "RPN requires gt_instances in training!" gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances) losses = self.losses( anchors, pred_objectness_logits, gt_labels, pred_anchor_deltas, gt_boxes ) else: losses = {} proposals = self.predict_proposals( anchors, pred_objectness_logits, pred_anchor_deltas, images.image_sizes ) return proposals, losses def predict_proposals( self, anchors: List[Boxes], pred_objectness_logits: List[torch.Tensor], pred_anchor_deltas: List[torch.Tensor], image_sizes: List[Tuple[int, int]], ): """ Decode all the predicted box regression deltas to proposals. Find the top proposals by applying NMS and removing boxes that are too small. Returns: proposals (list[Instances]): list of N Instances. The i-th Instances stores post_nms_topk object proposals for image i, sorted by their objectness score in descending order. """ # The proposals are treated as fixed for joint training with roi heads. # This approach ignores the derivative w.r.t. the proposal boxes’ coordinates that # are also network responses. with torch.no_grad(): pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas) return find_top_rpn_proposals( pred_proposals, pred_objectness_logits, image_sizes, self.nms_thresh, self.pre_nms_topk[self.training], self.post_nms_topk[self.training], self.min_box_size, self.training, ) def _decode_proposals(self, anchors: List[Boxes], pred_anchor_deltas: List[torch.Tensor]): """ Transform anchors into proposals by applying the predicted anchor deltas. Returns: proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, B) """ N = pred_anchor_deltas[0].shape[0] proposals = [] # For each feature map for anchors_i, pred_anchor_deltas_i in zip(anchors, pred_anchor_deltas): B = anchors_i.tensor.size(1) pred_anchor_deltas_i = pred_anchor_deltas_i.reshape(-1, B) # Expand anchors to shape (N*Hi*Wi*A, B) anchors_i = anchors_i.tensor.unsqueeze(0).expand(N, -1, -1).reshape(-1, B) proposals_i = self.box2box_transform.apply_deltas(pred_anchor_deltas_i, anchors_i) # Append feature map proposals with shape (N, Hi*Wi*A, B) proposals.append(proposals_i.view(N, -1, B)) return proposals ================================================ FILE: annotator/oneformer/detectron2/modeling/proposal_generator/rrpn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import itertools import logging from typing import Dict, List import torch from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms_rotated, cat from annotator.oneformer.detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom from ..box_regression import Box2BoxTransformRotated from .build import PROPOSAL_GENERATOR_REGISTRY from .proposal_utils import _is_tracing from .rpn import RPN logger = logging.getLogger(__name__) def find_top_rrpn_proposals( proposals, pred_objectness_logits, image_sizes, nms_thresh, pre_nms_topk, post_nms_topk, min_box_size, training, ): """ For each feature map, select the `pre_nms_topk` highest scoring proposals, apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk` highest scoring proposals among all the feature maps if `training` is True, otherwise, returns the highest `post_nms_topk` scoring proposals for each feature map. Args: proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5). All proposal predictions on the feature maps. pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). image_sizes (list[tuple]): sizes (h, w) for each image nms_thresh (float): IoU threshold to use for NMS pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. When RRPN is run on multiple feature maps (as in FPN) this number is per feature map. post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. When RRPN is run on multiple feature maps (as in FPN) this number is total, over all feature maps. min_box_size(float): minimum proposal box side length in pixels (absolute units wrt input images). training (bool): True if proposals are to be used in training, otherwise False. This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." comment. Returns: proposals (list[Instances]): list of N Instances. The i-th Instances stores post_nms_topk object proposals for image i. """ num_images = len(image_sizes) device = proposals[0].device # 1. Select top-k anchor for every level and every image topk_scores = [] # #lvl Tensor, each of shape N x topk topk_proposals = [] level_ids = [] # #lvl Tensor, each of shape (topk,) batch_idx = torch.arange(num_images, device=device) for level_id, proposals_i, logits_i in zip( itertools.count(), proposals, pred_objectness_logits ): Hi_Wi_A = logits_i.shape[1] if isinstance(Hi_Wi_A, torch.Tensor): # it's a tensor in tracing num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk) else: num_proposals_i = min(Hi_Wi_A, pre_nms_topk) topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1) # each is N x topk topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 5 topk_proposals.append(topk_proposals_i) topk_scores.append(topk_scores_i) level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)) # 2. Concat all levels together topk_scores = cat(topk_scores, dim=1) topk_proposals = cat(topk_proposals, dim=1) level_ids = cat(level_ids, dim=0) # 3. For each image, run a per-level NMS, and choose topk results. results = [] for n, image_size in enumerate(image_sizes): boxes = RotatedBoxes(topk_proposals[n]) scores_per_img = topk_scores[n] lvl = level_ids valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img) if not valid_mask.all(): if training: raise FloatingPointError( "Predicted boxes or scores contain Inf/NaN. Training has diverged." ) boxes = boxes[valid_mask] scores_per_img = scores_per_img[valid_mask] lvl = lvl[valid_mask] boxes.clip(image_size) # filter empty boxes keep = boxes.nonempty(threshold=min_box_size) if _is_tracing() or keep.sum().item() != len(boxes): boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], lvl[keep]) keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh) # In Detectron1, there was different behavior during training vs. testing. # (https://github.com/facebookresearch/Detectron/issues/459) # During training, topk is over the proposals from *all* images in the training batch. # During testing, it is over the proposals for each image separately. # As a result, the training behavior becomes batch-dependent, # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size. # This bug is addressed in Detectron2 to make the behavior independent of batch size. keep = keep[:post_nms_topk] res = Instances(image_size) res.proposal_boxes = boxes[keep] res.objectness_logits = scores_per_img[keep] results.append(res) return results @PROPOSAL_GENERATOR_REGISTRY.register() class RRPN(RPN): """ Rotated Region Proposal Network described in :paper:`RRPN`. """ @configurable def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.anchor_boundary_thresh >= 0: raise NotImplementedError( "anchor_boundary_thresh is a legacy option not implemented for RRPN." ) @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = super().from_config(cfg, input_shape) ret["box2box_transform"] = Box2BoxTransformRotated(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS) return ret @torch.no_grad() def label_and_sample_anchors(self, anchors: List[RotatedBoxes], gt_instances: List[Instances]): """ Args: anchors (list[RotatedBoxes]): anchors for each feature map. gt_instances: the ground-truth instances for each image. Returns: list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is the total number of anchors across feature maps. Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class. list[Tensor]: i-th element is a Nx5 tensor, where N is the total number of anchors across feature maps. The values are the matched gt boxes for each anchor. Values are undefined for those anchors not labeled as 1. """ anchors = RotatedBoxes.cat(anchors) gt_boxes = [x.gt_boxes for x in gt_instances] del gt_instances gt_labels = [] matched_gt_boxes = [] for gt_boxes_i in gt_boxes: """ gt_boxes_i: ground-truth boxes for i-th image """ match_quality_matrix = retry_if_cuda_oom(pairwise_iou_rotated)(gt_boxes_i, anchors) matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix) # Matching is memory-expensive and may result in CPU tensors. But the result is small gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device) # A vector of labels (-1, 0, 1) for each anchor gt_labels_i = self._subsample_labels(gt_labels_i) if len(gt_boxes_i) == 0: # These values won't be used anyway since the anchor is labeled as background matched_gt_boxes_i = torch.zeros_like(anchors.tensor) else: # TODO wasted indexing computation for ignored boxes matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor gt_labels.append(gt_labels_i) # N,AHW matched_gt_boxes.append(matched_gt_boxes_i) return gt_labels, matched_gt_boxes @torch.no_grad() def predict_proposals(self, anchors, pred_objectness_logits, pred_anchor_deltas, image_sizes): pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas) return find_top_rrpn_proposals( pred_proposals, pred_objectness_logits, image_sizes, self.nms_thresh, self.pre_nms_topk[self.training], self.post_nms_topk[self.training], self.min_box_size, self.training, ) ================================================ FILE: annotator/oneformer/detectron2/modeling/roi_heads/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head, FastRCNNConvFCHead from .keypoint_head import ( ROI_KEYPOINT_HEAD_REGISTRY, build_keypoint_head, BaseKeypointRCNNHead, KRCNNConvDeconvUpsampleHead, ) from .mask_head import ( ROI_MASK_HEAD_REGISTRY, build_mask_head, BaseMaskRCNNHead, MaskRCNNConvUpsampleHead, ) from .roi_heads import ( ROI_HEADS_REGISTRY, ROIHeads, Res5ROIHeads, StandardROIHeads, build_roi_heads, select_foreground_proposals, ) from .cascade_rcnn import CascadeROIHeads from .rotated_fast_rcnn import RROIHeads from .fast_rcnn import FastRCNNOutputLayers from . import cascade_rcnn # isort:skip __all__ = list(globals().keys()) ================================================ FILE: annotator/oneformer/detectron2/modeling/roi_heads/box_head.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import numpy as np from typing import List import fvcore.nn.weight_init as weight_init import torch from torch import nn from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm from annotator.oneformer.detectron2.utils.registry import Registry __all__ = ["FastRCNNConvFCHead", "build_box_head", "ROI_BOX_HEAD_REGISTRY"] ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD") ROI_BOX_HEAD_REGISTRY.__doc__ = """ Registry for box heads, which make box predictions from per-region features. The registered object will be called with `obj(cfg, input_shape)`. """ # To get torchscript support, we make the head a subclass of `nn.Sequential`. # Therefore, to add new layers in this head class, please make sure they are # added in the order they will be used in forward(). @ROI_BOX_HEAD_REGISTRY.register() class FastRCNNConvFCHead(nn.Sequential): """ A head with several 3x3 conv layers (each followed by norm & relu) and then several fc layers (each followed by relu). """ @configurable def __init__( self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm="" ): """ NOTE: this interface is experimental. Args: input_shape (ShapeSpec): shape of the input feature. conv_dims (list[int]): the output dimensions of the conv layers fc_dims (list[int]): the output dimensions of the fc layers conv_norm (str or callable): normalization for the conv layers. See :func:`detectron2.layers.get_norm` for supported types. """ super().__init__() assert len(conv_dims) + len(fc_dims) > 0 self._output_size = (input_shape.channels, input_shape.height, input_shape.width) self.conv_norm_relus = [] for k, conv_dim in enumerate(conv_dims): conv = Conv2d( self._output_size[0], conv_dim, kernel_size=3, padding=1, bias=not conv_norm, norm=get_norm(conv_norm, conv_dim), activation=nn.ReLU(), ) self.add_module("conv{}".format(k + 1), conv) self.conv_norm_relus.append(conv) self._output_size = (conv_dim, self._output_size[1], self._output_size[2]) self.fcs = [] for k, fc_dim in enumerate(fc_dims): if k == 0: self.add_module("flatten", nn.Flatten()) fc = nn.Linear(int(np.prod(self._output_size)), fc_dim) self.add_module("fc{}".format(k + 1), fc) self.add_module("fc_relu{}".format(k + 1), nn.ReLU()) self.fcs.append(fc) self._output_size = fc_dim for layer in self.conv_norm_relus: weight_init.c2_msra_fill(layer) for layer in self.fcs: weight_init.c2_xavier_fill(layer) @classmethod def from_config(cls, cfg, input_shape): num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM return { "input_shape": input_shape, "conv_dims": [conv_dim] * num_conv, "fc_dims": [fc_dim] * num_fc, "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM, } def forward(self, x): for layer in self: x = layer(x) return x @property @torch.jit.unused def output_shape(self): """ Returns: ShapeSpec: the output feature shape """ o = self._output_size if isinstance(o, int): return ShapeSpec(channels=o) else: return ShapeSpec(channels=o[0], height=o[1], width=o[2]) def build_box_head(cfg, input_shape): """ Build a box head defined by `cfg.MODEL.ROI_BOX_HEAD.NAME`. """ name = cfg.MODEL.ROI_BOX_HEAD.NAME return ROI_BOX_HEAD_REGISTRY.get(name)(cfg, input_shape) ================================================ FILE: annotator/oneformer/detectron2/modeling/roi_heads/cascade_rcnn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from typing import List import torch from torch import nn from torch.autograd.function import Function from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import ShapeSpec from annotator.oneformer.detectron2.structures import Boxes, Instances, pairwise_iou from annotator.oneformer.detectron2.utils.events import get_event_storage from ..box_regression import Box2BoxTransform from ..matcher import Matcher from ..poolers import ROIPooler from .box_head import build_box_head from .fast_rcnn import FastRCNNOutputLayers, fast_rcnn_inference from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads class _ScaleGradient(Function): @staticmethod def forward(ctx, input, scale): ctx.scale = scale return input @staticmethod def backward(ctx, grad_output): return grad_output * ctx.scale, None @ROI_HEADS_REGISTRY.register() class CascadeROIHeads(StandardROIHeads): """ The ROI heads that implement :paper:`Cascade R-CNN`. """ @configurable def __init__( self, *, box_in_features: List[str], box_pooler: ROIPooler, box_heads: List[nn.Module], box_predictors: List[nn.Module], proposal_matchers: List[Matcher], **kwargs, ): """ NOTE: this interface is experimental. Args: box_pooler (ROIPooler): pooler that extracts region features from given boxes box_heads (list[nn.Module]): box head for each cascade stage box_predictors (list[nn.Module]): box predictor for each cascade stage proposal_matchers (list[Matcher]): matcher with different IoU thresholds to match boxes with ground truth for each stage. The first matcher matches RPN proposals with ground truth, the other matchers use boxes predicted by the previous stage as proposals and match them with ground truth. """ assert "proposal_matcher" not in kwargs, ( "CascadeROIHeads takes 'proposal_matchers=' for each stage instead " "of one 'proposal_matcher='." ) # The first matcher matches RPN proposals with ground truth, done in the base class kwargs["proposal_matcher"] = proposal_matchers[0] num_stages = self.num_cascade_stages = len(box_heads) box_heads = nn.ModuleList(box_heads) box_predictors = nn.ModuleList(box_predictors) assert len(box_predictors) == num_stages, f"{len(box_predictors)} != {num_stages}!" assert len(proposal_matchers) == num_stages, f"{len(proposal_matchers)} != {num_stages}!" super().__init__( box_in_features=box_in_features, box_pooler=box_pooler, box_head=box_heads, box_predictor=box_predictors, **kwargs, ) self.proposal_matchers = proposal_matchers @classmethod def from_config(cls, cfg, input_shape): ret = super().from_config(cfg, input_shape) ret.pop("proposal_matcher") return ret @classmethod def _init_box_head(cls, cfg, input_shape): # fmt: off in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS cascade_ious = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS assert len(cascade_bbox_reg_weights) == len(cascade_ious) assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG, \ "CascadeROIHeads only support class-agnostic regression now!" assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0] # fmt: on in_channels = [input_shape[f].channels for f in in_features] # Check all channel counts are equal assert len(set(in_channels)) == 1, in_channels in_channels = in_channels[0] box_pooler = ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type=pooler_type, ) pooled_shape = ShapeSpec( channels=in_channels, width=pooler_resolution, height=pooler_resolution ) box_heads, box_predictors, proposal_matchers = [], [], [] for match_iou, bbox_reg_weights in zip(cascade_ious, cascade_bbox_reg_weights): box_head = build_box_head(cfg, pooled_shape) box_heads.append(box_head) box_predictors.append( FastRCNNOutputLayers( cfg, box_head.output_shape, box2box_transform=Box2BoxTransform(weights=bbox_reg_weights), ) ) proposal_matchers.append(Matcher([match_iou], [0, 1], allow_low_quality_matches=False)) return { "box_in_features": in_features, "box_pooler": box_pooler, "box_heads": box_heads, "box_predictors": box_predictors, "proposal_matchers": proposal_matchers, } def forward(self, images, features, proposals, targets=None): del images if self.training: proposals = self.label_and_sample_proposals(proposals, targets) if self.training: # Need targets to box head losses = self._forward_box(features, proposals, targets) losses.update(self._forward_mask(features, proposals)) losses.update(self._forward_keypoint(features, proposals)) return proposals, losses else: pred_instances = self._forward_box(features, proposals) pred_instances = self.forward_with_given_boxes(features, pred_instances) return pred_instances, {} def _forward_box(self, features, proposals, targets=None): """ Args: features, targets: the same as in Same as in :meth:`ROIHeads.forward`. proposals (list[Instances]): the per-image object proposals with their matching ground truth. Each has fields "proposal_boxes", and "objectness_logits", "gt_classes", "gt_boxes". """ features = [features[f] for f in self.box_in_features] head_outputs = [] # (predictor, predictions, proposals) prev_pred_boxes = None image_sizes = [x.image_size for x in proposals] for k in range(self.num_cascade_stages): if k > 0: # The output boxes of the previous stage are used to create the input # proposals of the next stage. proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes) if self.training: proposals = self._match_and_label_boxes(proposals, k, targets) predictions = self._run_stage(features, proposals, k) prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals) head_outputs.append((self.box_predictor[k], predictions, proposals)) if self.training: losses = {} storage = get_event_storage() for stage, (predictor, predictions, proposals) in enumerate(head_outputs): with storage.name_scope("stage{}".format(stage)): stage_losses = predictor.losses(predictions, proposals) losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()}) return losses else: # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1) scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs] # Average the scores across heads scores = [ sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages) for scores_per_image in zip(*scores_per_stage) ] # Use the boxes of the last head predictor, predictions, proposals = head_outputs[-1] boxes = predictor.predict_boxes(predictions, proposals) pred_instances, _ = fast_rcnn_inference( boxes, scores, image_sizes, predictor.test_score_thresh, predictor.test_nms_thresh, predictor.test_topk_per_image, ) return pred_instances @torch.no_grad() def _match_and_label_boxes(self, proposals, stage, targets): """ Match proposals with groundtruth using the matcher at the given stage. Label the proposals as foreground or background based on the match. Args: proposals (list[Instances]): One Instances for each image, with the field "proposal_boxes". stage (int): the current stage targets (list[Instances]): the ground truth instances Returns: list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes" """ num_fg_samples, num_bg_samples = [], [] for proposals_per_image, targets_per_image in zip(proposals, targets): match_quality_matrix = pairwise_iou( targets_per_image.gt_boxes, proposals_per_image.proposal_boxes ) # proposal_labels are 0 or 1 matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix) if len(targets_per_image) > 0: gt_classes = targets_per_image.gt_classes[matched_idxs] # Label unmatched proposals (0 label from matcher) as background (label=num_classes) gt_classes[proposal_labels == 0] = self.num_classes gt_boxes = targets_per_image.gt_boxes[matched_idxs] else: gt_classes = torch.zeros_like(matched_idxs) + self.num_classes gt_boxes = Boxes( targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4)) ) proposals_per_image.gt_classes = gt_classes proposals_per_image.gt_boxes = gt_boxes num_fg_samples.append((proposal_labels == 1).sum().item()) num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1]) # Log the number of fg/bg samples in each stage storage = get_event_storage() storage.put_scalar( "stage{}/roi_head/num_fg_samples".format(stage), sum(num_fg_samples) / len(num_fg_samples), ) storage.put_scalar( "stage{}/roi_head/num_bg_samples".format(stage), sum(num_bg_samples) / len(num_bg_samples), ) return proposals def _run_stage(self, features, proposals, stage): """ Args: features (list[Tensor]): #lvl input features to ROIHeads proposals (list[Instances]): #image Instances, with the field "proposal_boxes" stage (int): the current stage Returns: Same output as `FastRCNNOutputLayers.forward()`. """ box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) # The original implementation averages the losses among heads, # but scale up the parameter gradients of the heads. # This is equivalent to adding the losses among heads, # but scale down the gradients on features. if self.training: box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages) box_features = self.box_head[stage](box_features) return self.box_predictor[stage](box_features) def _create_proposals_from_boxes(self, boxes, image_sizes): """ Args: boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4 image_sizes (list[tuple]): list of image shapes in (h, w) Returns: list[Instances]: per-image proposals with the given boxes. """ # Just like RPN, the proposals should not have gradients boxes = [Boxes(b.detach()) for b in boxes] proposals = [] for boxes_per_image, image_size in zip(boxes, image_sizes): boxes_per_image.clip(image_size) if self.training: # do not filter empty boxes at inference time, # because the scores from each stage need to be aligned and added later boxes_per_image = boxes_per_image[boxes_per_image.nonempty()] prop = Instances(image_size) prop.proposal_boxes = boxes_per_image proposals.append(prop) return proposals ================================================ FILE: annotator/oneformer/detectron2/modeling/roi_heads/fast_rcnn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging from typing import Callable, Dict, List, Optional, Tuple, Union import torch from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.data.detection_utils import get_fed_loss_cls_weights from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransform, _dense_box_regression_loss from annotator.oneformer.detectron2.structures import Boxes, Instances from annotator.oneformer.detectron2.utils.events import get_event_storage __all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"] logger = logging.getLogger(__name__) """ Shape shorthand in this module: N: number of images in the minibatch R: number of ROIs, combined over all images, in the minibatch Ri: number of ROIs in image i K: number of foreground classes. E.g.,there are 80 foreground classes in COCO. Naming convention: deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box transform (see :class:`box_regression.Box2BoxTransform`). pred_class_logits: predicted class scores in [-inf, +inf]; use softmax(pred_class_logits) to estimate P(class). gt_classes: ground-truth classification labels in [0, K], where [0, K) represent foreground object classes and K represents the background class. pred_proposal_deltas: predicted box2box transform deltas for transforming proposals to detection box predictions. gt_proposal_deltas: ground-truth box2box transform deltas """ def fast_rcnn_inference( boxes: List[torch.Tensor], scores: List[torch.Tensor], image_shapes: List[Tuple[int, int]], score_thresh: float, nms_thresh: float, topk_per_image: int, ): """ Call `fast_rcnn_inference_single_image` for all images. Args: boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic boxes for each image. Element i has shape (Ri, K * 4) if doing class-specific regression, or (Ri, 4) if doing class-agnostic regression, where Ri is the number of predicted objects for image i. This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. scores (list[Tensor]): A list of Tensors of predicted class scores for each image. Element i has shape (Ri, K + 1), where Ri is the number of predicted objects for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch. score_thresh (float): Only return detections with a confidence score exceeding this threshold. nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. topk_per_image (int): The number of top scoring detections to return. Set < 0 to return all detections. Returns: instances: (list[Instances]): A list of N instances, one for each image in the batch, that stores the topk most confidence detections. kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates the corresponding boxes/scores index in [0, Ri) from the input, for image i. """ result_per_image = [ fast_rcnn_inference_single_image( boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image ) for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes) ] return [x[0] for x in result_per_image], [x[1] for x in result_per_image] def _log_classification_stats(pred_logits, gt_classes, prefix="fast_rcnn"): """ Log the classification metrics to EventStorage. Args: pred_logits: Rx(K+1) logits. The last column is for background class. gt_classes: R labels """ num_instances = gt_classes.numel() if num_instances == 0: return pred_classes = pred_logits.argmax(dim=1) bg_class_ind = pred_logits.shape[1] - 1 fg_inds = (gt_classes >= 0) & (gt_classes < bg_class_ind) num_fg = fg_inds.nonzero().numel() fg_gt_classes = gt_classes[fg_inds] fg_pred_classes = pred_classes[fg_inds] num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel() num_accurate = (pred_classes == gt_classes).nonzero().numel() fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel() storage = get_event_storage() storage.put_scalar(f"{prefix}/cls_accuracy", num_accurate / num_instances) if num_fg > 0: storage.put_scalar(f"{prefix}/fg_cls_accuracy", fg_num_accurate / num_fg) storage.put_scalar(f"{prefix}/false_negative", num_false_negative / num_fg) def fast_rcnn_inference_single_image( boxes, scores, image_shape: Tuple[int, int], score_thresh: float, nms_thresh: float, topk_per_image: int, ): """ Single-image inference. Return bounding-box detection results by thresholding on scores and applying non-maximum suppression (NMS). Args: Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes per image. Returns: Same as `fast_rcnn_inference`, but for only one image. """ valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) if not valid_mask.all(): boxes = boxes[valid_mask] scores = scores[valid_mask] scores = scores[:, :-1] num_bbox_reg_classes = boxes.shape[1] // 4 # Convert to Boxes to use the `clip` function ... boxes = Boxes(boxes.reshape(-1, 4)) boxes.clip(image_shape) boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 # 1. Filter results based on detection scores. It can make NMS more efficient # by filtering out low-confidence detections. filter_mask = scores > score_thresh # R x K # R' x 2. First column contains indices of the R predictions; # Second column contains indices of classes. filter_inds = filter_mask.nonzero() if num_bbox_reg_classes == 1: boxes = boxes[filter_inds[:, 0], 0] else: boxes = boxes[filter_mask] scores = scores[filter_mask] # 2. Apply NMS for each class independently. keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) if topk_per_image >= 0: keep = keep[:topk_per_image] boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep] result = Instances(image_shape) result.pred_boxes = Boxes(boxes) result.scores = scores result.pred_classes = filter_inds[:, 1] return result, filter_inds[:, 0] class FastRCNNOutputLayers(nn.Module): """ Two linear layers for predicting Fast R-CNN outputs: 1. proposal-to-detection box regression deltas 2. classification scores """ @configurable def __init__( self, input_shape: ShapeSpec, *, box2box_transform, num_classes: int, test_score_thresh: float = 0.0, test_nms_thresh: float = 0.5, test_topk_per_image: int = 100, cls_agnostic_bbox_reg: bool = False, smooth_l1_beta: float = 0.0, box_reg_loss_type: str = "smooth_l1", loss_weight: Union[float, Dict[str, float]] = 1.0, use_fed_loss: bool = False, use_sigmoid_ce: bool = False, get_fed_loss_cls_weights: Optional[Callable] = None, fed_loss_num_classes: int = 50, ): """ NOTE: this interface is experimental. Args: input_shape (ShapeSpec): shape of the input feature to this module box2box_transform (Box2BoxTransform or Box2BoxTransformRotated): num_classes (int): number of foreground classes test_score_thresh (float): threshold to filter predictions results. test_nms_thresh (float): NMS threshold for prediction results. test_topk_per_image (int): number of top predictions to produce per image. cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if `box_reg_loss_type` is "smooth_l1" box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou", "diou", "ciou" loss_weight (float|dict): weights to use for losses. Can be single float for weighting all losses, or a dict of individual weightings. Valid dict keys are: * "loss_cls": applied to classification loss * "loss_box_reg": applied to box regression loss use_fed_loss (bool): whether to use federated loss which samples additional negative classes to calculate the loss use_sigmoid_ce (bool): whether to calculate the loss using weighted average of binary cross entropy with logits. This could be used together with federated loss get_fed_loss_cls_weights (Callable): a callable which takes dataset name and frequency weight power, and returns the probabilities to sample negative classes for federated loss. The implementation can be found in detectron2/data/detection_utils.py fed_loss_num_classes (int): number of federated classes to keep in total """ super().__init__() if isinstance(input_shape, int): # some backward compatibility input_shape = ShapeSpec(channels=input_shape) self.num_classes = num_classes input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1) # prediction layer for num_classes foreground classes and one background class (hence + 1) self.cls_score = nn.Linear(input_size, num_classes + 1) num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes box_dim = len(box2box_transform.weights) self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim) nn.init.normal_(self.cls_score.weight, std=0.01) nn.init.normal_(self.bbox_pred.weight, std=0.001) for l in [self.cls_score, self.bbox_pred]: nn.init.constant_(l.bias, 0) self.box2box_transform = box2box_transform self.smooth_l1_beta = smooth_l1_beta self.test_score_thresh = test_score_thresh self.test_nms_thresh = test_nms_thresh self.test_topk_per_image = test_topk_per_image self.box_reg_loss_type = box_reg_loss_type if isinstance(loss_weight, float): loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight} self.loss_weight = loss_weight self.use_fed_loss = use_fed_loss self.use_sigmoid_ce = use_sigmoid_ce self.fed_loss_num_classes = fed_loss_num_classes if self.use_fed_loss: assert self.use_sigmoid_ce, "Please use sigmoid cross entropy loss with federated loss" fed_loss_cls_weights = get_fed_loss_cls_weights() assert ( len(fed_loss_cls_weights) == self.num_classes ), "Please check the provided fed_loss_cls_weights. Their size should match num_classes" self.register_buffer("fed_loss_cls_weights", fed_loss_cls_weights) @classmethod def from_config(cls, cfg, input_shape): return { "input_shape": input_shape, "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS), # fmt: off "num_classes" : cfg.MODEL.ROI_HEADS.NUM_CLASSES, "cls_agnostic_bbox_reg" : cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG, "smooth_l1_beta" : cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA, "test_score_thresh" : cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST, "test_nms_thresh" : cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST, "test_topk_per_image" : cfg.TEST.DETECTIONS_PER_IMAGE, "box_reg_loss_type" : cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE, "loss_weight" : {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT}, # noqa "use_fed_loss" : cfg.MODEL.ROI_BOX_HEAD.USE_FED_LOSS, "use_sigmoid_ce" : cfg.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE, "get_fed_loss_cls_weights" : lambda: get_fed_loss_cls_weights(dataset_names=cfg.DATASETS.TRAIN, freq_weight_power=cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER), # noqa "fed_loss_num_classes" : cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES, # fmt: on } def forward(self, x): """ Args: x: per-region features of shape (N, ...) for N bounding boxes to predict. Returns: (Tensor, Tensor): First tensor: shape (N,K+1), scores for each of the N box. Each row contains the scores for K object categories and 1 background class. Second tensor: bounding box regression deltas for each box. Shape is shape (N,Kx4), or (N,4) for class-agnostic regression. """ if x.dim() > 2: x = torch.flatten(x, start_dim=1) scores = self.cls_score(x) proposal_deltas = self.bbox_pred(x) return scores, proposal_deltas def losses(self, predictions, proposals): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``, ``gt_classes`` are expected. Returns: Dict[str, Tensor]: dict of losses """ scores, proposal_deltas = predictions # parse classification outputs gt_classes = ( cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0) ) _log_classification_stats(scores, gt_classes) # parse box regression outputs if len(proposals): proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) # Nx4 assert not proposal_boxes.requires_grad, "Proposals should not require gradients!" # If "gt_boxes" does not exist, the proposals must be all negative and # should not be included in regression loss computation. # Here we just use proposal_boxes as an arbitrary placeholder because its # value won't be used in self.box_reg_loss(). gt_boxes = cat( [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals], dim=0, ) else: proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device) if self.use_sigmoid_ce: loss_cls = self.sigmoid_cross_entropy_loss(scores, gt_classes) else: loss_cls = cross_entropy(scores, gt_classes, reduction="mean") losses = { "loss_cls": loss_cls, "loss_box_reg": self.box_reg_loss( proposal_boxes, gt_boxes, proposal_deltas, gt_classes ), } return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()} # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/fed_loss.py # noqa # with slight modifications def get_fed_loss_classes(self, gt_classes, num_fed_loss_classes, num_classes, weight): """ Args: gt_classes: a long tensor of shape R that contains the gt class label of each proposal. num_fed_loss_classes: minimum number of classes to keep when calculating federated loss. Will sample negative classes if number of unique gt_classes is smaller than this value. num_classes: number of foreground classes weight: probabilities used to sample negative classes Returns: Tensor: classes to keep when calculating the federated loss, including both unique gt classes and sampled negative classes. """ unique_gt_classes = torch.unique(gt_classes) prob = unique_gt_classes.new_ones(num_classes + 1).float() prob[-1] = 0 if len(unique_gt_classes) < num_fed_loss_classes: prob[:num_classes] = weight.float().clone() prob[unique_gt_classes] = 0 sampled_negative_classes = torch.multinomial( prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False ) fed_loss_classes = torch.cat([unique_gt_classes, sampled_negative_classes]) else: fed_loss_classes = unique_gt_classes return fed_loss_classes # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py#L113 # noqa # with slight modifications def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes): """ Args: pred_class_logits: shape (N, K+1), scores for each of the N box. Each row contains the scores for K object categories and 1 background class gt_classes: a long tensor of shape R that contains the gt class label of each proposal. """ if pred_class_logits.numel() == 0: return pred_class_logits.new_zeros([1])[0] N = pred_class_logits.shape[0] K = pred_class_logits.shape[1] - 1 target = pred_class_logits.new_zeros(N, K + 1) target[range(len(gt_classes)), gt_classes] = 1 target = target[:, :K] cls_loss = F.binary_cross_entropy_with_logits( pred_class_logits[:, :-1], target, reduction="none" ) if self.use_fed_loss: fed_loss_classes = self.get_fed_loss_classes( gt_classes, num_fed_loss_classes=self.fed_loss_num_classes, num_classes=K, weight=self.fed_loss_cls_weights, ) fed_loss_classes_mask = fed_loss_classes.new_zeros(K + 1) fed_loss_classes_mask[fed_loss_classes] = 1 fed_loss_classes_mask = fed_loss_classes_mask[:K] weight = fed_loss_classes_mask.view(1, K).expand(N, K).float() else: weight = 1 loss = torch.sum(cls_loss * weight) / N return loss def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes): """ Args: proposal_boxes/gt_boxes are tensors with the same shape (R, 4 or 5). pred_deltas has shape (R, 4 or 5), or (R, num_classes * (4 or 5)). gt_classes is a long tensor of shape R, the gt class label of each proposal. R shall be the number of proposals. """ box_dim = proposal_boxes.shape[1] # 4 or 5 # Regression loss is only computed for foreground proposals (those matched to a GT) fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0] if pred_deltas.shape[1] == box_dim: # cls-agnostic regression fg_pred_deltas = pred_deltas[fg_inds] else: fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[ fg_inds, gt_classes[fg_inds] ] loss_box_reg = _dense_box_regression_loss( [proposal_boxes[fg_inds]], self.box2box_transform, [fg_pred_deltas.unsqueeze(0)], [gt_boxes[fg_inds]], ..., self.box_reg_loss_type, self.smooth_l1_beta, ) # The reg loss is normalized using the total number of regions (R), not the number # of foreground regions even though the box regression loss is only defined on # foreground regions. Why? Because doing so gives equal training influence to # each foreground example. To see how, consider two different minibatches: # (1) Contains a single foreground region # (2) Contains 100 foreground regions # If we normalize by the number of foreground regions, the single example in # minibatch (1) will be given 100 times as much influence as each foreground # example in minibatch (2). Normalizing by the total number of regions, R, # means that the single example in minibatch (1) and each of the 100 examples # in minibatch (2) are given equal influence. return loss_box_reg / max(gt_classes.numel(), 1.0) # return 0 if empty def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. The ``proposal_boxes`` field is expected. Returns: list[Instances]: same as `fast_rcnn_inference`. list[Tensor]: same as `fast_rcnn_inference`. """ boxes = self.predict_boxes(predictions, proposals) scores = self.predict_probs(predictions, proposals) image_shapes = [x.image_size for x in proposals] return fast_rcnn_inference( boxes, scores, image_shapes, self.test_score_thresh, self.test_nms_thresh, self.test_topk_per_image, ) def predict_boxes_for_gt_classes(self, predictions, proposals): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected. Returns: list[Tensor]: A list of Tensors of predicted boxes for GT classes in case of class-specific box head. Element i of the list has shape (Ri, B), where Ri is the number of proposals for image i and B is the box dimension (4 or 5) """ if not len(proposals): return [] scores, proposal_deltas = predictions proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) N, B = proposal_boxes.shape predict_boxes = self.box2box_transform.apply_deltas( proposal_deltas, proposal_boxes ) # Nx(KxB) K = predict_boxes.shape[1] // B if K > 1: gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0) # Some proposals are ignored or have a background class. Their gt_classes # cannot be used as index. gt_classes = gt_classes.clamp_(0, K - 1) predict_boxes = predict_boxes.view(N, K, B)[ torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes ] num_prop_per_image = [len(p) for p in proposals] return predict_boxes.split(num_prop_per_image) def predict_boxes( self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances] ): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. The ``proposal_boxes`` field is expected. Returns: list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is the number of proposals for image i and B is the box dimension (4 or 5) """ if not len(proposals): return [] _, proposal_deltas = predictions num_prop_per_image = [len(p) for p in proposals] proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) predict_boxes = self.box2box_transform.apply_deltas( proposal_deltas, proposal_boxes, ) # Nx(KxB) return predict_boxes.split(num_prop_per_image) def predict_probs( self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances] ): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. Returns: list[Tensor]: A list of Tensors of predicted class probabilities for each image. Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i. """ scores, _ = predictions num_inst_per_image = [len(p) for p in proposals] if self.use_sigmoid_ce: probs = scores.sigmoid() else: probs = F.softmax(scores, dim=-1) return probs.split(num_inst_per_image, dim=0) ================================================ FILE: annotator/oneformer/detectron2/modeling/roi_heads/keypoint_head.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from typing import List import torch from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d, ConvTranspose2d, cat, interpolate from annotator.oneformer.detectron2.structures import Instances, heatmaps_to_keypoints from annotator.oneformer.detectron2.utils.events import get_event_storage from annotator.oneformer.detectron2.utils.registry import Registry _TOTAL_SKIPPED = 0 __all__ = [ "ROI_KEYPOINT_HEAD_REGISTRY", "build_keypoint_head", "BaseKeypointRCNNHead", "KRCNNConvDeconvUpsampleHead", ] ROI_KEYPOINT_HEAD_REGISTRY = Registry("ROI_KEYPOINT_HEAD") ROI_KEYPOINT_HEAD_REGISTRY.__doc__ = """ Registry for keypoint heads, which make keypoint predictions from per-region features. The registered object will be called with `obj(cfg, input_shape)`. """ def build_keypoint_head(cfg, input_shape): """ Build a keypoint head from `cfg.MODEL.ROI_KEYPOINT_HEAD.NAME`. """ name = cfg.MODEL.ROI_KEYPOINT_HEAD.NAME return ROI_KEYPOINT_HEAD_REGISTRY.get(name)(cfg, input_shape) def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer): """ Arguments: pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number of instances in the batch, K is the number of keypoints, and S is the side length of the keypoint heatmap. The values are spatial logits. instances (list[Instances]): A list of M Instances, where M is the batch size. These instances are predictions from the model that are in 1:1 correspondence with pred_keypoint_logits. Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint` instance. normalizer (float): Normalize the loss by this amount. If not specified, we normalize by the number of visible keypoints in the minibatch. Returns a scalar tensor containing the loss. """ heatmaps = [] valid = [] keypoint_side_len = pred_keypoint_logits.shape[2] for instances_per_image in instances: if len(instances_per_image) == 0: continue keypoints = instances_per_image.gt_keypoints heatmaps_per_image, valid_per_image = keypoints.to_heatmap( instances_per_image.proposal_boxes.tensor, keypoint_side_len ) heatmaps.append(heatmaps_per_image.view(-1)) valid.append(valid_per_image.view(-1)) if len(heatmaps): keypoint_targets = cat(heatmaps, dim=0) valid = cat(valid, dim=0).to(dtype=torch.uint8) valid = torch.nonzero(valid).squeeze(1) # torch.mean (in binary_cross_entropy_with_logits) doesn't # accept empty tensors, so handle it separately if len(heatmaps) == 0 or valid.numel() == 0: global _TOTAL_SKIPPED _TOTAL_SKIPPED += 1 storage = get_event_storage() storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False) return pred_keypoint_logits.sum() * 0 N, K, H, W = pred_keypoint_logits.shape pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W) keypoint_loss = F.cross_entropy( pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum" ) # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch if normalizer is None: normalizer = valid.numel() keypoint_loss /= normalizer return keypoint_loss def keypoint_rcnn_inference(pred_keypoint_logits: torch.Tensor, pred_instances: List[Instances]): """ Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score) and add it to the `pred_instances` as a `pred_keypoints` field. Args: pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number of instances in the batch, K is the number of keypoints, and S is the side length of the keypoint heatmap. The values are spatial logits. pred_instances (list[Instances]): A list of N Instances, where N is the number of images. Returns: None. Each element in pred_instances will contain extra "pred_keypoints" and "pred_keypoint_heatmaps" fields. "pred_keypoints" is a tensor of shape (#instance, K, 3) where the last dimension corresponds to (x, y, score). The scores are larger than 0. "pred_keypoint_heatmaps" contains the raw keypoint logits as passed to this function. """ # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor) bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0) pred_keypoint_logits = pred_keypoint_logits.detach() keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits, bboxes_flat.detach()) num_instances_per_image = [len(i) for i in pred_instances] keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0) heatmap_results = pred_keypoint_logits.split(num_instances_per_image, dim=0) for keypoint_results_per_image, heatmap_results_per_image, instances_per_image in zip( keypoint_results, heatmap_results, pred_instances ): # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score) # heatmap_results_per_image is (num instances)x(num keypoints)x(side)x(side) instances_per_image.pred_keypoints = keypoint_results_per_image instances_per_image.pred_keypoint_heatmaps = heatmap_results_per_image class BaseKeypointRCNNHead(nn.Module): """ Implement the basic Keypoint R-CNN losses and inference logic described in Sec. 5 of :paper:`Mask R-CNN`. """ @configurable def __init__(self, *, num_keypoints, loss_weight=1.0, loss_normalizer=1.0): """ NOTE: this interface is experimental. Args: num_keypoints (int): number of keypoints to predict loss_weight (float): weight to multiple on the keypoint loss loss_normalizer (float or str): If float, divide the loss by `loss_normalizer * #images`. If 'visible', the loss is normalized by the total number of visible keypoints across images. """ super().__init__() self.num_keypoints = num_keypoints self.loss_weight = loss_weight assert loss_normalizer == "visible" or isinstance(loss_normalizer, float), loss_normalizer self.loss_normalizer = loss_normalizer @classmethod def from_config(cls, cfg, input_shape): ret = { "loss_weight": cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT, "num_keypoints": cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS, } normalize_by_visible = ( cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS ) # noqa if not normalize_by_visible: batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION ret["loss_normalizer"] = ( ret["num_keypoints"] * batch_size_per_image * positive_sample_fraction ) else: ret["loss_normalizer"] = "visible" return ret def forward(self, x, instances: List[Instances]): """ Args: x: input 4D region feature(s) provided by :class:`ROIHeads`. instances (list[Instances]): contains the boxes & labels corresponding to the input features. Exact format is up to its caller to decide. Typically, this is the foreground instances in training, with "proposal_boxes" field and other gt annotations. In inference, it contains boxes that are already predicted. Returns: A dict of losses if in training. The predicted "instances" if in inference. """ x = self.layers(x) if self.training: num_images = len(instances) normalizer = ( None if self.loss_normalizer == "visible" else num_images * self.loss_normalizer ) return { "loss_keypoint": keypoint_rcnn_loss(x, instances, normalizer=normalizer) * self.loss_weight } else: keypoint_rcnn_inference(x, instances) return instances def layers(self, x): """ Neural network layers that makes predictions from regional input features. """ raise NotImplementedError # To get torchscript support, we make the head a subclass of `nn.Sequential`. # Therefore, to add new layers in this head class, please make sure they are # added in the order they will be used in forward(). @ROI_KEYPOINT_HEAD_REGISTRY.register() class KRCNNConvDeconvUpsampleHead(BaseKeypointRCNNHead, nn.Sequential): """ A standard keypoint head containing a series of 3x3 convs, followed by a transpose convolution and bilinear interpolation for upsampling. It is described in Sec. 5 of :paper:`Mask R-CNN`. """ @configurable def __init__(self, input_shape, *, num_keypoints, conv_dims, **kwargs): """ NOTE: this interface is experimental. Args: input_shape (ShapeSpec): shape of the input feature conv_dims: an iterable of output channel counts for each conv in the head e.g. (512, 512, 512) for three convs outputting 512 channels. """ super().__init__(num_keypoints=num_keypoints, **kwargs) # default up_scale to 2.0 (this can be made an option) up_scale = 2.0 in_channels = input_shape.channels for idx, layer_channels in enumerate(conv_dims, 1): module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1) self.add_module("conv_fcn{}".format(idx), module) self.add_module("conv_fcn_relu{}".format(idx), nn.ReLU()) in_channels = layer_channels deconv_kernel = 4 self.score_lowres = ConvTranspose2d( in_channels, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1 ) self.up_scale = up_scale for name, param in self.named_parameters(): if "bias" in name: nn.init.constant_(param, 0) elif "weight" in name: # Caffe2 implementation uses MSRAFill, which in fact # corresponds to kaiming_normal_ in PyTorch nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") @classmethod def from_config(cls, cfg, input_shape): ret = super().from_config(cfg, input_shape) ret["input_shape"] = input_shape ret["conv_dims"] = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS return ret def layers(self, x): for layer in self: x = layer(x) x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False) return x ================================================ FILE: annotator/oneformer/detectron2/modeling/roi_heads/mask_head.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from typing import List import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, get_norm from annotator.oneformer.detectron2.layers.wrappers import move_device_like from annotator.oneformer.detectron2.structures import Instances from annotator.oneformer.detectron2.utils.events import get_event_storage from annotator.oneformer.detectron2.utils.registry import Registry __all__ = [ "BaseMaskRCNNHead", "MaskRCNNConvUpsampleHead", "build_mask_head", "ROI_MASK_HEAD_REGISTRY", ] ROI_MASK_HEAD_REGISTRY = Registry("ROI_MASK_HEAD") ROI_MASK_HEAD_REGISTRY.__doc__ = """ Registry for mask heads, which predicts instance masks given per-region features. The registered object will be called with `obj(cfg, input_shape)`. """ @torch.jit.unused def mask_rcnn_loss(pred_mask_logits: torch.Tensor, instances: List[Instances], vis_period: int = 0): """ Compute the mask prediction loss defined in the Mask R-CNN paper. Args: pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask) for class-specific or class-agnostic, where B is the total number of predicted masks in all images, C is the number of foreground classes, and Hmask, Wmask are the height and width of the mask predictions. The values are logits. instances (list[Instances]): A list of N Instances, where N is the number of images in the batch. These instances are in 1:1 correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask, ...) associated with each instance are stored in fields. vis_period (int): the period (in steps) to dump visualization. Returns: mask_loss (Tensor): A scalar tensor containing the loss. """ cls_agnostic_mask = pred_mask_logits.size(1) == 1 total_num_masks = pred_mask_logits.size(0) mask_side_len = pred_mask_logits.size(2) assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!" gt_classes = [] gt_masks = [] for instances_per_image in instances: if len(instances_per_image) == 0: continue if not cls_agnostic_mask: gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64) gt_classes.append(gt_classes_per_image) gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize( instances_per_image.proposal_boxes.tensor, mask_side_len ).to(device=pred_mask_logits.device) # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len gt_masks.append(gt_masks_per_image) if len(gt_masks) == 0: return pred_mask_logits.sum() * 0 gt_masks = cat(gt_masks, dim=0) if cls_agnostic_mask: pred_mask_logits = pred_mask_logits[:, 0] else: indices = torch.arange(total_num_masks) gt_classes = cat(gt_classes, dim=0) pred_mask_logits = pred_mask_logits[indices, gt_classes] if gt_masks.dtype == torch.bool: gt_masks_bool = gt_masks else: # Here we allow gt_masks to be float as well (depend on the implementation of rasterize()) gt_masks_bool = gt_masks > 0.5 gt_masks = gt_masks.to(dtype=torch.float32) # Log the training accuracy (using gt classes and 0.5 threshold) mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0)) num_positive = gt_masks_bool.sum().item() false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max( gt_masks_bool.numel() - num_positive, 1.0 ) false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0) storage = get_event_storage() storage.put_scalar("mask_rcnn/accuracy", mask_accuracy) storage.put_scalar("mask_rcnn/false_positive", false_positive) storage.put_scalar("mask_rcnn/false_negative", false_negative) if vis_period > 0 and storage.iter % vis_period == 0: pred_masks = pred_mask_logits.sigmoid() vis_masks = torch.cat([pred_masks, gt_masks], axis=2) name = "Left: mask prediction; Right: mask GT" for idx, vis_mask in enumerate(vis_masks): vis_mask = torch.stack([vis_mask] * 3, axis=0) storage.put_image(name + f" ({idx})", vis_mask) mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits, gt_masks, reduction="mean") return mask_loss def mask_rcnn_inference(pred_mask_logits: torch.Tensor, pred_instances: List[Instances]): """ Convert pred_mask_logits to estimated foreground probability masks while also extracting only the masks for the predicted classes in pred_instances. For each predicted box, the mask of the same class is attached to the instance by adding a new "pred_masks" field to pred_instances. Args: pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask) for class-specific or class-agnostic, where B is the total number of predicted masks in all images, C is the number of foreground classes, and Hmask, Wmask are the height and width of the mask predictions. The values are logits. pred_instances (list[Instances]): A list of N Instances, where N is the number of images in the batch. Each Instances must have field "pred_classes". Returns: None. pred_instances will contain an extra "pred_masks" field storing a mask of size (Hmask, Wmask) for predicted class. Note that the masks are returned as a soft (non-quantized) masks the resolution predicted by the network; post-processing steps, such as resizing the predicted masks to the original image resolution and/or binarizing them, is left to the caller. """ cls_agnostic_mask = pred_mask_logits.size(1) == 1 if cls_agnostic_mask: mask_probs_pred = pred_mask_logits.sigmoid() else: # Select masks corresponding to the predicted classes num_masks = pred_mask_logits.shape[0] class_pred = cat([i.pred_classes for i in pred_instances]) device = ( class_pred.device if torch.jit.is_scripting() else ("cpu" if torch.jit.is_tracing() else class_pred.device) ) indices = move_device_like(torch.arange(num_masks, device=device), class_pred) mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid() # mask_probs_pred.shape: (B, 1, Hmask, Wmask) num_boxes_per_image = [len(i) for i in pred_instances] mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0) for prob, instances in zip(mask_probs_pred, pred_instances): instances.pred_masks = prob # (1, Hmask, Wmask) class BaseMaskRCNNHead(nn.Module): """ Implement the basic Mask R-CNN losses and inference logic described in :paper:`Mask R-CNN` """ @configurable def __init__(self, *, loss_weight: float = 1.0, vis_period: int = 0): """ NOTE: this interface is experimental. Args: loss_weight (float): multiplier of the loss vis_period (int): visualization period """ super().__init__() self.vis_period = vis_period self.loss_weight = loss_weight @classmethod def from_config(cls, cfg, input_shape): return {"vis_period": cfg.VIS_PERIOD} def forward(self, x, instances: List[Instances]): """ Args: x: input region feature(s) provided by :class:`ROIHeads`. instances (list[Instances]): contains the boxes & labels corresponding to the input features. Exact format is up to its caller to decide. Typically, this is the foreground instances in training, with "proposal_boxes" field and other gt annotations. In inference, it contains boxes that are already predicted. Returns: A dict of losses in training. The predicted "instances" in inference. """ x = self.layers(x) if self.training: return {"loss_mask": mask_rcnn_loss(x, instances, self.vis_period) * self.loss_weight} else: mask_rcnn_inference(x, instances) return instances def layers(self, x): """ Neural network layers that makes predictions from input features. """ raise NotImplementedError # To get torchscript support, we make the head a subclass of `nn.Sequential`. # Therefore, to add new layers in this head class, please make sure they are # added in the order they will be used in forward(). @ROI_MASK_HEAD_REGISTRY.register() class MaskRCNNConvUpsampleHead(BaseMaskRCNNHead, nn.Sequential): """ A mask head with several conv layers, plus an upsample layer (with `ConvTranspose2d`). Predictions are made with a final 1x1 conv layer. """ @configurable def __init__(self, input_shape: ShapeSpec, *, num_classes, conv_dims, conv_norm="", **kwargs): """ NOTE: this interface is experimental. Args: input_shape (ShapeSpec): shape of the input feature num_classes (int): the number of foreground classes (i.e. background is not included). 1 if using class agnostic prediction. conv_dims (list[int]): a list of N>0 integers representing the output dimensions of N-1 conv layers and the last upsample layer. conv_norm (str or callable): normalization for the conv layers. See :func:`detectron2.layers.get_norm` for supported types. """ super().__init__(**kwargs) assert len(conv_dims) >= 1, "conv_dims have to be non-empty!" self.conv_norm_relus = [] cur_channels = input_shape.channels for k, conv_dim in enumerate(conv_dims[:-1]): conv = Conv2d( cur_channels, conv_dim, kernel_size=3, stride=1, padding=1, bias=not conv_norm, norm=get_norm(conv_norm, conv_dim), activation=nn.ReLU(), ) self.add_module("mask_fcn{}".format(k + 1), conv) self.conv_norm_relus.append(conv) cur_channels = conv_dim self.deconv = ConvTranspose2d( cur_channels, conv_dims[-1], kernel_size=2, stride=2, padding=0 ) self.add_module("deconv_relu", nn.ReLU()) cur_channels = conv_dims[-1] self.predictor = Conv2d(cur_channels, num_classes, kernel_size=1, stride=1, padding=0) for layer in self.conv_norm_relus + [self.deconv]: weight_init.c2_msra_fill(layer) # use normal distribution initialization for mask prediction layer nn.init.normal_(self.predictor.weight, std=0.001) if self.predictor.bias is not None: nn.init.constant_(self.predictor.bias, 0) @classmethod def from_config(cls, cfg, input_shape): ret = super().from_config(cfg, input_shape) conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM num_conv = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV ret.update( conv_dims=[conv_dim] * (num_conv + 1), # +1 for ConvTranspose conv_norm=cfg.MODEL.ROI_MASK_HEAD.NORM, input_shape=input_shape, ) if cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK: ret["num_classes"] = 1 else: ret["num_classes"] = cfg.MODEL.ROI_HEADS.NUM_CLASSES return ret def layers(self, x): for layer in self: x = layer(x) return x def build_mask_head(cfg, input_shape): """ Build a mask head defined by `cfg.MODEL.ROI_MASK_HEAD.NAME`. """ name = cfg.MODEL.ROI_MASK_HEAD.NAME return ROI_MASK_HEAD_REGISTRY.get(name)(cfg, input_shape) ================================================ FILE: annotator/oneformer/detectron2/modeling/roi_heads/roi_heads.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import inspect import logging import numpy as np from typing import Dict, List, Optional, Tuple import torch from torch import nn from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import ShapeSpec, nonzero_tuple from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou from annotator.oneformer.detectron2.utils.events import get_event_storage from annotator.oneformer.detectron2.utils.registry import Registry from ..backbone.resnet import BottleneckBlock, ResNet from ..matcher import Matcher from ..poolers import ROIPooler from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals from ..sampling import subsample_labels from .box_head import build_box_head from .fast_rcnn import FastRCNNOutputLayers from .keypoint_head import build_keypoint_head from .mask_head import build_mask_head ROI_HEADS_REGISTRY = Registry("ROI_HEADS") ROI_HEADS_REGISTRY.__doc__ = """ Registry for ROI heads in a generalized R-CNN model. ROIHeads take feature maps and region proposals, and perform per-region computation. The registered object will be called with `obj(cfg, input_shape)`. The call is expected to return an :class:`ROIHeads`. """ logger = logging.getLogger(__name__) def build_roi_heads(cfg, input_shape): """ Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`. """ name = cfg.MODEL.ROI_HEADS.NAME return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape) def select_foreground_proposals( proposals: List[Instances], bg_label: int ) -> Tuple[List[Instances], List[torch.Tensor]]: """ Given a list of N Instances (for N images), each containing a `gt_classes` field, return a list of Instances that contain only instances with `gt_classes != -1 && gt_classes != bg_label`. Args: proposals (list[Instances]): A list of N Instances, where N is the number of images in the batch. bg_label: label index of background class. Returns: list[Instances]: N Instances, each contains only the selected foreground instances. list[Tensor]: N boolean vector, correspond to the selection mask of each Instances object. True for selected instances. """ assert isinstance(proposals, (list, tuple)) assert isinstance(proposals[0], Instances) assert proposals[0].has("gt_classes") fg_proposals = [] fg_selection_masks = [] for proposals_per_image in proposals: gt_classes = proposals_per_image.gt_classes fg_selection_mask = (gt_classes != -1) & (gt_classes != bg_label) fg_idxs = fg_selection_mask.nonzero().squeeze(1) fg_proposals.append(proposals_per_image[fg_idxs]) fg_selection_masks.append(fg_selection_mask) return fg_proposals, fg_selection_masks def select_proposals_with_visible_keypoints(proposals: List[Instances]) -> List[Instances]: """ Args: proposals (list[Instances]): a list of N Instances, where N is the number of images. Returns: proposals: only contains proposals with at least one visible keypoint. Note that this is still slightly different from Detectron. In Detectron, proposals for training keypoint head are re-sampled from all the proposals with IOU>threshold & >=1 visible keypoint. Here, the proposals are first sampled from all proposals with IOU>threshold, then proposals with no visible keypoint are filtered out. This strategy seems to make no difference on Detectron and is easier to implement. """ ret = [] all_num_fg = [] for proposals_per_image in proposals: # If empty/unannotated image (hard negatives), skip filtering for train if len(proposals_per_image) == 0: ret.append(proposals_per_image) continue gt_keypoints = proposals_per_image.gt_keypoints.tensor # #fg x K x 3 vis_mask = gt_keypoints[:, :, 2] >= 1 xs, ys = gt_keypoints[:, :, 0], gt_keypoints[:, :, 1] proposal_boxes = proposals_per_image.proposal_boxes.tensor.unsqueeze(dim=1) # #fg x 1 x 4 kp_in_box = ( (xs >= proposal_boxes[:, :, 0]) & (xs <= proposal_boxes[:, :, 2]) & (ys >= proposal_boxes[:, :, 1]) & (ys <= proposal_boxes[:, :, 3]) ) selection = (kp_in_box & vis_mask).any(dim=1) selection_idxs = nonzero_tuple(selection)[0] all_num_fg.append(selection_idxs.numel()) ret.append(proposals_per_image[selection_idxs]) storage = get_event_storage() storage.put_scalar("keypoint_head/num_fg_samples", np.mean(all_num_fg)) return ret class ROIHeads(torch.nn.Module): """ ROIHeads perform all per-region computation in an R-CNN. It typically contains logic to 1. (in training only) match proposals with ground truth and sample them 2. crop the regions and extract per-region features using proposals 3. make per-region predictions with different heads It can have many variants, implemented as subclasses of this class. This base class contains the logic to match/sample proposals. But it is not necessary to inherit this class if the sampling logic is not needed. """ @configurable def __init__( self, *, num_classes, batch_size_per_image, positive_fraction, proposal_matcher, proposal_append_gt=True, ): """ NOTE: this interface is experimental. Args: num_classes (int): number of foreground classes (i.e. background is not included) batch_size_per_image (int): number of proposals to sample for training positive_fraction (float): fraction of positive (foreground) proposals to sample for training. proposal_matcher (Matcher): matcher that matches proposals and ground truth proposal_append_gt (bool): whether to include ground truth as proposals as well """ super().__init__() self.batch_size_per_image = batch_size_per_image self.positive_fraction = positive_fraction self.num_classes = num_classes self.proposal_matcher = proposal_matcher self.proposal_append_gt = proposal_append_gt @classmethod def from_config(cls, cfg): return { "batch_size_per_image": cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, "positive_fraction": cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION, "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES, "proposal_append_gt": cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT, # Matcher to assign box proposals to gt boxes "proposal_matcher": Matcher( cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS, cfg.MODEL.ROI_HEADS.IOU_LABELS, allow_low_quality_matches=False, ), } def _sample_proposals( self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor]: """ Based on the matching between N proposals and M groundtruth, sample the proposals and set their classification labels. Args: matched_idxs (Tensor): a vector of length N, each is the best-matched gt index in [0, M) for each proposal. matched_labels (Tensor): a vector of length N, the matcher's label (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. gt_classes (Tensor): a vector of length M. Returns: Tensor: a vector of indices of sampled proposals. Each is in [0, N). Tensor: a vector of the same length, the classification label for each sampled proposal. Each sample is labeled as either a category in [0, num_classes) or the background (num_classes). """ has_gt = gt_classes.numel() > 0 # Get the corresponding GT for each proposal if has_gt: gt_classes = gt_classes[matched_idxs] # Label unmatched proposals (0 label from matcher) as background (label=num_classes) gt_classes[matched_labels == 0] = self.num_classes # Label ignore proposals (-1 label) gt_classes[matched_labels == -1] = -1 else: gt_classes = torch.zeros_like(matched_idxs) + self.num_classes sampled_fg_idxs, sampled_bg_idxs = subsample_labels( gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes ) sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) return sampled_idxs, gt_classes[sampled_idxs] @torch.no_grad() def label_and_sample_proposals( self, proposals: List[Instances], targets: List[Instances] ) -> List[Instances]: """ Prepare some proposals to be used to train the ROI heads. It performs box matching between `proposals` and `targets`, and assigns training labels to the proposals. It returns ``self.batch_size_per_image`` random samples from proposals and groundtruth boxes, with a fraction of positives that is no larger than ``self.positive_fraction``. Args: See :meth:`ROIHeads.forward` Returns: list[Instances]: length `N` list of `Instances`s containing the proposals sampled for training. Each `Instances` has the following fields: - proposal_boxes: the proposal boxes - gt_boxes: the ground-truth box that the proposal is assigned to (this is only meaningful if the proposal has a label > 0; if label = 0 then the ground-truth box is random) Other fields such as "gt_classes", "gt_masks", that's included in `targets`. """ # Augment proposals with ground-truth boxes. # In the case of learned proposals (e.g., RPN), when training starts # the proposals will be low quality due to random initialization. # It's possible that none of these initial # proposals have high enough overlap with the gt objects to be used # as positive examples for the second stage components (box head, # cls head, mask head). Adding the gt boxes to the set of proposals # ensures that the second stage components will have some positive # examples from the start of training. For RPN, this augmentation improves # convergence and empirically improves box AP on COCO by about 0.5 # points (under one tested configuration). if self.proposal_append_gt: proposals = add_ground_truth_to_proposals(targets, proposals) proposals_with_gt = [] num_fg_samples = [] num_bg_samples = [] for proposals_per_image, targets_per_image in zip(proposals, targets): has_gt = len(targets_per_image) > 0 match_quality_matrix = pairwise_iou( targets_per_image.gt_boxes, proposals_per_image.proposal_boxes ) matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix) sampled_idxs, gt_classes = self._sample_proposals( matched_idxs, matched_labels, targets_per_image.gt_classes ) # Set target attributes of the sampled proposals: proposals_per_image = proposals_per_image[sampled_idxs] proposals_per_image.gt_classes = gt_classes if has_gt: sampled_targets = matched_idxs[sampled_idxs] # We index all the attributes of targets that start with "gt_" # and have not been added to proposals yet (="gt_classes"). # NOTE: here the indexing waste some compute, because heads # like masks, keypoints, etc, will filter the proposals again, # (by foreground/background, or number of keypoints in the image, etc) # so we essentially index the data twice. for (trg_name, trg_value) in targets_per_image.get_fields().items(): if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name): proposals_per_image.set(trg_name, trg_value[sampled_targets]) # If no GT is given in the image, we don't know what a dummy gt value can be. # Therefore the returned proposals won't have any gt_* fields, except for a # gt_classes full of background label. num_bg_samples.append((gt_classes == self.num_classes).sum().item()) num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) proposals_with_gt.append(proposals_per_image) # Log the number of fg/bg samples that are selected for training ROI heads storage = get_event_storage() storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) return proposals_with_gt def forward( self, images: ImageList, features: Dict[str, torch.Tensor], proposals: List[Instances], targets: Optional[List[Instances]] = None, ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]: """ Args: images (ImageList): features (dict[str,Tensor]): input data as a mapping from feature map name to tensor. Axis 0 represents the number of images `N` in the input data; axes 1-3 are channels, height, and width, which may vary between feature maps (e.g., if a feature pyramid is used). proposals (list[Instances]): length `N` list of `Instances`. The i-th `Instances` contains object proposals for the i-th input image, with fields "proposal_boxes" and "objectness_logits". targets (list[Instances], optional): length `N` list of `Instances`. The i-th `Instances` contains the ground-truth per-instance annotations for the i-th input image. Specify `targets` during training only. It may have the following fields: - gt_boxes: the bounding box of each instance. - gt_classes: the label for each instance with a category ranging in [0, #class]. - gt_masks: PolygonMasks or BitMasks, the ground-truth masks of each instance. - gt_keypoints: NxKx3, the groud-truth keypoints for each instance. Returns: list[Instances]: length `N` list of `Instances` containing the detected instances. Returned during inference only; may be [] during training. dict[str->Tensor]: mapping from a named loss to a tensor storing the loss. Used during training only. """ raise NotImplementedError() @ROI_HEADS_REGISTRY.register() class Res5ROIHeads(ROIHeads): """ The ROIHeads in a typical "C4" R-CNN model, where the box and mask head share the cropping and the per-region feature computation by a Res5 block. See :paper:`ResNet` Appendix A. """ @configurable def __init__( self, *, in_features: List[str], pooler: ROIPooler, res5: nn.Module, box_predictor: nn.Module, mask_head: Optional[nn.Module] = None, **kwargs, ): """ NOTE: this interface is experimental. Args: in_features (list[str]): list of backbone feature map names to use for feature extraction pooler (ROIPooler): pooler to extra region features from backbone res5 (nn.Sequential): a CNN to compute per-region features, to be used by ``box_predictor`` and ``mask_head``. Typically this is a "res5" block from a ResNet. box_predictor (nn.Module): make box predictions from the feature. Should have the same interface as :class:`FastRCNNOutputLayers`. mask_head (nn.Module): transform features to make mask predictions """ super().__init__(**kwargs) self.in_features = in_features self.pooler = pooler if isinstance(res5, (list, tuple)): res5 = nn.Sequential(*res5) self.res5 = res5 self.box_predictor = box_predictor self.mask_on = mask_head is not None if self.mask_on: self.mask_head = mask_head @classmethod def from_config(cls, cfg, input_shape): # fmt: off ret = super().from_config(cfg) in_features = ret["in_features"] = cfg.MODEL.ROI_HEADS.IN_FEATURES pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE pooler_scales = (1.0 / input_shape[in_features[0]].stride, ) sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO mask_on = cfg.MODEL.MASK_ON # fmt: on assert not cfg.MODEL.KEYPOINT_ON assert len(in_features) == 1 ret["pooler"] = ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type=pooler_type, ) # Compatbility with old moco code. Might be useful. # See notes in StandardROIHeads.from_config if not inspect.ismethod(cls._build_res5_block): logger.warning( "The behavior of _build_res5_block may change. " "Please do not depend on private methods." ) cls._build_res5_block = classmethod(cls._build_res5_block) ret["res5"], out_channels = cls._build_res5_block(cfg) ret["box_predictor"] = FastRCNNOutputLayers( cfg, ShapeSpec(channels=out_channels, height=1, width=1) ) if mask_on: ret["mask_head"] = build_mask_head( cfg, ShapeSpec(channels=out_channels, width=pooler_resolution, height=pooler_resolution), ) return ret @classmethod def _build_res5_block(cls, cfg): # fmt: off stage_channel_factor = 2 ** 3 # res5 is 8x res2 num_groups = cfg.MODEL.RESNETS.NUM_GROUPS width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP bottleneck_channels = num_groups * width_per_group * stage_channel_factor out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 norm = cfg.MODEL.RESNETS.NORM assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \ "Deformable conv is not yet supported in res5 head." # fmt: on blocks = ResNet.make_stage( BottleneckBlock, 3, stride_per_block=[2, 1, 1], in_channels=out_channels // 2, bottleneck_channels=bottleneck_channels, out_channels=out_channels, num_groups=num_groups, norm=norm, stride_in_1x1=stride_in_1x1, ) return nn.Sequential(*blocks), out_channels def _shared_roi_transform(self, features: List[torch.Tensor], boxes: List[Boxes]): x = self.pooler(features, boxes) return self.res5(x) def forward( self, images: ImageList, features: Dict[str, torch.Tensor], proposals: List[Instances], targets: Optional[List[Instances]] = None, ): """ See :meth:`ROIHeads.forward`. """ del images if self.training: assert targets proposals = self.label_and_sample_proposals(proposals, targets) del targets proposal_boxes = [x.proposal_boxes for x in proposals] box_features = self._shared_roi_transform( [features[f] for f in self.in_features], proposal_boxes ) predictions = self.box_predictor(box_features.mean(dim=[2, 3])) if self.training: del features losses = self.box_predictor.losses(predictions, proposals) if self.mask_on: proposals, fg_selection_masks = select_foreground_proposals( proposals, self.num_classes ) # Since the ROI feature transform is shared between boxes and masks, # we don't need to recompute features. The mask loss is only defined # on foreground proposals, so we need to select out the foreground # features. mask_features = box_features[torch.cat(fg_selection_masks, dim=0)] del box_features losses.update(self.mask_head(mask_features, proposals)) return [], losses else: pred_instances, _ = self.box_predictor.inference(predictions, proposals) pred_instances = self.forward_with_given_boxes(features, pred_instances) return pred_instances, {} def forward_with_given_boxes( self, features: Dict[str, torch.Tensor], instances: List[Instances] ) -> List[Instances]: """ Use the given boxes in `instances` to produce other (non-box) per-ROI outputs. Args: features: same as in `forward()` instances (list[Instances]): instances to predict other outputs. Expect the keys "pred_boxes" and "pred_classes" to exist. Returns: instances (Instances): the same `Instances` object, with extra fields such as `pred_masks` or `pred_keypoints`. """ assert not self.training assert instances[0].has("pred_boxes") and instances[0].has("pred_classes") if self.mask_on: feature_list = [features[f] for f in self.in_features] x = self._shared_roi_transform(feature_list, [x.pred_boxes for x in instances]) return self.mask_head(x, instances) else: return instances @ROI_HEADS_REGISTRY.register() class StandardROIHeads(ROIHeads): """ It's "standard" in a sense that there is no ROI transform sharing or feature sharing between tasks. Each head independently processes the input features by each head's own pooler and head. This class is used by most models, such as FPN and C5. To implement more models, you can subclass it and implement a different :meth:`forward()` or a head. """ @configurable def __init__( self, *, box_in_features: List[str], box_pooler: ROIPooler, box_head: nn.Module, box_predictor: nn.Module, mask_in_features: Optional[List[str]] = None, mask_pooler: Optional[ROIPooler] = None, mask_head: Optional[nn.Module] = None, keypoint_in_features: Optional[List[str]] = None, keypoint_pooler: Optional[ROIPooler] = None, keypoint_head: Optional[nn.Module] = None, train_on_pred_boxes: bool = False, **kwargs, ): """ NOTE: this interface is experimental. Args: box_in_features (list[str]): list of feature names to use for the box head. box_pooler (ROIPooler): pooler to extra region features for box head box_head (nn.Module): transform features to make box predictions box_predictor (nn.Module): make box predictions from the feature. Should have the same interface as :class:`FastRCNNOutputLayers`. mask_in_features (list[str]): list of feature names to use for the mask pooler or mask head. None if not using mask head. mask_pooler (ROIPooler): pooler to extract region features from image features. The mask head will then take region features to make predictions. If None, the mask head will directly take the dict of image features defined by `mask_in_features` mask_head (nn.Module): transform features to make mask predictions keypoint_in_features, keypoint_pooler, keypoint_head: similar to ``mask_*``. train_on_pred_boxes (bool): whether to use proposal boxes or predicted boxes from the box head to train other heads. """ super().__init__(**kwargs) # keep self.in_features for backward compatibility self.in_features = self.box_in_features = box_in_features self.box_pooler = box_pooler self.box_head = box_head self.box_predictor = box_predictor self.mask_on = mask_in_features is not None if self.mask_on: self.mask_in_features = mask_in_features self.mask_pooler = mask_pooler self.mask_head = mask_head self.keypoint_on = keypoint_in_features is not None if self.keypoint_on: self.keypoint_in_features = keypoint_in_features self.keypoint_pooler = keypoint_pooler self.keypoint_head = keypoint_head self.train_on_pred_boxes = train_on_pred_boxes @classmethod def from_config(cls, cfg, input_shape): ret = super().from_config(cfg) ret["train_on_pred_boxes"] = cfg.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES # Subclasses that have not been updated to use from_config style construction # may have overridden _init_*_head methods. In this case, those overridden methods # will not be classmethods and we need to avoid trying to call them here. # We test for this with ismethod which only returns True for bound methods of cls. # Such subclasses will need to handle calling their overridden _init_*_head methods. if inspect.ismethod(cls._init_box_head): ret.update(cls._init_box_head(cfg, input_shape)) if inspect.ismethod(cls._init_mask_head): ret.update(cls._init_mask_head(cfg, input_shape)) if inspect.ismethod(cls._init_keypoint_head): ret.update(cls._init_keypoint_head(cfg, input_shape)) return ret @classmethod def _init_box_head(cls, cfg, input_shape): # fmt: off in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE # fmt: on # If StandardROIHeads is applied on multiple feature maps (as in FPN), # then we share the same predictors and therefore the channel counts must be the same in_channels = [input_shape[f].channels for f in in_features] # Check all channel counts are equal assert len(set(in_channels)) == 1, in_channels in_channels = in_channels[0] box_pooler = ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type=pooler_type, ) # Here we split "box head" and "box predictor", which is mainly due to historical reasons. # They are used together so the "box predictor" layers should be part of the "box head". # New subclasses of ROIHeads do not need "box predictor"s. box_head = build_box_head( cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution) ) box_predictor = FastRCNNOutputLayers(cfg, box_head.output_shape) return { "box_in_features": in_features, "box_pooler": box_pooler, "box_head": box_head, "box_predictor": box_predictor, } @classmethod def _init_mask_head(cls, cfg, input_shape): if not cfg.MODEL.MASK_ON: return {} # fmt: off in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO pooler_type = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE # fmt: on in_channels = [input_shape[f].channels for f in in_features][0] ret = {"mask_in_features": in_features} ret["mask_pooler"] = ( ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type=pooler_type, ) if pooler_type else None ) if pooler_type: shape = ShapeSpec( channels=in_channels, width=pooler_resolution, height=pooler_resolution ) else: shape = {f: input_shape[f] for f in in_features} ret["mask_head"] = build_mask_head(cfg, shape) return ret @classmethod def _init_keypoint_head(cls, cfg, input_shape): if not cfg.MODEL.KEYPOINT_ON: return {} # fmt: off in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES pooler_resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) # noqa sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO pooler_type = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE # fmt: on in_channels = [input_shape[f].channels for f in in_features][0] ret = {"keypoint_in_features": in_features} ret["keypoint_pooler"] = ( ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type=pooler_type, ) if pooler_type else None ) if pooler_type: shape = ShapeSpec( channels=in_channels, width=pooler_resolution, height=pooler_resolution ) else: shape = {f: input_shape[f] for f in in_features} ret["keypoint_head"] = build_keypoint_head(cfg, shape) return ret def forward( self, images: ImageList, features: Dict[str, torch.Tensor], proposals: List[Instances], targets: Optional[List[Instances]] = None, ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]: """ See :class:`ROIHeads.forward`. """ del images if self.training: assert targets, "'targets' argument is required during training" proposals = self.label_and_sample_proposals(proposals, targets) del targets if self.training: losses = self._forward_box(features, proposals) # Usually the original proposals used by the box head are used by the mask, keypoint # heads. But when `self.train_on_pred_boxes is True`, proposals will contain boxes # predicted by the box head. losses.update(self._forward_mask(features, proposals)) losses.update(self._forward_keypoint(features, proposals)) return proposals, losses else: pred_instances = self._forward_box(features, proposals) # During inference cascaded prediction is used: the mask and keypoints heads are only # applied to the top scoring box detections. pred_instances = self.forward_with_given_boxes(features, pred_instances) return pred_instances, {} def forward_with_given_boxes( self, features: Dict[str, torch.Tensor], instances: List[Instances] ) -> List[Instances]: """ Use the given boxes in `instances` to produce other (non-box) per-ROI outputs. This is useful for downstream tasks where a box is known, but need to obtain other attributes (outputs of other heads). Test-time augmentation also uses this. Args: features: same as in `forward()` instances (list[Instances]): instances to predict other outputs. Expect the keys "pred_boxes" and "pred_classes" to exist. Returns: list[Instances]: the same `Instances` objects, with extra fields such as `pred_masks` or `pred_keypoints`. """ assert not self.training assert instances[0].has("pred_boxes") and instances[0].has("pred_classes") instances = self._forward_mask(features, instances) instances = self._forward_keypoint(features, instances) return instances def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]): """ Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`, the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument. Args: features (dict[str, Tensor]): mapping from feature map names to tensor. Same as in :meth:`ROIHeads.forward`. proposals (list[Instances]): the per-image object proposals with their matching ground truth. Each has fields "proposal_boxes", and "objectness_logits", "gt_classes", "gt_boxes". Returns: In training, a dict of losses. In inference, a list of `Instances`, the predicted instances. """ features = [features[f] for f in self.box_in_features] box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) box_features = self.box_head(box_features) predictions = self.box_predictor(box_features) del box_features if self.training: losses = self.box_predictor.losses(predictions, proposals) # proposals is modified in-place below, so losses must be computed first. if self.train_on_pred_boxes: with torch.no_grad(): pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( predictions, proposals ) for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image) return losses else: pred_instances, _ = self.box_predictor.inference(predictions, proposals) return pred_instances def _forward_mask(self, features: Dict[str, torch.Tensor], instances: List[Instances]): """ Forward logic of the mask prediction branch. Args: features (dict[str, Tensor]): mapping from feature map names to tensor. Same as in :meth:`ROIHeads.forward`. instances (list[Instances]): the per-image instances to train/predict masks. In training, they can be the proposals. In inference, they can be the boxes predicted by R-CNN box head. Returns: In training, a dict of losses. In inference, update `instances` with new fields "pred_masks" and return it. """ if not self.mask_on: return {} if self.training else instances if self.training: # head is only trained on positive proposals. instances, _ = select_foreground_proposals(instances, self.num_classes) if self.mask_pooler is not None: features = [features[f] for f in self.mask_in_features] boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in instances] features = self.mask_pooler(features, boxes) else: features = {f: features[f] for f in self.mask_in_features} return self.mask_head(features, instances) def _forward_keypoint(self, features: Dict[str, torch.Tensor], instances: List[Instances]): """ Forward logic of the keypoint prediction branch. Args: features (dict[str, Tensor]): mapping from feature map names to tensor. Same as in :meth:`ROIHeads.forward`. instances (list[Instances]): the per-image instances to train/predict keypoints. In training, they can be the proposals. In inference, they can be the boxes predicted by R-CNN box head. Returns: In training, a dict of losses. In inference, update `instances` with new fields "pred_keypoints" and return it. """ if not self.keypoint_on: return {} if self.training else instances if self.training: # head is only trained on positive proposals with >=1 visible keypoints. instances, _ = select_foreground_proposals(instances, self.num_classes) instances = select_proposals_with_visible_keypoints(instances) if self.keypoint_pooler is not None: features = [features[f] for f in self.keypoint_in_features] boxes = [x.proposal_boxes if self.training else x.pred_boxes for x in instances] features = self.keypoint_pooler(features, boxes) else: features = {f: features[f] for f in self.keypoint_in_features} return self.keypoint_head(features, instances) ================================================ FILE: annotator/oneformer/detectron2/modeling/roi_heads/rotated_fast_rcnn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np import torch from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms_rotated from annotator.oneformer.detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated from annotator.oneformer.detectron2.utils.events import get_event_storage from ..box_regression import Box2BoxTransformRotated from ..poolers import ROIPooler from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals from .box_head import build_box_head from .fast_rcnn import FastRCNNOutputLayers from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads logger = logging.getLogger(__name__) """ Shape shorthand in this module: N: number of images in the minibatch R: number of ROIs, combined over all images, in the minibatch Ri: number of ROIs in image i K: number of foreground classes. E.g.,there are 80 foreground classes in COCO. Naming convention: deltas: refers to the 5-d (dx, dy, dw, dh, da) deltas that parameterize the box2box transform (see :class:`box_regression.Box2BoxTransformRotated`). pred_class_logits: predicted class scores in [-inf, +inf]; use softmax(pred_class_logits) to estimate P(class). gt_classes: ground-truth classification labels in [0, K], where [0, K) represent foreground object classes and K represents the background class. pred_proposal_deltas: predicted rotated box2box transform deltas for transforming proposals to detection box predictions. gt_proposal_deltas: ground-truth rotated box2box transform deltas """ def fast_rcnn_inference_rotated( boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image ): """ Call `fast_rcnn_inference_single_image_rotated` for all images. Args: boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic boxes for each image. Element i has shape (Ri, K * 5) if doing class-specific regression, or (Ri, 5) if doing class-agnostic regression, where Ri is the number of predicted objects for image i. This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. scores (list[Tensor]): A list of Tensors of predicted class scores for each image. Element i has shape (Ri, K + 1), where Ri is the number of predicted objects for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch. score_thresh (float): Only return detections with a confidence score exceeding this threshold. nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. topk_per_image (int): The number of top scoring detections to return. Set < 0 to return all detections. Returns: instances: (list[Instances]): A list of N instances, one for each image in the batch, that stores the topk most confidence detections. kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates the corresponding boxes/scores index in [0, Ri) from the input, for image i. """ result_per_image = [ fast_rcnn_inference_single_image_rotated( boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image ) for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes) ] return [x[0] for x in result_per_image], [x[1] for x in result_per_image] @torch.no_grad() def fast_rcnn_inference_single_image_rotated( boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image ): """ Single-image inference. Return rotated bounding-box detection results by thresholding on scores and applying rotated non-maximum suppression (Rotated NMS). Args: Same as `fast_rcnn_inference_rotated`, but with rotated boxes, scores, and image shapes per image. Returns: Same as `fast_rcnn_inference_rotated`, but for only one image. """ valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) if not valid_mask.all(): boxes = boxes[valid_mask] scores = scores[valid_mask] B = 5 # box dimension scores = scores[:, :-1] num_bbox_reg_classes = boxes.shape[1] // B # Convert to Boxes to use the `clip` function ... boxes = RotatedBoxes(boxes.reshape(-1, B)) boxes.clip(image_shape) boxes = boxes.tensor.view(-1, num_bbox_reg_classes, B) # R x C x B # Filter results based on detection scores filter_mask = scores > score_thresh # R x K # R' x 2. First column contains indices of the R predictions; # Second column contains indices of classes. filter_inds = filter_mask.nonzero() if num_bbox_reg_classes == 1: boxes = boxes[filter_inds[:, 0], 0] else: boxes = boxes[filter_mask] scores = scores[filter_mask] # Apply per-class Rotated NMS keep = batched_nms_rotated(boxes, scores, filter_inds[:, 1], nms_thresh) if topk_per_image >= 0: keep = keep[:topk_per_image] boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep] result = Instances(image_shape) result.pred_boxes = RotatedBoxes(boxes) result.scores = scores result.pred_classes = filter_inds[:, 1] return result, filter_inds[:, 0] class RotatedFastRCNNOutputLayers(FastRCNNOutputLayers): """ Two linear layers for predicting Rotated Fast R-CNN outputs. """ @classmethod def from_config(cls, cfg, input_shape): args = super().from_config(cfg, input_shape) args["box2box_transform"] = Box2BoxTransformRotated( weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS ) return args def inference(self, predictions, proposals): """ Returns: list[Instances]: same as `fast_rcnn_inference_rotated`. list[Tensor]: same as `fast_rcnn_inference_rotated`. """ boxes = self.predict_boxes(predictions, proposals) scores = self.predict_probs(predictions, proposals) image_shapes = [x.image_size for x in proposals] return fast_rcnn_inference_rotated( boxes, scores, image_shapes, self.test_score_thresh, self.test_nms_thresh, self.test_topk_per_image, ) @ROI_HEADS_REGISTRY.register() class RROIHeads(StandardROIHeads): """ This class is used by Rotated Fast R-CNN to detect rotated boxes. For now, it only supports box predictions but not mask or keypoints. """ @configurable def __init__(self, **kwargs): """ NOTE: this interface is experimental. """ super().__init__(**kwargs) assert ( not self.mask_on and not self.keypoint_on ), "Mask/Keypoints not supported in Rotated ROIHeads." assert not self.train_on_pred_boxes, "train_on_pred_boxes not implemented for RROIHeads!" @classmethod def _init_box_head(cls, cfg, input_shape): # fmt: off in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE # fmt: on assert pooler_type in ["ROIAlignRotated"], pooler_type # assume all channel counts are equal in_channels = [input_shape[f].channels for f in in_features][0] box_pooler = ROIPooler( output_size=pooler_resolution, scales=pooler_scales, sampling_ratio=sampling_ratio, pooler_type=pooler_type, ) box_head = build_box_head( cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution) ) # This line is the only difference v.s. StandardROIHeads box_predictor = RotatedFastRCNNOutputLayers(cfg, box_head.output_shape) return { "box_in_features": in_features, "box_pooler": box_pooler, "box_head": box_head, "box_predictor": box_predictor, } @torch.no_grad() def label_and_sample_proposals(self, proposals, targets): """ Prepare some proposals to be used to train the RROI heads. It performs box matching between `proposals` and `targets`, and assigns training labels to the proposals. It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes, with a fraction of positives that is no larger than `self.positive_sample_fraction. Args: See :meth:`StandardROIHeads.forward` Returns: list[Instances]: length `N` list of `Instances`s containing the proposals sampled for training. Each `Instances` has the following fields: - proposal_boxes: the rotated proposal boxes - gt_boxes: the ground-truth rotated boxes that the proposal is assigned to (this is only meaningful if the proposal has a label > 0; if label = 0 then the ground-truth box is random) - gt_classes: the ground-truth classification lable for each proposal """ if self.proposal_append_gt: proposals = add_ground_truth_to_proposals(targets, proposals) proposals_with_gt = [] num_fg_samples = [] num_bg_samples = [] for proposals_per_image, targets_per_image in zip(proposals, targets): has_gt = len(targets_per_image) > 0 match_quality_matrix = pairwise_iou_rotated( targets_per_image.gt_boxes, proposals_per_image.proposal_boxes ) matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix) sampled_idxs, gt_classes = self._sample_proposals( matched_idxs, matched_labels, targets_per_image.gt_classes ) proposals_per_image = proposals_per_image[sampled_idxs] proposals_per_image.gt_classes = gt_classes if has_gt: sampled_targets = matched_idxs[sampled_idxs] proposals_per_image.gt_boxes = targets_per_image.gt_boxes[sampled_targets] num_bg_samples.append((gt_classes == self.num_classes).sum().item()) num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) proposals_with_gt.append(proposals_per_image) # Log the number of fg/bg samples that are selected for training ROI heads storage = get_event_storage() storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) return proposals_with_gt ================================================ FILE: annotator/oneformer/detectron2/modeling/sampling.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch from annotator.oneformer.detectron2.layers import nonzero_tuple __all__ = ["subsample_labels"] def subsample_labels( labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int ): """ Return `num_samples` (or fewer, if not enough found) random samples from `labels` which is a mixture of positives & negatives. It will try to return as many positives as possible without exceeding `positive_fraction * num_samples`, and then try to fill the remaining slots with negatives. Args: labels (Tensor): (N, ) label vector with values: * -1: ignore * bg_label: background ("negative") class * otherwise: one or more foreground ("positive") classes num_samples (int): The total number of labels with value >= 0 to return. Values that are not sampled will be filled with -1 (ignore). positive_fraction (float): The number of subsampled labels with values > 0 is `min(num_positives, int(positive_fraction * num_samples))`. The number of negatives sampled is `min(num_negatives, num_samples - num_positives_sampled)`. In order words, if there are not enough positives, the sample is filled with negatives. If there are also not enough negatives, then as many elements are sampled as is possible. bg_label (int): label index of background ("negative") class. Returns: pos_idx, neg_idx (Tensor): 1D vector of indices. The total length of both is `num_samples` or fewer. """ positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0] negative = nonzero_tuple(labels == bg_label)[0] num_pos = int(num_samples * positive_fraction) # protect against not enough positive examples num_pos = min(positive.numel(), num_pos) num_neg = num_samples - num_pos # protect against not enough negative examples num_neg = min(negative.numel(), num_neg) # randomly select positive and negative examples perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] pos_idx = positive[perm1] neg_idx = negative[perm2] return pos_idx, neg_idx ================================================ FILE: annotator/oneformer/detectron2/modeling/test_time_augmentation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import numpy as np from contextlib import contextmanager from itertools import count from typing import List import torch from fvcore.transforms import HFlipTransform, NoOpTransform from torch import nn from torch.nn.parallel import DistributedDataParallel from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.data.detection_utils import read_image from annotator.oneformer.detectron2.data.transforms import ( RandomFlip, ResizeShortestEdge, ResizeTransform, apply_augmentations, ) from annotator.oneformer.detectron2.structures import Boxes, Instances from .meta_arch import GeneralizedRCNN from .postprocessing import detector_postprocess from .roi_heads.fast_rcnn import fast_rcnn_inference_single_image __all__ = ["DatasetMapperTTA", "GeneralizedRCNNWithTTA"] class DatasetMapperTTA: """ Implement test-time augmentation for detection data. It is a callable which takes a dataset dict from a detection dataset, and returns a list of dataset dicts where the images are augmented from the input image by the transformations defined in the config. This is used for test-time augmentation. """ @configurable def __init__(self, min_sizes: List[int], max_size: int, flip: bool): """ Args: min_sizes: list of short-edge size to resize the image to max_size: maximum height or width of resized images flip: whether to apply flipping augmentation """ self.min_sizes = min_sizes self.max_size = max_size self.flip = flip @classmethod def from_config(cls, cfg): return { "min_sizes": cfg.TEST.AUG.MIN_SIZES, "max_size": cfg.TEST.AUG.MAX_SIZE, "flip": cfg.TEST.AUG.FLIP, } def __call__(self, dataset_dict): """ Args: dict: a dict in standard model input format. See tutorials for details. Returns: list[dict]: a list of dicts, which contain augmented version of the input image. The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``. Each dict has field "transforms" which is a TransformList, containing the transforms that are used to generate this image. """ numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy() shape = numpy_image.shape orig_shape = (dataset_dict["height"], dataset_dict["width"]) if shape[:2] != orig_shape: # It transforms the "original" image in the dataset to the input image pre_tfm = ResizeTransform(orig_shape[0], orig_shape[1], shape[0], shape[1]) else: pre_tfm = NoOpTransform() # Create all combinations of augmentations to use aug_candidates = [] # each element is a list[Augmentation] for min_size in self.min_sizes: resize = ResizeShortestEdge(min_size, self.max_size) aug_candidates.append([resize]) # resize only if self.flip: flip = RandomFlip(prob=1.0) aug_candidates.append([resize, flip]) # resize + flip # Apply all the augmentations ret = [] for aug in aug_candidates: new_image, tfms = apply_augmentations(aug, np.copy(numpy_image)) torch_image = torch.from_numpy(np.ascontiguousarray(new_image.transpose(2, 0, 1))) dic = copy.deepcopy(dataset_dict) dic["transforms"] = pre_tfm + tfms dic["image"] = torch_image ret.append(dic) return ret class GeneralizedRCNNWithTTA(nn.Module): """ A GeneralizedRCNN with test-time augmentation enabled. Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`. """ def __init__(self, cfg, model, tta_mapper=None, batch_size=3): """ Args: cfg (CfgNode): model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on. tta_mapper (callable): takes a dataset dict and returns a list of augmented versions of the dataset dict. Defaults to `DatasetMapperTTA(cfg)`. batch_size (int): batch the augmented images into this batch size for inference. """ super().__init__() if isinstance(model, DistributedDataParallel): model = model.module assert isinstance( model, GeneralizedRCNN ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model)) self.cfg = cfg.clone() assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet" assert ( not self.cfg.MODEL.LOAD_PROPOSALS ), "TTA for pre-computed proposals is not supported yet" self.model = model if tta_mapper is None: tta_mapper = DatasetMapperTTA(cfg) self.tta_mapper = tta_mapper self.batch_size = batch_size @contextmanager def _turn_off_roi_heads(self, attrs): """ Open a context where some heads in `model.roi_heads` are temporarily turned off. Args: attr (list[str]): the attribute in `model.roi_heads` which can be used to turn off a specific head, e.g., "mask_on", "keypoint_on". """ roi_heads = self.model.roi_heads old = {} for attr in attrs: try: old[attr] = getattr(roi_heads, attr) except AttributeError: # The head may not be implemented in certain ROIHeads pass if len(old.keys()) == 0: yield else: for attr in old.keys(): setattr(roi_heads, attr, False) yield for attr in old.keys(): setattr(roi_heads, attr, old[attr]) def _batch_inference(self, batched_inputs, detected_instances=None): """ Execute inference on a list of inputs, using batch size = self.batch_size, instead of the length of the list. Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference` """ if detected_instances is None: detected_instances = [None] * len(batched_inputs) outputs = [] inputs, instances = [], [] for idx, input, instance in zip(count(), batched_inputs, detected_instances): inputs.append(input) instances.append(instance) if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1: outputs.extend( self.model.inference( inputs, instances if instances[0] is not None else None, do_postprocess=False, ) ) inputs, instances = [], [] return outputs def __call__(self, batched_inputs): """ Same input/output format as :meth:`GeneralizedRCNN.forward` """ def _maybe_read_image(dataset_dict): ret = copy.copy(dataset_dict) if "image" not in ret: image = read_image(ret.pop("file_name"), self.model.input_format) image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1))) # CHW ret["image"] = image if "height" not in ret and "width" not in ret: ret["height"] = image.shape[1] ret["width"] = image.shape[2] return ret return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs] def _inference_one_image(self, input): """ Args: input (dict): one dataset dict with "image" field being a CHW tensor Returns: dict: one output dict """ orig_shape = (input["height"], input["width"]) augmented_inputs, tfms = self._get_augmented_inputs(input) # Detect boxes from all augmented versions with self._turn_off_roi_heads(["mask_on", "keypoint_on"]): # temporarily disable roi heads all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms) # merge all detected boxes to obtain final predictions for boxes merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape) if self.cfg.MODEL.MASK_ON: # Use the detected boxes to obtain masks augmented_instances = self._rescale_detected_boxes( augmented_inputs, merged_instances, tfms ) # run forward on the detected boxes outputs = self._batch_inference(augmented_inputs, augmented_instances) # Delete now useless variables to avoid being out of memory del augmented_inputs, augmented_instances # average the predictions merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms) merged_instances = detector_postprocess(merged_instances, *orig_shape) return {"instances": merged_instances} else: return {"instances": merged_instances} def _get_augmented_inputs(self, input): augmented_inputs = self.tta_mapper(input) tfms = [x.pop("transforms") for x in augmented_inputs] return augmented_inputs, tfms def _get_augmented_boxes(self, augmented_inputs, tfms): # 1: forward with all augmented images outputs = self._batch_inference(augmented_inputs) # 2: union the results all_boxes = [] all_scores = [] all_classes = [] for output, tfm in zip(outputs, tfms): # Need to inverse the transforms on boxes, to obtain results on original image pred_boxes = output.pred_boxes.tensor original_pred_boxes = tfm.inverse().apply_box(pred_boxes.cpu().numpy()) all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device)) all_scores.extend(output.scores) all_classes.extend(output.pred_classes) all_boxes = torch.cat(all_boxes, dim=0) return all_boxes, all_scores, all_classes def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw): # select from the union of all results num_boxes = len(all_boxes) num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES # +1 because fast_rcnn_inference expects background scores as well all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device) for idx, cls, score in zip(count(), all_classes, all_scores): all_scores_2d[idx, cls] = score merged_instances, _ = fast_rcnn_inference_single_image( all_boxes, all_scores_2d, shape_hw, 1e-8, self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST, self.cfg.TEST.DETECTIONS_PER_IMAGE, ) return merged_instances def _rescale_detected_boxes(self, augmented_inputs, merged_instances, tfms): augmented_instances = [] for input, tfm in zip(augmented_inputs, tfms): # Transform the target box to the augmented image's coordinate space pred_boxes = merged_instances.pred_boxes.tensor.cpu().numpy() pred_boxes = torch.from_numpy(tfm.apply_box(pred_boxes)) aug_instances = Instances( image_size=input["image"].shape[1:3], pred_boxes=Boxes(pred_boxes), pred_classes=merged_instances.pred_classes, scores=merged_instances.scores, ) augmented_instances.append(aug_instances) return augmented_instances def _reduce_pred_masks(self, outputs, tfms): # Should apply inverse transforms on masks. # We assume only resize & flip are used. pred_masks is a scale-invariant # representation, so we handle flip specially for output, tfm in zip(outputs, tfms): if any(isinstance(t, HFlipTransform) for t in tfm.transforms): output.pred_masks = output.pred_masks.flip(dims=[3]) all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0) avg_pred_masks = torch.mean(all_pred_masks, dim=0) return avg_pred_masks ================================================ FILE: annotator/oneformer/detectron2/projects/README.md ================================================ Projects live in the [`projects` directory](../../projects) under the root of this repository, but not here. ================================================ FILE: annotator/oneformer/detectron2/projects/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import importlib.abc import importlib.util from pathlib import Path __all__ = [] _PROJECTS = { "point_rend": "PointRend", "deeplab": "DeepLab", "panoptic_deeplab": "Panoptic-DeepLab", } _PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent / "projects" if _PROJECT_ROOT.is_dir(): # This is true only for in-place installation (pip install -e, setup.py develop), # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230 class _D2ProjectsFinder(importlib.abc.MetaPathFinder): def find_spec(self, name, path, target=None): if not name.startswith("detectron2.projects."): return project_name = name.split(".")[-1] project_dir = _PROJECTS.get(project_name) if not project_dir: return target_file = _PROJECT_ROOT / f"{project_dir}/{project_name}/__init__.py" if not target_file.is_file(): return return importlib.util.spec_from_file_location(name, target_file) import sys sys.meta_path.append(_D2ProjectsFinder()) ================================================ FILE: annotator/oneformer/detectron2/projects/deeplab/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .build_solver import build_lr_scheduler from .config import add_deeplab_config from .resnet import build_resnet_deeplab_backbone from .semantic_seg import DeepLabV3Head, DeepLabV3PlusHead ================================================ FILE: annotator/oneformer/detectron2/projects/deeplab/build_solver.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch from annotator.oneformer.detectron2.config import CfgNode from annotator.oneformer.detectron2.solver import LRScheduler from annotator.oneformer.detectron2.solver import build_lr_scheduler as build_d2_lr_scheduler from .lr_scheduler import WarmupPolyLR def build_lr_scheduler(cfg: CfgNode, optimizer: torch.optim.Optimizer) -> LRScheduler: """ Build a LR scheduler from config. """ name = cfg.SOLVER.LR_SCHEDULER_NAME if name == "WarmupPolyLR": return WarmupPolyLR( optimizer, cfg.SOLVER.MAX_ITER, warmup_factor=cfg.SOLVER.WARMUP_FACTOR, warmup_iters=cfg.SOLVER.WARMUP_ITERS, warmup_method=cfg.SOLVER.WARMUP_METHOD, power=cfg.SOLVER.POLY_LR_POWER, constant_ending=cfg.SOLVER.POLY_LR_CONSTANT_ENDING, ) else: return build_d2_lr_scheduler(cfg, optimizer) ================================================ FILE: annotator/oneformer/detectron2/projects/deeplab/config.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. def add_deeplab_config(cfg): """ Add config for DeepLab. """ # We retry random cropping until no single category in semantic segmentation GT occupies more # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 # Used for `poly` learning rate schedule. cfg.SOLVER.POLY_LR_POWER = 0.9 cfg.SOLVER.POLY_LR_CONSTANT_ENDING = 0.0 # Loss type, choose from `cross_entropy`, `hard_pixel_mining`. cfg.MODEL.SEM_SEG_HEAD.LOSS_TYPE = "hard_pixel_mining" # DeepLab settings cfg.MODEL.SEM_SEG_HEAD.PROJECT_FEATURES = ["res2"] cfg.MODEL.SEM_SEG_HEAD.PROJECT_CHANNELS = [48] cfg.MODEL.SEM_SEG_HEAD.ASPP_CHANNELS = 256 cfg.MODEL.SEM_SEG_HEAD.ASPP_DILATIONS = [6, 12, 18] cfg.MODEL.SEM_SEG_HEAD.ASPP_DROPOUT = 0.1 cfg.MODEL.SEM_SEG_HEAD.USE_DEPTHWISE_SEPARABLE_CONV = False # Backbone new configs cfg.MODEL.RESNETS.RES4_DILATION = 1 cfg.MODEL.RESNETS.RES5_MULTI_GRID = [1, 2, 4] # ResNet stem type from: `basic`, `deeplab` cfg.MODEL.RESNETS.STEM_TYPE = "deeplab" ================================================ FILE: annotator/oneformer/detectron2/projects/deeplab/loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch import torch.nn as nn class DeepLabCE(nn.Module): """ Hard pixel mining with cross entropy loss, for semantic segmentation. This is used in TensorFlow DeepLab frameworks. Paper: DeeperLab: Single-Shot Image Parser Reference: https://github.com/tensorflow/models/blob/bd488858d610e44df69da6f89277e9de8a03722c/research/deeplab/utils/train_utils.py#L33 # noqa Arguments: ignore_label: Integer, label to ignore. top_k_percent_pixels: Float, the value lies in [0.0, 1.0]. When its value < 1.0, only compute the loss for the top k percent pixels (e.g., the top 20% pixels). This is useful for hard pixel mining. weight: Tensor, a manual rescaling weight given to each class. """ def __init__(self, ignore_label=-1, top_k_percent_pixels=1.0, weight=None): super(DeepLabCE, self).__init__() self.top_k_percent_pixels = top_k_percent_pixels self.ignore_label = ignore_label self.criterion = nn.CrossEntropyLoss( weight=weight, ignore_index=ignore_label, reduction="none" ) def forward(self, logits, labels, weights=None): if weights is None: pixel_losses = self.criterion(logits, labels).contiguous().view(-1) else: # Apply per-pixel loss weights. pixel_losses = self.criterion(logits, labels) * weights pixel_losses = pixel_losses.contiguous().view(-1) if self.top_k_percent_pixels == 1.0: return pixel_losses.mean() top_k_pixels = int(self.top_k_percent_pixels * pixel_losses.numel()) pixel_losses, _ = torch.topk(pixel_losses, top_k_pixels) return pixel_losses.mean() ================================================ FILE: annotator/oneformer/detectron2/projects/deeplab/lr_scheduler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import math from typing import List import torch from annotator.oneformer.detectron2.solver.lr_scheduler import LRScheduler, _get_warmup_factor_at_iter # NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes # only on epoch boundaries. We typically use iteration based schedules instead. # As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean # "iteration" instead. # FIXME: ideally this would be achieved with a CombinedLRScheduler, separating # MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it. class WarmupPolyLR(LRScheduler): """ Poly learning rate schedule used to train DeepLab. Paper: DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs. Reference: https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/utils/train_utils.py#L337 # noqa """ def __init__( self, optimizer: torch.optim.Optimizer, max_iters: int, warmup_factor: float = 0.001, warmup_iters: int = 1000, warmup_method: str = "linear", last_epoch: int = -1, power: float = 0.9, constant_ending: float = 0.0, ): self.max_iters = max_iters self.warmup_factor = warmup_factor self.warmup_iters = warmup_iters self.warmup_method = warmup_method self.power = power self.constant_ending = constant_ending super().__init__(optimizer, last_epoch) def get_lr(self) -> List[float]: warmup_factor = _get_warmup_factor_at_iter( self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor ) if self.constant_ending > 0 and warmup_factor == 1.0: # Constant ending lr. if ( math.pow((1.0 - self.last_epoch / self.max_iters), self.power) < self.constant_ending ): return [base_lr * self.constant_ending for base_lr in self.base_lrs] return [ base_lr * warmup_factor * math.pow((1.0 - self.last_epoch / self.max_iters), self.power) for base_lr in self.base_lrs ] def _compute_values(self) -> List[float]: # The new interface return self.get_lr() ================================================ FILE: annotator/oneformer/detectron2/projects/deeplab/resnet.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import fvcore.nn.weight_init as weight_init import torch.nn.functional as F from annotator.oneformer.detectron2.layers import CNNBlockBase, Conv2d, get_norm from annotator.oneformer.detectron2.modeling import BACKBONE_REGISTRY from annotator.oneformer.detectron2.modeling.backbone.resnet import ( BasicStem, BottleneckBlock, DeformBottleneckBlock, ResNet, ) class DeepLabStem(CNNBlockBase): """ The DeepLab ResNet stem (layers before the first residual block). """ def __init__(self, in_channels=3, out_channels=128, norm="BN"): """ Args: norm (str or callable): norm after the first conv layer. See :func:`layers.get_norm` for supported format. """ super().__init__(in_channels, out_channels, 4) self.in_channels = in_channels self.conv1 = Conv2d( in_channels, out_channels // 2, kernel_size=3, stride=2, padding=1, bias=False, norm=get_norm(norm, out_channels // 2), ) self.conv2 = Conv2d( out_channels // 2, out_channels // 2, kernel_size=3, stride=1, padding=1, bias=False, norm=get_norm(norm, out_channels // 2), ) self.conv3 = Conv2d( out_channels // 2, out_channels, kernel_size=3, stride=1, padding=1, bias=False, norm=get_norm(norm, out_channels), ) weight_init.c2_msra_fill(self.conv1) weight_init.c2_msra_fill(self.conv2) weight_init.c2_msra_fill(self.conv3) def forward(self, x): x = self.conv1(x) x = F.relu_(x) x = self.conv2(x) x = F.relu_(x) x = self.conv3(x) x = F.relu_(x) x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) return x @BACKBONE_REGISTRY.register() def build_resnet_deeplab_backbone(cfg, input_shape): """ Create a ResNet instance from config. Returns: ResNet: a :class:`ResNet` instance. """ # need registration of new blocks/stems? norm = cfg.MODEL.RESNETS.NORM if cfg.MODEL.RESNETS.STEM_TYPE == "basic": stem = BasicStem( in_channels=input_shape.channels, out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, norm=norm, ) elif cfg.MODEL.RESNETS.STEM_TYPE == "deeplab": stem = DeepLabStem( in_channels=input_shape.channels, out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, norm=norm, ) else: raise ValueError("Unknown stem type: {}".format(cfg.MODEL.RESNETS.STEM_TYPE)) # fmt: off freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT out_features = cfg.MODEL.RESNETS.OUT_FEATURES depth = cfg.MODEL.RESNETS.DEPTH num_groups = cfg.MODEL.RESNETS.NUM_GROUPS width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP bottleneck_channels = num_groups * width_per_group in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 res4_dilation = cfg.MODEL.RESNETS.RES4_DILATION res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS res5_multi_grid = cfg.MODEL.RESNETS.RES5_MULTI_GRID # fmt: on assert res4_dilation in {1, 2}, "res4_dilation cannot be {}.".format(res4_dilation) assert res5_dilation in {1, 2, 4}, "res5_dilation cannot be {}.".format(res5_dilation) if res4_dilation == 2: # Always dilate res5 if res4 is dilated. assert res5_dilation == 4 num_blocks_per_stage = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth] stages = [] # Avoid creating variables without gradients # It consumes extra memory and may cause allreduce to fail out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features] max_stage_idx = max(out_stage_idx) for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)): if stage_idx == 4: dilation = res4_dilation elif stage_idx == 5: dilation = res5_dilation else: dilation = 1 first_stride = 1 if idx == 0 or dilation > 1 else 2 stage_kargs = { "num_blocks": num_blocks_per_stage[idx], "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1), "in_channels": in_channels, "out_channels": out_channels, "norm": norm, } stage_kargs["bottleneck_channels"] = bottleneck_channels stage_kargs["stride_in_1x1"] = stride_in_1x1 stage_kargs["dilation"] = dilation stage_kargs["num_groups"] = num_groups if deform_on_per_stage[idx]: stage_kargs["block_class"] = DeformBottleneckBlock stage_kargs["deform_modulated"] = deform_modulated stage_kargs["deform_num_groups"] = deform_num_groups else: stage_kargs["block_class"] = BottleneckBlock if stage_idx == 5: stage_kargs.pop("dilation") stage_kargs["dilation_per_block"] = [dilation * mg for mg in res5_multi_grid] blocks = ResNet.make_stage(**stage_kargs) in_channels = out_channels out_channels *= 2 bottleneck_channels *= 2 stages.append(blocks) return ResNet(stem, stages, out_features=out_features).freeze(freeze_at) ================================================ FILE: annotator/oneformer/detectron2/projects/deeplab/semantic_seg.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import ASPP, Conv2d, DepthwiseSeparableConv2d, ShapeSpec, get_norm from annotator.oneformer.detectron2.modeling import SEM_SEG_HEADS_REGISTRY from .loss import DeepLabCE @SEM_SEG_HEADS_REGISTRY.register() class DeepLabV3PlusHead(nn.Module): """ A semantic segmentation head described in :paper:`DeepLabV3+`. """ @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, project_channels: List[int], aspp_dilations: List[int], aspp_dropout: float, decoder_channels: List[int], common_stride: int, norm: Union[str, Callable], train_size: Optional[Tuple], loss_weight: float = 1.0, loss_type: str = "cross_entropy", ignore_value: int = -1, num_classes: Optional[int] = None, use_depthwise_separable_conv: bool = False, ): """ NOTE: this interface is experimental. Args: input_shape: shape of the input features. They will be ordered by stride and the last one (with largest stride) is used as the input to the decoder (i.e. the ASPP module); the rest are low-level feature for the intermediate levels of decoder. project_channels (list[int]): a list of low-level feature channels. The length should be len(in_features) - 1. aspp_dilations (list(int)): a list of 3 dilations in ASPP. aspp_dropout (float): apply dropout on the output of ASPP. decoder_channels (list[int]): a list of output channels of each decoder stage. It should have the same length as "in_features" (each element in "in_features" corresponds to one decoder stage). common_stride (int): output stride of decoder. norm (str or callable): normalization for all conv layers. train_size (tuple): (height, width) of training images. loss_weight (float): loss weight. loss_type (str): type of loss function, 2 opptions: (1) "cross_entropy" is the standard cross entropy loss. (2) "hard_pixel_mining" is the loss in DeepLab that samples top k% hardest pixels. ignore_value (int): category to be ignored during training. num_classes (int): number of classes, if set to None, the decoder will not construct a predictor. use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d in ASPP and decoder. """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) # fmt: off self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" in_channels = [x[1].channels for x in input_shape] in_strides = [x[1].stride for x in input_shape] aspp_channels = decoder_channels[-1] self.ignore_value = ignore_value self.common_stride = common_stride # output stride self.loss_weight = loss_weight self.loss_type = loss_type self.decoder_only = num_classes is None self.use_depthwise_separable_conv = use_depthwise_separable_conv # fmt: on assert ( len(project_channels) == len(self.in_features) - 1 ), "Expected {} project_channels, got {}".format( len(self.in_features) - 1, len(project_channels) ) assert len(decoder_channels) == len( self.in_features ), "Expected {} decoder_channels, got {}".format( len(self.in_features), len(decoder_channels) ) self.decoder = nn.ModuleDict() use_bias = norm == "" for idx, in_channel in enumerate(in_channels): decoder_stage = nn.ModuleDict() if idx == len(self.in_features) - 1: # ASPP module if train_size is not None: train_h, train_w = train_size encoder_stride = in_strides[-1] if train_h % encoder_stride or train_w % encoder_stride: raise ValueError("Crop size need to be divisible by encoder stride.") pool_h = train_h // encoder_stride pool_w = train_w // encoder_stride pool_kernel_size = (pool_h, pool_w) else: pool_kernel_size = None project_conv = ASPP( in_channel, aspp_channels, aspp_dilations, norm=norm, activation=F.relu, pool_kernel_size=pool_kernel_size, dropout=aspp_dropout, use_depthwise_separable_conv=use_depthwise_separable_conv, ) fuse_conv = None else: project_conv = Conv2d( in_channel, project_channels[idx], kernel_size=1, bias=use_bias, norm=get_norm(norm, project_channels[idx]), activation=F.relu, ) weight_init.c2_xavier_fill(project_conv) if use_depthwise_separable_conv: # We use a single 5x5 DepthwiseSeparableConv2d to replace # 2 3x3 Conv2d since they have the same receptive field, # proposed in :paper:`Panoptic-DeepLab`. fuse_conv = DepthwiseSeparableConv2d( project_channels[idx] + decoder_channels[idx + 1], decoder_channels[idx], kernel_size=5, padding=2, norm1=norm, activation1=F.relu, norm2=norm, activation2=F.relu, ) else: fuse_conv = nn.Sequential( Conv2d( project_channels[idx] + decoder_channels[idx + 1], decoder_channels[idx], kernel_size=3, padding=1, bias=use_bias, norm=get_norm(norm, decoder_channels[idx]), activation=F.relu, ), Conv2d( decoder_channels[idx], decoder_channels[idx], kernel_size=3, padding=1, bias=use_bias, norm=get_norm(norm, decoder_channels[idx]), activation=F.relu, ), ) weight_init.c2_xavier_fill(fuse_conv[0]) weight_init.c2_xavier_fill(fuse_conv[1]) decoder_stage["project_conv"] = project_conv decoder_stage["fuse_conv"] = fuse_conv self.decoder[self.in_features[idx]] = decoder_stage if not self.decoder_only: self.predictor = Conv2d( decoder_channels[0], num_classes, kernel_size=1, stride=1, padding=0 ) nn.init.normal_(self.predictor.weight, 0, 0.001) nn.init.constant_(self.predictor.bias, 0) if self.loss_type == "cross_entropy": self.loss = nn.CrossEntropyLoss(reduction="mean", ignore_index=self.ignore_value) elif self.loss_type == "hard_pixel_mining": self.loss = DeepLabCE(ignore_label=self.ignore_value, top_k_percent_pixels=0.2) else: raise ValueError("Unexpected loss type: %s" % self.loss_type) @classmethod def from_config(cls, cfg, input_shape): if cfg.INPUT.CROP.ENABLED: assert cfg.INPUT.CROP.TYPE == "absolute" train_size = cfg.INPUT.CROP.SIZE else: train_size = None decoder_channels = [cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM] * ( len(cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES) - 1 ) + [cfg.MODEL.SEM_SEG_HEAD.ASPP_CHANNELS] ret = dict( input_shape={ k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES }, project_channels=cfg.MODEL.SEM_SEG_HEAD.PROJECT_CHANNELS, aspp_dilations=cfg.MODEL.SEM_SEG_HEAD.ASPP_DILATIONS, aspp_dropout=cfg.MODEL.SEM_SEG_HEAD.ASPP_DROPOUT, decoder_channels=decoder_channels, common_stride=cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE, norm=cfg.MODEL.SEM_SEG_HEAD.NORM, train_size=train_size, loss_weight=cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, loss_type=cfg.MODEL.SEM_SEG_HEAD.LOSS_TYPE, ignore_value=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, use_depthwise_separable_conv=cfg.MODEL.SEM_SEG_HEAD.USE_DEPTHWISE_SEPARABLE_CONV, ) return ret def forward(self, features, targets=None): """ Returns: In training, returns (None, dict of losses) In inference, returns (CxHxW logits, {}) """ y = self.layers(features) if self.decoder_only: # Output from self.layers() only contains decoder feature. return y if self.training: return None, self.losses(y, targets) else: y = F.interpolate( y, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) return y, {} def layers(self, features): # Reverse feature maps into top-down order (from low to high resolution) for f in self.in_features[::-1]: x = features[f] proj_x = self.decoder[f]["project_conv"](x) if self.decoder[f]["fuse_conv"] is None: # This is aspp module y = proj_x else: # Upsample y y = F.interpolate(y, size=proj_x.size()[2:], mode="bilinear", align_corners=False) y = torch.cat([proj_x, y], dim=1) y = self.decoder[f]["fuse_conv"](y) if not self.decoder_only: y = self.predictor(y) return y def losses(self, predictions, targets): predictions = F.interpolate( predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) loss = self.loss(predictions, targets) losses = {"loss_sem_seg": loss * self.loss_weight} return losses @SEM_SEG_HEADS_REGISTRY.register() class DeepLabV3Head(nn.Module): """ A semantic segmentation head described in :paper:`DeepLabV3`. """ def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): super().__init__() # fmt: off self.in_features = cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES in_channels = [input_shape[f].channels for f in self.in_features] aspp_channels = cfg.MODEL.SEM_SEG_HEAD.ASPP_CHANNELS aspp_dilations = cfg.MODEL.SEM_SEG_HEAD.ASPP_DILATIONS self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE num_classes = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES conv_dims = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM self.common_stride = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE # output stride norm = cfg.MODEL.SEM_SEG_HEAD.NORM self.loss_weight = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT self.loss_type = cfg.MODEL.SEM_SEG_HEAD.LOSS_TYPE train_crop_size = cfg.INPUT.CROP.SIZE aspp_dropout = cfg.MODEL.SEM_SEG_HEAD.ASPP_DROPOUT use_depthwise_separable_conv = cfg.MODEL.SEM_SEG_HEAD.USE_DEPTHWISE_SEPARABLE_CONV # fmt: on assert len(self.in_features) == 1 assert len(in_channels) == 1 # ASPP module if cfg.INPUT.CROP.ENABLED: assert cfg.INPUT.CROP.TYPE == "absolute" train_crop_h, train_crop_w = train_crop_size if train_crop_h % self.common_stride or train_crop_w % self.common_stride: raise ValueError("Crop size need to be divisible by output stride.") pool_h = train_crop_h // self.common_stride pool_w = train_crop_w // self.common_stride pool_kernel_size = (pool_h, pool_w) else: pool_kernel_size = None self.aspp = ASPP( in_channels[0], aspp_channels, aspp_dilations, norm=norm, activation=F.relu, pool_kernel_size=pool_kernel_size, dropout=aspp_dropout, use_depthwise_separable_conv=use_depthwise_separable_conv, ) self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0) nn.init.normal_(self.predictor.weight, 0, 0.001) nn.init.constant_(self.predictor.bias, 0) if self.loss_type == "cross_entropy": self.loss = nn.CrossEntropyLoss(reduction="mean", ignore_index=self.ignore_value) elif self.loss_type == "hard_pixel_mining": self.loss = DeepLabCE(ignore_label=self.ignore_value, top_k_percent_pixels=0.2) else: raise ValueError("Unexpected loss type: %s" % self.loss_type) def forward(self, features, targets=None): """ Returns: In training, returns (None, dict of losses) In inference, returns (CxHxW logits, {}) """ x = features[self.in_features[0]] x = self.aspp(x) x = self.predictor(x) if self.training: return None, self.losses(x, targets) else: x = F.interpolate( x, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) return x, {} def losses(self, predictions, targets): predictions = F.interpolate( predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False ) loss = self.loss(predictions, targets) losses = {"loss_sem_seg": loss * self.loss_weight} return losses ================================================ FILE: annotator/oneformer/detectron2/solver/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .build import build_lr_scheduler, build_optimizer, get_default_optimizer_params from .lr_scheduler import ( LRMultiplier, LRScheduler, WarmupCosineLR, WarmupMultiStepLR, WarmupParamScheduler, ) __all__ = [k for k in globals().keys() if not k.startswith("_")] ================================================ FILE: annotator/oneformer/detectron2/solver/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import itertools import logging from collections import defaultdict from enum import Enum from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Type, Union import torch from fvcore.common.param_scheduler import ( CosineParamScheduler, MultiStepParamScheduler, StepWithFixedGammaParamScheduler, ) from annotator.oneformer.detectron2.config import CfgNode from annotator.oneformer.detectron2.utils.env import TORCH_VERSION from .lr_scheduler import LRMultiplier, LRScheduler, WarmupParamScheduler _GradientClipperInput = Union[torch.Tensor, Iterable[torch.Tensor]] _GradientClipper = Callable[[_GradientClipperInput], None] class GradientClipType(Enum): VALUE = "value" NORM = "norm" def _create_gradient_clipper(cfg: CfgNode) -> _GradientClipper: """ Creates gradient clipping closure to clip by value or by norm, according to the provided config. """ cfg = copy.deepcopy(cfg) def clip_grad_norm(p: _GradientClipperInput): torch.nn.utils.clip_grad_norm_(p, cfg.CLIP_VALUE, cfg.NORM_TYPE) def clip_grad_value(p: _GradientClipperInput): torch.nn.utils.clip_grad_value_(p, cfg.CLIP_VALUE) _GRADIENT_CLIP_TYPE_TO_CLIPPER = { GradientClipType.VALUE: clip_grad_value, GradientClipType.NORM: clip_grad_norm, } return _GRADIENT_CLIP_TYPE_TO_CLIPPER[GradientClipType(cfg.CLIP_TYPE)] def _generate_optimizer_class_with_gradient_clipping( optimizer: Type[torch.optim.Optimizer], *, per_param_clipper: Optional[_GradientClipper] = None, global_clipper: Optional[_GradientClipper] = None, ) -> Type[torch.optim.Optimizer]: """ Dynamically creates a new type that inherits the type of a given instance and overrides the `step` method to add gradient clipping """ assert ( per_param_clipper is None or global_clipper is None ), "Not allowed to use both per-parameter clipping and global clipping" def optimizer_wgc_step(self, closure=None): if per_param_clipper is not None: for group in self.param_groups: for p in group["params"]: per_param_clipper(p) else: # global clipper for future use with detr # (https://github.com/facebookresearch/detr/pull/287) all_params = itertools.chain(*[g["params"] for g in self.param_groups]) global_clipper(all_params) super(type(self), self).step(closure) OptimizerWithGradientClip = type( optimizer.__name__ + "WithGradientClip", (optimizer,), {"step": optimizer_wgc_step}, ) return OptimizerWithGradientClip def maybe_add_gradient_clipping( cfg: CfgNode, optimizer: Type[torch.optim.Optimizer] ) -> Type[torch.optim.Optimizer]: """ If gradient clipping is enabled through config options, wraps the existing optimizer type to become a new dynamically created class OptimizerWithGradientClip that inherits the given optimizer and overrides the `step` method to include gradient clipping. Args: cfg: CfgNode, configuration options optimizer: type. A subclass of torch.optim.Optimizer Return: type: either the input `optimizer` (if gradient clipping is disabled), or a subclass of it with gradient clipping included in the `step` method. """ if not cfg.SOLVER.CLIP_GRADIENTS.ENABLED: return optimizer if isinstance(optimizer, torch.optim.Optimizer): optimizer_type = type(optimizer) else: assert issubclass(optimizer, torch.optim.Optimizer), optimizer optimizer_type = optimizer grad_clipper = _create_gradient_clipper(cfg.SOLVER.CLIP_GRADIENTS) OptimizerWithGradientClip = _generate_optimizer_class_with_gradient_clipping( optimizer_type, per_param_clipper=grad_clipper ) if isinstance(optimizer, torch.optim.Optimizer): optimizer.__class__ = OptimizerWithGradientClip # a bit hacky, not recommended return optimizer else: return OptimizerWithGradientClip def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer: """ Build an optimizer from config. """ params = get_default_optimizer_params( model, base_lr=cfg.SOLVER.BASE_LR, weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM, bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR, weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS, ) sgd_args = { "params": params, "lr": cfg.SOLVER.BASE_LR, "momentum": cfg.SOLVER.MOMENTUM, "nesterov": cfg.SOLVER.NESTEROV, "weight_decay": cfg.SOLVER.WEIGHT_DECAY, } if TORCH_VERSION >= (1, 12): sgd_args["foreach"] = True return maybe_add_gradient_clipping(cfg, torch.optim.SGD(**sgd_args)) def get_default_optimizer_params( model: torch.nn.Module, base_lr: Optional[float] = None, weight_decay: Optional[float] = None, weight_decay_norm: Optional[float] = None, bias_lr_factor: Optional[float] = 1.0, weight_decay_bias: Optional[float] = None, lr_factor_func: Optional[Callable] = None, overrides: Optional[Dict[str, Dict[str, float]]] = None, ) -> List[Dict[str, Any]]: """ Get default param list for optimizer, with support for a few types of overrides. If no overrides needed, this is equivalent to `model.parameters()`. Args: base_lr: lr for every group by default. Can be omitted to use the one in optimizer. weight_decay: weight decay for every group by default. Can be omitted to use the one in optimizer. weight_decay_norm: override weight decay for params in normalization layers bias_lr_factor: multiplier of lr for bias parameters. weight_decay_bias: override weight decay for bias parameters. lr_factor_func: function to calculate lr decay rate by mapping the parameter names to corresponding lr decay rate. Note that setting this option requires also setting ``base_lr``. overrides: if not `None`, provides values for optimizer hyperparameters (LR, weight decay) for module parameters with a given name; e.g. ``{"embedding": {"lr": 0.01, "weight_decay": 0.1}}`` will set the LR and weight decay values for all module parameters named `embedding`. For common detection models, ``weight_decay_norm`` is the only option needed to be set. ``bias_lr_factor,weight_decay_bias`` are legacy settings from Detectron1 that are not found useful. Example: :: torch.optim.SGD(get_default_optimizer_params(model, weight_decay_norm=0), lr=0.01, weight_decay=1e-4, momentum=0.9) """ if overrides is None: overrides = {} defaults = {} if base_lr is not None: defaults["lr"] = base_lr if weight_decay is not None: defaults["weight_decay"] = weight_decay bias_overrides = {} if bias_lr_factor is not None and bias_lr_factor != 1.0: # NOTE: unlike Detectron v1, we now by default make bias hyperparameters # exactly the same as regular weights. if base_lr is None: raise ValueError("bias_lr_factor requires base_lr") bias_overrides["lr"] = base_lr * bias_lr_factor if weight_decay_bias is not None: bias_overrides["weight_decay"] = weight_decay_bias if len(bias_overrides): if "bias" in overrides: raise ValueError("Conflicting overrides for 'bias'") overrides["bias"] = bias_overrides if lr_factor_func is not None: if base_lr is None: raise ValueError("lr_factor_func requires base_lr") norm_module_types = ( torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d, torch.nn.SyncBatchNorm, # NaiveSyncBatchNorm inherits from BatchNorm2d torch.nn.GroupNorm, torch.nn.InstanceNorm1d, torch.nn.InstanceNorm2d, torch.nn.InstanceNorm3d, torch.nn.LayerNorm, torch.nn.LocalResponseNorm, ) params: List[Dict[str, Any]] = [] memo: Set[torch.nn.parameter.Parameter] = set() for module_name, module in model.named_modules(): for module_param_name, value in module.named_parameters(recurse=False): if not value.requires_grad: continue # Avoid duplicating parameters if value in memo: continue memo.add(value) hyperparams = copy.copy(defaults) if isinstance(module, norm_module_types) and weight_decay_norm is not None: hyperparams["weight_decay"] = weight_decay_norm if lr_factor_func is not None: hyperparams["lr"] *= lr_factor_func(f"{module_name}.{module_param_name}") hyperparams.update(overrides.get(module_param_name, {})) params.append({"params": [value], **hyperparams}) return reduce_param_groups(params) def _expand_param_groups(params: List[Dict[str, Any]]) -> List[Dict[str, Any]]: # Transform parameter groups into per-parameter structure. # Later items in `params` can overwrite parameters set in previous items. ret = defaultdict(dict) for item in params: assert "params" in item cur_params = {x: y for x, y in item.items() if x != "params"} for param in item["params"]: ret[param].update({"params": [param], **cur_params}) return list(ret.values()) def reduce_param_groups(params: List[Dict[str, Any]]) -> List[Dict[str, Any]]: # Reorganize the parameter groups and merge duplicated groups. # The number of parameter groups needs to be as small as possible in order # to efficiently use the PyTorch multi-tensor optimizer. Therefore instead # of using a parameter_group per single parameter, we reorganize the # parameter groups and merge duplicated groups. This approach speeds # up multi-tensor optimizer significantly. params = _expand_param_groups(params) groups = defaultdict(list) # re-group all parameter groups by their hyperparams for item in params: cur_params = tuple((x, y) for x, y in item.items() if x != "params") groups[cur_params].extend(item["params"]) ret = [] for param_keys, param_values in groups.items(): cur = {kv[0]: kv[1] for kv in param_keys} cur["params"] = param_values ret.append(cur) return ret def build_lr_scheduler(cfg: CfgNode, optimizer: torch.optim.Optimizer) -> LRScheduler: """ Build a LR scheduler from config. """ name = cfg.SOLVER.LR_SCHEDULER_NAME if name == "WarmupMultiStepLR": steps = [x for x in cfg.SOLVER.STEPS if x <= cfg.SOLVER.MAX_ITER] if len(steps) != len(cfg.SOLVER.STEPS): logger = logging.getLogger(__name__) logger.warning( "SOLVER.STEPS contains values larger than SOLVER.MAX_ITER. " "These values will be ignored." ) sched = MultiStepParamScheduler( values=[cfg.SOLVER.GAMMA**k for k in range(len(steps) + 1)], milestones=steps, num_updates=cfg.SOLVER.MAX_ITER, ) elif name == "WarmupCosineLR": end_value = cfg.SOLVER.BASE_LR_END / cfg.SOLVER.BASE_LR assert end_value >= 0.0 and end_value <= 1.0, end_value sched = CosineParamScheduler(1, end_value) elif name == "WarmupStepWithFixedGammaLR": sched = StepWithFixedGammaParamScheduler( base_value=1.0, gamma=cfg.SOLVER.GAMMA, num_decays=cfg.SOLVER.NUM_DECAYS, num_updates=cfg.SOLVER.MAX_ITER, ) else: raise ValueError("Unknown LR scheduler: {}".format(name)) sched = WarmupParamScheduler( sched, cfg.SOLVER.WARMUP_FACTOR, min(cfg.SOLVER.WARMUP_ITERS / cfg.SOLVER.MAX_ITER, 1.0), cfg.SOLVER.WARMUP_METHOD, cfg.SOLVER.RESCALE_INTERVAL, ) return LRMultiplier(optimizer, multiplier=sched, max_iter=cfg.SOLVER.MAX_ITER) ================================================ FILE: annotator/oneformer/detectron2/solver/lr_scheduler.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import math from bisect import bisect_right from typing import List import torch from fvcore.common.param_scheduler import ( CompositeParamScheduler, ConstantParamScheduler, LinearParamScheduler, ParamScheduler, ) try: from torch.optim.lr_scheduler import LRScheduler except ImportError: from torch.optim.lr_scheduler import _LRScheduler as LRScheduler logger = logging.getLogger(__name__) class WarmupParamScheduler(CompositeParamScheduler): """ Add an initial warmup stage to another scheduler. """ def __init__( self, scheduler: ParamScheduler, warmup_factor: float, warmup_length: float, warmup_method: str = "linear", rescale_interval: bool = False, ): """ Args: scheduler: warmup will be added at the beginning of this scheduler warmup_factor: the factor w.r.t the initial value of ``scheduler``, e.g. 0.001 warmup_length: the relative length (in [0, 1]) of warmup steps w.r.t the entire training, e.g. 0.01 warmup_method: one of "linear" or "constant" rescale_interval: whether we will rescale the interval of the scheduler after warmup """ end_value = scheduler(warmup_length) # the value to reach when warmup ends start_value = warmup_factor * scheduler(0.0) if warmup_method == "constant": warmup = ConstantParamScheduler(start_value) elif warmup_method == "linear": warmup = LinearParamScheduler(start_value, end_value) else: raise ValueError("Unknown warmup method: {}".format(warmup_method)) super().__init__( [warmup, scheduler], interval_scaling=["rescaled", "rescaled" if rescale_interval else "fixed"], lengths=[warmup_length, 1 - warmup_length], ) class LRMultiplier(LRScheduler): """ A LRScheduler which uses fvcore :class:`ParamScheduler` to multiply the learning rate of each param in the optimizer. Every step, the learning rate of each parameter becomes its initial value multiplied by the output of the given :class:`ParamScheduler`. The absolute learning rate value of each parameter can be different. This scheduler can be used as long as the relative scale among them do not change during training. Examples: :: LRMultiplier( opt, WarmupParamScheduler( MultiStepParamScheduler( [1, 0.1, 0.01], milestones=[60000, 80000], num_updates=90000, ), 0.001, 100 / 90000 ), max_iter=90000 ) """ # NOTES: in the most general case, every LR can use its own scheduler. # Supporting this requires interaction with the optimizer when its parameter # group is initialized. For example, classyvision implements its own optimizer # that allows different schedulers for every parameter group. # To avoid this complexity, we use this class to support the most common cases # where the relative scale among all LRs stay unchanged during training. In this # case we only need a total of one scheduler that defines the relative LR multiplier. def __init__( self, optimizer: torch.optim.Optimizer, multiplier: ParamScheduler, max_iter: int, last_iter: int = -1, ): """ Args: optimizer, last_iter: See ``torch.optim.lr_scheduler.LRScheduler``. ``last_iter`` is the same as ``last_epoch``. multiplier: a fvcore ParamScheduler that defines the multiplier on every LR of the optimizer max_iter: the total number of training iterations """ if not isinstance(multiplier, ParamScheduler): raise ValueError( "_LRMultiplier(multiplier=) must be an instance of fvcore " f"ParamScheduler. Got {multiplier} instead." ) self._multiplier = multiplier self._max_iter = max_iter super().__init__(optimizer, last_epoch=last_iter) def state_dict(self): # fvcore schedulers are stateless. Only keep pytorch scheduler states return {"base_lrs": self.base_lrs, "last_epoch": self.last_epoch} def get_lr(self) -> List[float]: multiplier = self._multiplier(self.last_epoch / self._max_iter) return [base_lr * multiplier for base_lr in self.base_lrs] """ Content below is no longer needed! """ # NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes # only on epoch boundaries. We typically use iteration based schedules instead. # As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean # "iteration" instead. # FIXME: ideally this would be achieved with a CombinedLRScheduler, separating # MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it. class WarmupMultiStepLR(LRScheduler): def __init__( self, optimizer: torch.optim.Optimizer, milestones: List[int], gamma: float = 0.1, warmup_factor: float = 0.001, warmup_iters: int = 1000, warmup_method: str = "linear", last_epoch: int = -1, ): logger.warning( "WarmupMultiStepLR is deprecated! Use LRMultipilier with fvcore ParamScheduler instead!" ) if not list(milestones) == sorted(milestones): raise ValueError( "Milestones should be a list of" " increasing integers. Got {}", milestones ) self.milestones = milestones self.gamma = gamma self.warmup_factor = warmup_factor self.warmup_iters = warmup_iters self.warmup_method = warmup_method super().__init__(optimizer, last_epoch) def get_lr(self) -> List[float]: warmup_factor = _get_warmup_factor_at_iter( self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor ) return [ base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch) for base_lr in self.base_lrs ] def _compute_values(self) -> List[float]: # The new interface return self.get_lr() class WarmupCosineLR(LRScheduler): def __init__( self, optimizer: torch.optim.Optimizer, max_iters: int, warmup_factor: float = 0.001, warmup_iters: int = 1000, warmup_method: str = "linear", last_epoch: int = -1, ): logger.warning( "WarmupCosineLR is deprecated! Use LRMultipilier with fvcore ParamScheduler instead!" ) self.max_iters = max_iters self.warmup_factor = warmup_factor self.warmup_iters = warmup_iters self.warmup_method = warmup_method super().__init__(optimizer, last_epoch) def get_lr(self) -> List[float]: warmup_factor = _get_warmup_factor_at_iter( self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor ) # Different definitions of half-cosine with warmup are possible. For # simplicity we multiply the standard half-cosine schedule by the warmup # factor. An alternative is to start the period of the cosine at warmup_iters # instead of at 0. In the case that warmup_iters << max_iters the two are # very close to each other. return [ base_lr * warmup_factor * 0.5 * (1.0 + math.cos(math.pi * self.last_epoch / self.max_iters)) for base_lr in self.base_lrs ] def _compute_values(self) -> List[float]: # The new interface return self.get_lr() def _get_warmup_factor_at_iter( method: str, iter: int, warmup_iters: int, warmup_factor: float ) -> float: """ Return the learning rate warmup factor at a specific iteration. See :paper:`ImageNet in 1h` for more details. Args: method (str): warmup method; either "constant" or "linear". iter (int): iteration at which to calculate the warmup factor. warmup_iters (int): the number of warmup iterations. warmup_factor (float): the base warmup factor (the meaning changes according to the method used). Returns: float: the effective warmup factor at the given iteration. """ if iter >= warmup_iters: return 1.0 if method == "constant": return warmup_factor elif method == "linear": alpha = iter / warmup_iters return warmup_factor * (1 - alpha) + alpha else: raise ValueError("Unknown warmup method: {}".format(method)) ================================================ FILE: annotator/oneformer/detectron2/structures/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .boxes import Boxes, BoxMode, pairwise_iou, pairwise_ioa, pairwise_point_box_distance from .image_list import ImageList from .instances import Instances from .keypoints import Keypoints, heatmaps_to_keypoints from .masks import BitMasks, PolygonMasks, polygons_to_bitmask, ROIMasks from .rotated_boxes import RotatedBoxes from .rotated_boxes import pairwise_iou as pairwise_iou_rotated __all__ = [k for k in globals().keys() if not k.startswith("_")] from annotator.oneformer.detectron2.utils.env import fixup_module_metadata fixup_module_metadata(__name__, globals(), __all__) del fixup_module_metadata ================================================ FILE: annotator/oneformer/detectron2/structures/boxes.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import math import numpy as np from enum import IntEnum, unique from typing import List, Tuple, Union import torch from torch import device _RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray] @unique class BoxMode(IntEnum): """ Enum of different ways to represent a box. """ XYXY_ABS = 0 """ (x0, y0, x1, y1) in absolute floating points coordinates. The coordinates in range [0, width or height]. """ XYWH_ABS = 1 """ (x0, y0, w, h) in absolute floating points coordinates. """ XYXY_REL = 2 """ Not yet supported! (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image. """ XYWH_REL = 3 """ Not yet supported! (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image. """ XYWHA_ABS = 4 """ (xc, yc, w, h, a) in absolute floating points coordinates. (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw. """ @staticmethod def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType: """ Args: box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5 from_mode, to_mode (BoxMode) Returns: The converted box of the same type. """ if from_mode == to_mode: return box original_type = type(box) is_numpy = isinstance(box, np.ndarray) single_box = isinstance(box, (list, tuple)) if single_box: assert len(box) == 4 or len(box) == 5, ( "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor," " where k == 4 or 5" ) arr = torch.tensor(box)[None, :] else: # avoid modifying the input box if is_numpy: arr = torch.from_numpy(np.asarray(box)).clone() else: arr = box.clone() assert to_mode not in [BoxMode.XYXY_REL, BoxMode.XYWH_REL] and from_mode not in [ BoxMode.XYXY_REL, BoxMode.XYWH_REL, ], "Relative mode not yet supported!" if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS: assert ( arr.shape[-1] == 5 ), "The last dimension of input shape must be 5 for XYWHA format" original_dtype = arr.dtype arr = arr.double() w = arr[:, 2] h = arr[:, 3] a = arr[:, 4] c = torch.abs(torch.cos(a * math.pi / 180.0)) s = torch.abs(torch.sin(a * math.pi / 180.0)) # This basically computes the horizontal bounding rectangle of the rotated box new_w = c * w + s * h new_h = c * h + s * w # convert center to top-left corner arr[:, 0] -= new_w / 2.0 arr[:, 1] -= new_h / 2.0 # bottom-right corner arr[:, 2] = arr[:, 0] + new_w arr[:, 3] = arr[:, 1] + new_h arr = arr[:, :4].to(dtype=original_dtype) elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS: original_dtype = arr.dtype arr = arr.double() arr[:, 0] += arr[:, 2] / 2.0 arr[:, 1] += arr[:, 3] / 2.0 angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype) arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype) else: if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS: arr[:, 2] += arr[:, 0] arr[:, 3] += arr[:, 1] elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS: arr[:, 2] -= arr[:, 0] arr[:, 3] -= arr[:, 1] else: raise NotImplementedError( "Conversion from BoxMode {} to {} is not supported yet".format( from_mode, to_mode ) ) if single_box: return original_type(arr.flatten().tolist()) if is_numpy: return arr.numpy() else: return arr class Boxes: """ This structure stores a list of boxes as a Nx4 torch.Tensor. It supports some common methods about boxes (`area`, `clip`, `nonempty`, etc), and also behaves like a Tensor (support indexing, `to(device)`, `.device`, and iteration over all boxes) Attributes: tensor (torch.Tensor): float matrix of Nx4. Each row is (x1, y1, x2, y2). """ def __init__(self, tensor: torch.Tensor): """ Args: tensor (Tensor[float]): a Nx4 matrix. Each row is (x1, y1, x2, y2). """ if not isinstance(tensor, torch.Tensor): tensor = torch.as_tensor(tensor, dtype=torch.float32, device=torch.device("cpu")) else: tensor = tensor.to(torch.float32) if tensor.numel() == 0: # Use reshape, so we don't end up creating a new tensor that does not depend on # the inputs (and consequently confuses jit) tensor = tensor.reshape((-1, 4)).to(dtype=torch.float32) assert tensor.dim() == 2 and tensor.size(-1) == 4, tensor.size() self.tensor = tensor def clone(self) -> "Boxes": """ Clone the Boxes. Returns: Boxes """ return Boxes(self.tensor.clone()) def to(self, device: torch.device): # Boxes are assumed float32 and does not support to(dtype) return Boxes(self.tensor.to(device=device)) def area(self) -> torch.Tensor: """ Computes the area of all the boxes. Returns: torch.Tensor: a vector with areas of each box. """ box = self.tensor area = (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1]) return area def clip(self, box_size: Tuple[int, int]) -> None: """ Clip (in place) the boxes by limiting x coordinates to the range [0, width] and y coordinates to the range [0, height]. Args: box_size (height, width): The clipping box's size. """ assert torch.isfinite(self.tensor).all(), "Box tensor contains infinite or NaN!" h, w = box_size x1 = self.tensor[:, 0].clamp(min=0, max=w) y1 = self.tensor[:, 1].clamp(min=0, max=h) x2 = self.tensor[:, 2].clamp(min=0, max=w) y2 = self.tensor[:, 3].clamp(min=0, max=h) self.tensor = torch.stack((x1, y1, x2, y2), dim=-1) def nonempty(self, threshold: float = 0.0) -> torch.Tensor: """ Find boxes that are non-empty. A box is considered empty, if either of its side is no larger than threshold. Returns: Tensor: a binary vector which represents whether each box is empty (False) or non-empty (True). """ box = self.tensor widths = box[:, 2] - box[:, 0] heights = box[:, 3] - box[:, 1] keep = (widths > threshold) & (heights > threshold) return keep def __getitem__(self, item) -> "Boxes": """ Args: item: int, slice, or a BoolTensor Returns: Boxes: Create a new :class:`Boxes` by indexing. The following usage are allowed: 1. `new_boxes = boxes[3]`: return a `Boxes` which contains only one box. 2. `new_boxes = boxes[2:10]`: return a slice of boxes. 3. `new_boxes = boxes[vector]`, where vector is a torch.BoolTensor with `length = len(boxes)`. Nonzero elements in the vector will be selected. Note that the returned Boxes might share storage with this Boxes, subject to Pytorch's indexing semantics. """ if isinstance(item, int): return Boxes(self.tensor[item].view(1, -1)) b = self.tensor[item] assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item) return Boxes(b) def __len__(self) -> int: return self.tensor.shape[0] def __repr__(self) -> str: return "Boxes(" + str(self.tensor) + ")" def inside_box(self, box_size: Tuple[int, int], boundary_threshold: int = 0) -> torch.Tensor: """ Args: box_size (height, width): Size of the reference box. boundary_threshold (int): Boxes that extend beyond the reference box boundary by more than boundary_threshold are considered "outside". Returns: a binary vector, indicating whether each box is inside the reference box. """ height, width = box_size inds_inside = ( (self.tensor[..., 0] >= -boundary_threshold) & (self.tensor[..., 1] >= -boundary_threshold) & (self.tensor[..., 2] < width + boundary_threshold) & (self.tensor[..., 3] < height + boundary_threshold) ) return inds_inside def get_centers(self) -> torch.Tensor: """ Returns: The box centers in a Nx2 array of (x, y). """ return (self.tensor[:, :2] + self.tensor[:, 2:]) / 2 def scale(self, scale_x: float, scale_y: float) -> None: """ Scale the box with horizontal and vertical scaling factors """ self.tensor[:, 0::2] *= scale_x self.tensor[:, 1::2] *= scale_y @classmethod def cat(cls, boxes_list: List["Boxes"]) -> "Boxes": """ Concatenates a list of Boxes into a single Boxes Arguments: boxes_list (list[Boxes]) Returns: Boxes: the concatenated Boxes """ assert isinstance(boxes_list, (list, tuple)) if len(boxes_list) == 0: return cls(torch.empty(0)) assert all([isinstance(box, Boxes) for box in boxes_list]) # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0)) return cat_boxes @property def device(self) -> device: return self.tensor.device # type "Iterator[torch.Tensor]", yield, and iter() not supported by torchscript # https://github.com/pytorch/pytorch/issues/18627 @torch.jit.unused def __iter__(self): """ Yield a box as a Tensor of shape (4,) at a time. """ yield from self.tensor def pairwise_intersection(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: """ Given two lists of boxes of size N and M, compute the intersection area between __all__ N x M pairs of boxes. The box order must be (xmin, ymin, xmax, ymax) Args: boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively. Returns: Tensor: intersection, sized [N,M]. """ boxes1, boxes2 = boxes1.tensor, boxes2.tensor width_height = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) - torch.max( boxes1[:, None, :2], boxes2[:, :2] ) # [N,M,2] width_height.clamp_(min=0) # [N,M,2] intersection = width_height.prod(dim=2) # [N,M] return intersection # implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py # with slight modifications def pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: """ Given two lists of boxes of size N and M, compute the IoU (intersection over union) between **all** N x M pairs of boxes. The box order must be (xmin, ymin, xmax, ymax). Args: boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively. Returns: Tensor: IoU, sized [N,M]. """ area1 = boxes1.area() # [N] area2 = boxes2.area() # [M] inter = pairwise_intersection(boxes1, boxes2) # handle empty boxes iou = torch.where( inter > 0, inter / (area1[:, None] + area2 - inter), torch.zeros(1, dtype=inter.dtype, device=inter.device), ) return iou def pairwise_ioa(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: """ Similar to :func:`pariwise_iou` but compute the IoA (intersection over boxes2 area). Args: boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively. Returns: Tensor: IoA, sized [N,M]. """ area2 = boxes2.area() # [M] inter = pairwise_intersection(boxes1, boxes2) # handle empty boxes ioa = torch.where( inter > 0, inter / area2, torch.zeros(1, dtype=inter.dtype, device=inter.device) ) return ioa def pairwise_point_box_distance(points: torch.Tensor, boxes: Boxes): """ Pairwise distance between N points and M boxes. The distance between a point and a box is represented by the distance from the point to 4 edges of the box. Distances are all positive when the point is inside the box. Args: points: Nx2 coordinates. Each row is (x, y) boxes: M boxes Returns: Tensor: distances of size (N, M, 4). The 4 values are distances from the point to the left, top, right, bottom of the box. """ x, y = points.unsqueeze(dim=2).unbind(dim=1) # (N, 1) x0, y0, x1, y1 = boxes.tensor.unsqueeze(dim=0).unbind(dim=2) # (1, M) return torch.stack([x - x0, y - y0, x1 - x, y1 - y], dim=2) def matched_pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: """ Compute pairwise intersection over union (IOU) of two sets of matched boxes that have the same number of boxes. Similar to :func:`pairwise_iou`, but computes only diagonal elements of the matrix. Args: boxes1 (Boxes): bounding boxes, sized [N,4]. boxes2 (Boxes): same length as boxes1 Returns: Tensor: iou, sized [N]. """ assert len(boxes1) == len( boxes2 ), "boxlists should have the same" "number of entries, got {}, {}".format( len(boxes1), len(boxes2) ) area1 = boxes1.area() # [N] area2 = boxes2.area() # [N] box1, box2 = boxes1.tensor, boxes2.tensor lt = torch.max(box1[:, :2], box2[:, :2]) # [N,2] rb = torch.min(box1[:, 2:], box2[:, 2:]) # [N,2] wh = (rb - lt).clamp(min=0) # [N,2] inter = wh[:, 0] * wh[:, 1] # [N] iou = inter / (area1 + area2 - inter) # [N] return iou ================================================ FILE: annotator/oneformer/detectron2/structures/image_list.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from __future__ import division from typing import Any, Dict, List, Optional, Tuple import torch from torch import device from torch.nn import functional as F from annotator.oneformer.detectron2.layers.wrappers import move_device_like, shapes_to_tensor class ImageList(object): """ Structure that holds a list of images (of possibly varying sizes) as a single tensor. This works by padding the images to the same size. The original sizes of each image is stored in `image_sizes`. Attributes: image_sizes (list[tuple[int, int]]): each tuple is (h, w). During tracing, it becomes list[Tensor] instead. """ def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]]): """ Arguments: tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1 image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can be smaller than (H, W) due to padding. """ self.tensor = tensor self.image_sizes = image_sizes def __len__(self) -> int: return len(self.image_sizes) def __getitem__(self, idx) -> torch.Tensor: """ Access the individual image in its original size. Args: idx: int or slice Returns: Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1 """ size = self.image_sizes[idx] return self.tensor[idx, ..., : size[0], : size[1]] @torch.jit.unused def to(self, *args: Any, **kwargs: Any) -> "ImageList": cast_tensor = self.tensor.to(*args, **kwargs) return ImageList(cast_tensor, self.image_sizes) @property def device(self) -> device: return self.tensor.device @staticmethod def from_tensors( tensors: List[torch.Tensor], size_divisibility: int = 0, pad_value: float = 0.0, padding_constraints: Optional[Dict[str, int]] = None, ) -> "ImageList": """ Args: tensors: a tuple or list of `torch.Tensor`, each of shape (Hi, Wi) or (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded to the same shape with `pad_value`. size_divisibility (int): If `size_divisibility > 0`, add padding to ensure the common height and width is divisible by `size_divisibility`. This depends on the model and many models need a divisibility of 32. pad_value (float): value to pad. padding_constraints (optional[Dict]): If given, it would follow the format as {"size_divisibility": int, "square_size": int}, where `size_divisibility` will overwrite the above one if presented and `square_size` indicates the square padding size if `square_size` > 0. Returns: an `ImageList`. """ assert len(tensors) > 0 assert isinstance(tensors, (tuple, list)) for t in tensors: assert isinstance(t, torch.Tensor), type(t) assert t.shape[:-2] == tensors[0].shape[:-2], t.shape image_sizes = [(im.shape[-2], im.shape[-1]) for im in tensors] image_sizes_tensor = [shapes_to_tensor(x) for x in image_sizes] max_size = torch.stack(image_sizes_tensor).max(0).values if padding_constraints is not None: square_size = padding_constraints.get("square_size", 0) if square_size > 0: # pad to square. max_size[0] = max_size[1] = square_size if "size_divisibility" in padding_constraints: size_divisibility = padding_constraints["size_divisibility"] if size_divisibility > 1: stride = size_divisibility # the last two dims are H,W, both subject to divisibility requirement max_size = (max_size + (stride - 1)).div(stride, rounding_mode="floor") * stride # handle weirdness of scripting and tracing ... if torch.jit.is_scripting(): max_size: List[int] = max_size.to(dtype=torch.long).tolist() else: if torch.jit.is_tracing(): image_sizes = image_sizes_tensor if len(tensors) == 1: # This seems slightly (2%) faster. # TODO: check whether it's faster for multiple images as well image_size = image_sizes[0] padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]] batched_imgs = F.pad(tensors[0], padding_size, value=pad_value).unsqueeze_(0) else: # max_size can be a tensor in tracing mode, therefore convert to list batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size) device = ( None if torch.jit.is_scripting() else ("cpu" if torch.jit.is_tracing() else None) ) batched_imgs = tensors[0].new_full(batch_shape, pad_value, device=device) batched_imgs = move_device_like(batched_imgs, tensors[0]) for i, img in enumerate(tensors): # Use `batched_imgs` directly instead of `img, pad_img = zip(tensors, batched_imgs)` # Tracing mode cannot capture `copy_()` of temporary locals batched_imgs[i, ..., : img.shape[-2], : img.shape[-1]].copy_(img) return ImageList(batched_imgs.contiguous(), image_sizes) ================================================ FILE: annotator/oneformer/detectron2/structures/instances.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import itertools import warnings from typing import Any, Dict, List, Tuple, Union import torch class Instances: """ This class represents a list of instances in an image. It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields". All fields must have the same ``__len__`` which is the number of instances. All other (non-field) attributes of this class are considered private: they must start with '_' and are not modifiable by a user. Some basic usage: 1. Set/get/check a field: .. code-block:: python instances.gt_boxes = Boxes(...) print(instances.pred_masks) # a tensor of shape (N, H, W) print('gt_masks' in instances) 2. ``len(instances)`` returns the number of instances 3. Indexing: ``instances[indices]`` will apply the indexing on all the fields and returns a new :class:`Instances`. Typically, ``indices`` is a integer vector of indices, or a binary mask of length ``num_instances`` .. code-block:: python category_3_detections = instances[instances.pred_classes == 3] confident_detections = instances[instances.scores > 0.9] """ def __init__(self, image_size: Tuple[int, int], **kwargs: Any): """ Args: image_size (height, width): the spatial size of the image. kwargs: fields to add to this `Instances`. """ self._image_size = image_size self._fields: Dict[str, Any] = {} for k, v in kwargs.items(): self.set(k, v) @property def image_size(self) -> Tuple[int, int]: """ Returns: tuple: height, width """ return self._image_size def __setattr__(self, name: str, val: Any) -> None: if name.startswith("_"): super().__setattr__(name, val) else: self.set(name, val) def __getattr__(self, name: str) -> Any: if name == "_fields" or name not in self._fields: raise AttributeError("Cannot find field '{}' in the given Instances!".format(name)) return self._fields[name] def set(self, name: str, value: Any) -> None: """ Set the field named `name` to `value`. The length of `value` must be the number of instances, and must agree with other existing fields in this object. """ with warnings.catch_warnings(record=True): data_len = len(value) if len(self._fields): assert ( len(self) == data_len ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self)) self._fields[name] = value def has(self, name: str) -> bool: """ Returns: bool: whether the field called `name` exists. """ return name in self._fields def remove(self, name: str) -> None: """ Remove the field called `name`. """ del self._fields[name] def get(self, name: str) -> Any: """ Returns the field called `name`. """ return self._fields[name] def get_fields(self) -> Dict[str, Any]: """ Returns: dict: a dict which maps names (str) to data of the fields Modifying the returned dict will modify this instance. """ return self._fields # Tensor-like methods def to(self, *args: Any, **kwargs: Any) -> "Instances": """ Returns: Instances: all fields are called with a `to(device)`, if the field has this method. """ ret = Instances(self._image_size) for k, v in self._fields.items(): if hasattr(v, "to"): v = v.to(*args, **kwargs) ret.set(k, v) return ret def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances": """ Args: item: an index-like object and will be used to index all the fields. Returns: If `item` is a string, return the data in the corresponding field. Otherwise, returns an `Instances` where all fields are indexed by `item`. """ if type(item) == int: if item >= len(self) or item < -len(self): raise IndexError("Instances index out of range!") else: item = slice(item, None, len(self)) ret = Instances(self._image_size) for k, v in self._fields.items(): ret.set(k, v[item]) return ret def __len__(self) -> int: for v in self._fields.values(): # use __len__ because len() has to be int and is not friendly to tracing return v.__len__() raise NotImplementedError("Empty Instances does not support __len__!") def __iter__(self): raise NotImplementedError("`Instances` object is not iterable!") @staticmethod def cat(instance_lists: List["Instances"]) -> "Instances": """ Args: instance_lists (list[Instances]) Returns: Instances """ assert all(isinstance(i, Instances) for i in instance_lists) assert len(instance_lists) > 0 if len(instance_lists) == 1: return instance_lists[0] image_size = instance_lists[0].image_size if not isinstance(image_size, torch.Tensor): # could be a tensor in tracing for i in instance_lists[1:]: assert i.image_size == image_size ret = Instances(image_size) for k in instance_lists[0]._fields.keys(): values = [i.get(k) for i in instance_lists] v0 = values[0] if isinstance(v0, torch.Tensor): values = torch.cat(values, dim=0) elif isinstance(v0, list): values = list(itertools.chain(*values)) elif hasattr(type(v0), "cat"): values = type(v0).cat(values) else: raise ValueError("Unsupported type {} for concatenation".format(type(v0))) ret.set(k, values) return ret def __str__(self) -> str: s = self.__class__.__name__ + "(" s += "num_instances={}, ".format(len(self)) s += "image_height={}, ".format(self._image_size[0]) s += "image_width={}, ".format(self._image_size[1]) s += "fields=[{}])".format(", ".join((f"{k}: {v}" for k, v in self._fields.items()))) return s __repr__ = __str__ ================================================ FILE: annotator/oneformer/detectron2/structures/keypoints.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import numpy as np from typing import Any, List, Tuple, Union import torch from torch.nn import functional as F class Keypoints: """ Stores keypoint **annotation** data. GT Instances have a `gt_keypoints` property containing the x,y location and visibility flag of each keypoint. This tensor has shape (N, K, 3) where N is the number of instances and K is the number of keypoints per instance. The visibility flag follows the COCO format and must be one of three integers: * v=0: not labeled (in which case x=y=0) * v=1: labeled but not visible * v=2: labeled and visible """ def __init__(self, keypoints: Union[torch.Tensor, np.ndarray, List[List[float]]]): """ Arguments: keypoints: A Tensor, numpy array, or list of the x, y, and visibility of each keypoint. The shape should be (N, K, 3) where N is the number of instances, and K is the number of keypoints per instance. """ device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device("cpu") keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device) assert keypoints.dim() == 3 and keypoints.shape[2] == 3, keypoints.shape self.tensor = keypoints def __len__(self) -> int: return self.tensor.size(0) def to(self, *args: Any, **kwargs: Any) -> "Keypoints": return type(self)(self.tensor.to(*args, **kwargs)) @property def device(self) -> torch.device: return self.tensor.device def to_heatmap(self, boxes: torch.Tensor, heatmap_size: int) -> torch.Tensor: """ Convert keypoint annotations to a heatmap of one-hot labels for training, as described in :paper:`Mask R-CNN`. Arguments: boxes: Nx4 tensor, the boxes to draw the keypoints to Returns: heatmaps: A tensor of shape (N, K), each element is integer spatial label in the range [0, heatmap_size**2 - 1] for each keypoint in the input. valid: A tensor of shape (N, K) containing whether each keypoint is in the roi or not. """ return _keypoints_to_heatmap(self.tensor, boxes, heatmap_size) def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Keypoints": """ Create a new `Keypoints` by indexing on this `Keypoints`. The following usage are allowed: 1. `new_kpts = kpts[3]`: return a `Keypoints` which contains only one instance. 2. `new_kpts = kpts[2:10]`: return a slice of key points. 3. `new_kpts = kpts[vector]`, where vector is a torch.ByteTensor with `length = len(kpts)`. Nonzero elements in the vector will be selected. Note that the returned Keypoints might share storage with this Keypoints, subject to Pytorch's indexing semantics. """ if isinstance(item, int): return Keypoints([self.tensor[item]]) return Keypoints(self.tensor[item]) def __repr__(self) -> str: s = self.__class__.__name__ + "(" s += "num_instances={})".format(len(self.tensor)) return s @staticmethod def cat(keypoints_list: List["Keypoints"]) -> "Keypoints": """ Concatenates a list of Keypoints into a single Keypoints Arguments: keypoints_list (list[Keypoints]) Returns: Keypoints: the concatenated Keypoints """ assert isinstance(keypoints_list, (list, tuple)) assert len(keypoints_list) > 0 assert all(isinstance(keypoints, Keypoints) for keypoints in keypoints_list) cat_kpts = type(keypoints_list[0])( torch.cat([kpts.tensor for kpts in keypoints_list], dim=0) ) return cat_kpts # TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) def _keypoints_to_heatmap( keypoints: torch.Tensor, rois: torch.Tensor, heatmap_size: int ) -> Tuple[torch.Tensor, torch.Tensor]: """ Encode keypoint locations into a target heatmap for use in SoftmaxWithLoss across space. Maps keypoints from the half-open interval [x1, x2) on continuous image coordinates to the closed interval [0, heatmap_size - 1] on discrete image coordinates. We use the continuous-discrete conversion from Heckbert 1990 ("What is the coordinate of a pixel?"): d = floor(c) and c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate. Arguments: keypoints: tensor of keypoint locations in of shape (N, K, 3). rois: Nx4 tensor of rois in xyxy format heatmap_size: integer side length of square heatmap. Returns: heatmaps: A tensor of shape (N, K) containing an integer spatial label in the range [0, heatmap_size**2 - 1] for each keypoint in the input. valid: A tensor of shape (N, K) containing whether each keypoint is in the roi or not. """ if rois.numel() == 0: return rois.new().long(), rois.new().long() offset_x = rois[:, 0] offset_y = rois[:, 1] scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) offset_x = offset_x[:, None] offset_y = offset_y[:, None] scale_x = scale_x[:, None] scale_y = scale_y[:, None] x = keypoints[..., 0] y = keypoints[..., 1] x_boundary_inds = x == rois[:, 2][:, None] y_boundary_inds = y == rois[:, 3][:, None] x = (x - offset_x) * scale_x x = x.floor().long() y = (y - offset_y) * scale_y y = y.floor().long() x[x_boundary_inds] = heatmap_size - 1 y[y_boundary_inds] = heatmap_size - 1 valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size) vis = keypoints[..., 2] > 0 valid = (valid_loc & vis).long() lin_ind = y * heatmap_size + x heatmaps = lin_ind * valid return heatmaps, valid @torch.jit.script_if_tracing def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor: """ Extract predicted keypoint locations from heatmaps. Args: maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for each ROI and each keypoint. rois (Tensor): (#ROIs, 4). The box of each ROI. Returns: Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to (x, y, logit, score) for each keypoint. When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate, we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate. """ offset_x = rois[:, 0] offset_y = rois[:, 1] widths = (rois[:, 2] - rois[:, 0]).clamp(min=1) heights = (rois[:, 3] - rois[:, 1]).clamp(min=1) widths_ceil = widths.ceil() heights_ceil = heights.ceil() num_rois, num_keypoints = maps.shape[:2] xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4) width_corrections = widths / widths_ceil height_corrections = heights / heights_ceil keypoints_idx = torch.arange(num_keypoints, device=maps.device) for i in range(num_rois): outsize = (int(heights_ceil[i]), int(widths_ceil[i])) roi_map = F.interpolate(maps[[i]], size=outsize, mode="bicubic", align_corners=False) # Although semantically equivalent, `reshape` is used instead of `squeeze` due # to limitation during ONNX export of `squeeze` in scripting mode roi_map = roi_map.reshape(roi_map.shape[1:]) # keypoints x H x W # softmax over the spatial region max_score, _ = roi_map.view(num_keypoints, -1).max(1) max_score = max_score.view(num_keypoints, 1, 1) tmp_full_resolution = (roi_map - max_score).exp_() tmp_pool_resolution = (maps[i] - max_score).exp_() # Produce scores over the region H x W, but normalize with POOL_H x POOL_W, # so that the scores of objects of different absolute sizes will be more comparable roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum((1, 2), keepdim=True) w = roi_map.shape[2] pos = roi_map.view(num_keypoints, -1).argmax(1) x_int = pos % w y_int = (pos - x_int) // w assert ( roi_map_scores[keypoints_idx, y_int, x_int] == roi_map_scores.view(num_keypoints, -1).max(1)[0] ).all() x = (x_int.float() + 0.5) * width_corrections[i] y = (y_int.float() + 0.5) * height_corrections[i] xy_preds[i, :, 0] = x + offset_x[i] xy_preds[i, :, 1] = y + offset_y[i] xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int] xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int] return xy_preds ================================================ FILE: annotator/oneformer/detectron2/structures/masks.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import copy import itertools import numpy as np from typing import Any, Iterator, List, Union import annotator.oneformer.pycocotools.mask as mask_util import torch from torch import device from annotator.oneformer.detectron2.layers.roi_align import ROIAlign from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom from .boxes import Boxes def polygon_area(x, y): # Using the shoelace formula # https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))) def polygons_to_bitmask(polygons: List[np.ndarray], height: int, width: int) -> np.ndarray: """ Args: polygons (list[ndarray]): each array has shape (Nx2,) height, width (int) Returns: ndarray: a bool mask of shape (height, width) """ if len(polygons) == 0: # COCOAPI does not support empty polygons return np.zeros((height, width)).astype(bool) rles = mask_util.frPyObjects(polygons, height, width) rle = mask_util.merge(rles) return mask_util.decode(rle).astype(bool) def rasterize_polygons_within_box( polygons: List[np.ndarray], box: np.ndarray, mask_size: int ) -> torch.Tensor: """ Rasterize the polygons into a mask image and crop the mask content in the given box. The cropped mask is resized to (mask_size, mask_size). This function is used when generating training targets for mask head in Mask R-CNN. Given original ground-truth masks for an image, new ground-truth mask training targets in the size of `mask_size x mask_size` must be provided for each predicted box. This function will be called to produce such targets. Args: polygons (list[ndarray[float]]): a list of polygons, which represents an instance. box: 4-element numpy array mask_size (int): Returns: Tensor: BoolTensor of shape (mask_size, mask_size) """ # 1. Shift the polygons w.r.t the boxes w, h = box[2] - box[0], box[3] - box[1] polygons = copy.deepcopy(polygons) for p in polygons: p[0::2] = p[0::2] - box[0] p[1::2] = p[1::2] - box[1] # 2. Rescale the polygons to the new box size # max() to avoid division by small number ratio_h = mask_size / max(h, 0.1) ratio_w = mask_size / max(w, 0.1) if ratio_h == ratio_w: for p in polygons: p *= ratio_h else: for p in polygons: p[0::2] *= ratio_w p[1::2] *= ratio_h # 3. Rasterize the polygons with coco api mask = polygons_to_bitmask(polygons, mask_size, mask_size) mask = torch.from_numpy(mask) return mask class BitMasks: """ This class stores the segmentation masks for all objects in one image, in the form of bitmaps. Attributes: tensor: bool Tensor of N,H,W, representing N instances in the image. """ def __init__(self, tensor: Union[torch.Tensor, np.ndarray]): """ Args: tensor: bool Tensor of N,H,W, representing N instances in the image. """ if isinstance(tensor, torch.Tensor): tensor = tensor.to(torch.bool) else: tensor = torch.as_tensor(tensor, dtype=torch.bool, device=torch.device("cpu")) assert tensor.dim() == 3, tensor.size() self.image_size = tensor.shape[1:] self.tensor = tensor @torch.jit.unused def to(self, *args: Any, **kwargs: Any) -> "BitMasks": return BitMasks(self.tensor.to(*args, **kwargs)) @property def device(self) -> torch.device: return self.tensor.device @torch.jit.unused def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "BitMasks": """ Returns: BitMasks: Create a new :class:`BitMasks` by indexing. The following usage are allowed: 1. `new_masks = masks[3]`: return a `BitMasks` which contains only one mask. 2. `new_masks = masks[2:10]`: return a slice of masks. 3. `new_masks = masks[vector]`, where vector is a torch.BoolTensor with `length = len(masks)`. Nonzero elements in the vector will be selected. Note that the returned object might share storage with this object, subject to Pytorch's indexing semantics. """ if isinstance(item, int): return BitMasks(self.tensor[item].unsqueeze(0)) m = self.tensor[item] assert m.dim() == 3, "Indexing on BitMasks with {} returns a tensor with shape {}!".format( item, m.shape ) return BitMasks(m) @torch.jit.unused def __iter__(self) -> torch.Tensor: yield from self.tensor @torch.jit.unused def __repr__(self) -> str: s = self.__class__.__name__ + "(" s += "num_instances={})".format(len(self.tensor)) return s def __len__(self) -> int: return self.tensor.shape[0] def nonempty(self) -> torch.Tensor: """ Find masks that are non-empty. Returns: Tensor: a BoolTensor which represents whether each mask is empty (False) or non-empty (True). """ return self.tensor.flatten(1).any(dim=1) @staticmethod def from_polygon_masks( polygon_masks: Union["PolygonMasks", List[List[np.ndarray]]], height: int, width: int ) -> "BitMasks": """ Args: polygon_masks (list[list[ndarray]] or PolygonMasks) height, width (int) """ if isinstance(polygon_masks, PolygonMasks): polygon_masks = polygon_masks.polygons masks = [polygons_to_bitmask(p, height, width) for p in polygon_masks] if len(masks): return BitMasks(torch.stack([torch.from_numpy(x) for x in masks])) else: return BitMasks(torch.empty(0, height, width, dtype=torch.bool)) @staticmethod def from_roi_masks(roi_masks: "ROIMasks", height: int, width: int) -> "BitMasks": """ Args: roi_masks: height, width (int): """ return roi_masks.to_bitmasks(height, width) def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor: """ Crop each bitmask by the given box, and resize results to (mask_size, mask_size). This can be used to prepare training targets for Mask R-CNN. It has less reconstruction error compared to rasterization with polygons. However we observe no difference in accuracy, but BitMasks requires more memory to store all the masks. Args: boxes (Tensor): Nx4 tensor storing the boxes for each mask mask_size (int): the size of the rasterized mask. Returns: Tensor: A bool tensor of shape (N, mask_size, mask_size), where N is the number of predicted boxes for this image. """ assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self)) device = self.tensor.device batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[:, None] rois = torch.cat([batch_inds, boxes], dim=1) # Nx5 bit_masks = self.tensor.to(dtype=torch.float32) rois = rois.to(device=device) output = ( ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True) .forward(bit_masks[:, None, :, :], rois) .squeeze(1) ) output = output >= 0.5 return output def get_bounding_boxes(self) -> Boxes: """ Returns: Boxes: tight bounding boxes around bitmasks. If a mask is empty, it's bounding box will be all zero. """ boxes = torch.zeros(self.tensor.shape[0], 4, dtype=torch.float32) x_any = torch.any(self.tensor, dim=1) y_any = torch.any(self.tensor, dim=2) for idx in range(self.tensor.shape[0]): x = torch.where(x_any[idx, :])[0] y = torch.where(y_any[idx, :])[0] if len(x) > 0 and len(y) > 0: boxes[idx, :] = torch.as_tensor( [x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=torch.float32 ) return Boxes(boxes) @staticmethod def cat(bitmasks_list: List["BitMasks"]) -> "BitMasks": """ Concatenates a list of BitMasks into a single BitMasks Arguments: bitmasks_list (list[BitMasks]) Returns: BitMasks: the concatenated BitMasks """ assert isinstance(bitmasks_list, (list, tuple)) assert len(bitmasks_list) > 0 assert all(isinstance(bitmask, BitMasks) for bitmask in bitmasks_list) cat_bitmasks = type(bitmasks_list[0])(torch.cat([bm.tensor for bm in bitmasks_list], dim=0)) return cat_bitmasks class PolygonMasks: """ This class stores the segmentation masks for all objects in one image, in the form of polygons. Attributes: polygons: list[list[ndarray]]. Each ndarray is a float64 vector representing a polygon. """ def __init__(self, polygons: List[List[Union[torch.Tensor, np.ndarray]]]): """ Arguments: polygons (list[list[np.ndarray]]): The first level of the list correspond to individual instances, the second level to all the polygons that compose the instance, and the third level to the polygon coordinates. The third level array should have the format of [x0, y0, x1, y1, ..., xn, yn] (n >= 3). """ if not isinstance(polygons, list): raise ValueError( "Cannot create PolygonMasks: Expect a list of list of polygons per image. " "Got '{}' instead.".format(type(polygons)) ) def _make_array(t: Union[torch.Tensor, np.ndarray]) -> np.ndarray: # Use float64 for higher precision, because why not? # Always put polygons on CPU (self.to is a no-op) since they # are supposed to be small tensors. # May need to change this assumption if GPU placement becomes useful if isinstance(t, torch.Tensor): t = t.cpu().numpy() return np.asarray(t).astype("float64") def process_polygons( polygons_per_instance: List[Union[torch.Tensor, np.ndarray]] ) -> List[np.ndarray]: if not isinstance(polygons_per_instance, list): raise ValueError( "Cannot create polygons: Expect a list of polygons per instance. " "Got '{}' instead.".format(type(polygons_per_instance)) ) # transform each polygon to a numpy array polygons_per_instance = [_make_array(p) for p in polygons_per_instance] for polygon in polygons_per_instance: if len(polygon) % 2 != 0 or len(polygon) < 6: raise ValueError(f"Cannot create a polygon from {len(polygon)} coordinates.") return polygons_per_instance self.polygons: List[List[np.ndarray]] = [ process_polygons(polygons_per_instance) for polygons_per_instance in polygons ] def to(self, *args: Any, **kwargs: Any) -> "PolygonMasks": return self @property def device(self) -> torch.device: return torch.device("cpu") def get_bounding_boxes(self) -> Boxes: """ Returns: Boxes: tight bounding boxes around polygon masks. """ boxes = torch.zeros(len(self.polygons), 4, dtype=torch.float32) for idx, polygons_per_instance in enumerate(self.polygons): minxy = torch.as_tensor([float("inf"), float("inf")], dtype=torch.float32) maxxy = torch.zeros(2, dtype=torch.float32) for polygon in polygons_per_instance: coords = torch.from_numpy(polygon).view(-1, 2).to(dtype=torch.float32) minxy = torch.min(minxy, torch.min(coords, dim=0).values) maxxy = torch.max(maxxy, torch.max(coords, dim=0).values) boxes[idx, :2] = minxy boxes[idx, 2:] = maxxy return Boxes(boxes) def nonempty(self) -> torch.Tensor: """ Find masks that are non-empty. Returns: Tensor: a BoolTensor which represents whether each mask is empty (False) or not (True). """ keep = [1 if len(polygon) > 0 else 0 for polygon in self.polygons] return torch.from_numpy(np.asarray(keep, dtype=bool)) def __getitem__(self, item: Union[int, slice, List[int], torch.BoolTensor]) -> "PolygonMasks": """ Support indexing over the instances and return a `PolygonMasks` object. `item` can be: 1. An integer. It will return an object with only one instance. 2. A slice. It will return an object with the selected instances. 3. A list[int]. It will return an object with the selected instances, correpsonding to the indices in the list. 4. A vector mask of type BoolTensor, whose length is num_instances. It will return an object with the instances whose mask is nonzero. """ if isinstance(item, int): selected_polygons = [self.polygons[item]] elif isinstance(item, slice): selected_polygons = self.polygons[item] elif isinstance(item, list): selected_polygons = [self.polygons[i] for i in item] elif isinstance(item, torch.Tensor): # Polygons is a list, so we have to move the indices back to CPU. if item.dtype == torch.bool: assert item.dim() == 1, item.shape item = item.nonzero().squeeze(1).cpu().numpy().tolist() elif item.dtype in [torch.int32, torch.int64]: item = item.cpu().numpy().tolist() else: raise ValueError("Unsupported tensor dtype={} for indexing!".format(item.dtype)) selected_polygons = [self.polygons[i] for i in item] return PolygonMasks(selected_polygons) def __iter__(self) -> Iterator[List[np.ndarray]]: """ Yields: list[ndarray]: the polygons for one instance. Each Tensor is a float64 vector representing a polygon. """ return iter(self.polygons) def __repr__(self) -> str: s = self.__class__.__name__ + "(" s += "num_instances={})".format(len(self.polygons)) return s def __len__(self) -> int: return len(self.polygons) def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor: """ Crop each mask by the given box, and resize results to (mask_size, mask_size). This can be used to prepare training targets for Mask R-CNN. Args: boxes (Tensor): Nx4 tensor storing the boxes for each mask mask_size (int): the size of the rasterized mask. Returns: Tensor: A bool tensor of shape (N, mask_size, mask_size), where N is the number of predicted boxes for this image. """ assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self)) device = boxes.device # Put boxes on the CPU, as the polygon representation is not efficient GPU-wise # (several small tensors for representing a single instance mask) boxes = boxes.to(torch.device("cpu")) results = [ rasterize_polygons_within_box(poly, box.numpy(), mask_size) for poly, box in zip(self.polygons, boxes) ] """ poly: list[list[float]], the polygons for one instance box: a tensor of shape (4,) """ if len(results) == 0: return torch.empty(0, mask_size, mask_size, dtype=torch.bool, device=device) return torch.stack(results, dim=0).to(device=device) def area(self): """ Computes area of the mask. Only works with Polygons, using the shoelace formula: https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates Returns: Tensor: a vector, area for each instance """ area = [] for polygons_per_instance in self.polygons: area_per_instance = 0 for p in polygons_per_instance: area_per_instance += polygon_area(p[0::2], p[1::2]) area.append(area_per_instance) return torch.tensor(area) @staticmethod def cat(polymasks_list: List["PolygonMasks"]) -> "PolygonMasks": """ Concatenates a list of PolygonMasks into a single PolygonMasks Arguments: polymasks_list (list[PolygonMasks]) Returns: PolygonMasks: the concatenated PolygonMasks """ assert isinstance(polymasks_list, (list, tuple)) assert len(polymasks_list) > 0 assert all(isinstance(polymask, PolygonMasks) for polymask in polymasks_list) cat_polymasks = type(polymasks_list[0])( list(itertools.chain.from_iterable(pm.polygons for pm in polymasks_list)) ) return cat_polymasks class ROIMasks: """ Represent masks by N smaller masks defined in some ROIs. Once ROI boxes are given, full-image bitmask can be obtained by "pasting" the mask on the region defined by the corresponding ROI box. """ def __init__(self, tensor: torch.Tensor): """ Args: tensor: (N, M, M) mask tensor that defines the mask within each ROI. """ if tensor.dim() != 3: raise ValueError("ROIMasks must take a masks of 3 dimension.") self.tensor = tensor def to(self, device: torch.device) -> "ROIMasks": return ROIMasks(self.tensor.to(device)) @property def device(self) -> device: return self.tensor.device def __len__(self): return self.tensor.shape[0] def __getitem__(self, item) -> "ROIMasks": """ Returns: ROIMasks: Create a new :class:`ROIMasks` by indexing. The following usage are allowed: 1. `new_masks = masks[2:10]`: return a slice of masks. 2. `new_masks = masks[vector]`, where vector is a torch.BoolTensor with `length = len(masks)`. Nonzero elements in the vector will be selected. Note that the returned object might share storage with this object, subject to Pytorch's indexing semantics. """ t = self.tensor[item] if t.dim() != 3: raise ValueError( f"Indexing on ROIMasks with {item} returns a tensor with shape {t.shape}!" ) return ROIMasks(t) @torch.jit.unused def __repr__(self) -> str: s = self.__class__.__name__ + "(" s += "num_instances={})".format(len(self.tensor)) return s @torch.jit.unused def to_bitmasks(self, boxes: torch.Tensor, height, width, threshold=0.5): """ Args: see documentation of :func:`paste_masks_in_image`. """ from annotator.oneformer.detectron2.layers.mask_ops import paste_masks_in_image, _paste_masks_tensor_shape if torch.jit.is_tracing(): if isinstance(height, torch.Tensor): paste_func = _paste_masks_tensor_shape else: paste_func = paste_masks_in_image else: paste_func = retry_if_cuda_oom(paste_masks_in_image) bitmasks = paste_func(self.tensor, boxes.tensor, (height, width), threshold=threshold) return BitMasks(bitmasks) ================================================ FILE: annotator/oneformer/detectron2/structures/rotated_boxes.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import math from typing import List, Tuple import torch from annotator.oneformer.detectron2.layers.rotated_boxes import pairwise_iou_rotated from .boxes import Boxes class RotatedBoxes(Boxes): """ This structure stores a list of rotated boxes as a Nx5 torch.Tensor. It supports some common methods about boxes (`area`, `clip`, `nonempty`, etc), and also behaves like a Tensor (support indexing, `to(device)`, `.device`, and iteration over all boxes) """ def __init__(self, tensor: torch.Tensor): """ Args: tensor (Tensor[float]): a Nx5 matrix. Each row is (x_center, y_center, width, height, angle), in which angle is represented in degrees. While there's no strict range restriction for it, the recommended principal range is between [-180, 180) degrees. Assume we have a horizontal box B = (x_center, y_center, width, height), where width is along the x-axis and height is along the y-axis. The rotated box B_rot (x_center, y_center, width, height, angle) can be seen as: 1. When angle == 0: B_rot == B 2. When angle > 0: B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CCW; 3. When angle < 0: B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CW. Mathematically, since the right-handed coordinate system for image space is (y, x), where y is top->down and x is left->right, the 4 vertices of the rotated rectangle :math:`(yr_i, xr_i)` (i = 1, 2, 3, 4) can be obtained from the vertices of the horizontal rectangle :math:`(y_i, x_i)` (i = 1, 2, 3, 4) in the following way (:math:`\\theta = angle*\\pi/180` is the angle in radians, :math:`(y_c, x_c)` is the center of the rectangle): .. math:: yr_i = \\cos(\\theta) (y_i - y_c) - \\sin(\\theta) (x_i - x_c) + y_c, xr_i = \\sin(\\theta) (y_i - y_c) + \\cos(\\theta) (x_i - x_c) + x_c, which is the standard rigid-body rotation transformation. Intuitively, the angle is (1) the rotation angle from y-axis in image space to the height vector (top->down in the box's local coordinate system) of the box in CCW, and (2) the rotation angle from x-axis in image space to the width vector (left->right in the box's local coordinate system) of the box in CCW. More intuitively, consider the following horizontal box ABCD represented in (x1, y1, x2, y2): (3, 2, 7, 4), covering the [3, 7] x [2, 4] region of the continuous coordinate system which looks like this: .. code:: none O--------> x | | A---B | | | | D---C | v y Note that each capital letter represents one 0-dimensional geometric point instead of a 'square pixel' here. In the example above, using (x, y) to represent a point we have: .. math:: O = (0, 0), A = (3, 2), B = (7, 2), C = (7, 4), D = (3, 4) We name vector AB = vector DC as the width vector in box's local coordinate system, and vector AD = vector BC as the height vector in box's local coordinate system. Initially, when angle = 0 degree, they're aligned with the positive directions of x-axis and y-axis in the image space, respectively. For better illustration, we denote the center of the box as E, .. code:: none O--------> x | | A---B | | E | | D---C | v y where the center E = ((3+7)/2, (2+4)/2) = (5, 3). Also, .. math:: width = |AB| = |CD| = 7 - 3 = 4, height = |AD| = |BC| = 4 - 2 = 2. Therefore, the corresponding representation for the same shape in rotated box in (x_center, y_center, width, height, angle) format is: (5, 3, 4, 2, 0), Now, let's consider (5, 3, 4, 2, 90), which is rotated by 90 degrees CCW (counter-clockwise) by definition. It looks like this: .. code:: none O--------> x | B-C | | | | |E| | | | | A-D v y The center E is still located at the same point (5, 3), while the vertices ABCD are rotated by 90 degrees CCW with regard to E: A = (4, 5), B = (4, 1), C = (6, 1), D = (6, 5) Here, 90 degrees can be seen as the CCW angle to rotate from y-axis to vector AD or vector BC (the top->down height vector in box's local coordinate system), or the CCW angle to rotate from x-axis to vector AB or vector DC (the left->right width vector in box's local coordinate system). .. math:: width = |AB| = |CD| = 5 - 1 = 4, height = |AD| = |BC| = 6 - 4 = 2. Next, how about (5, 3, 4, 2, -90), which is rotated by 90 degrees CW (clockwise) by definition? It looks like this: .. code:: none O--------> x | D-A | | | | |E| | | | | C-B v y The center E is still located at the same point (5, 3), while the vertices ABCD are rotated by 90 degrees CW with regard to E: A = (6, 1), B = (6, 5), C = (4, 5), D = (4, 1) .. math:: width = |AB| = |CD| = 5 - 1 = 4, height = |AD| = |BC| = 6 - 4 = 2. This covers exactly the same region as (5, 3, 4, 2, 90) does, and their IoU will be 1. However, these two will generate different RoI Pooling results and should not be treated as an identical box. On the other hand, it's easy to see that (X, Y, W, H, A) is identical to (X, Y, W, H, A+360N), for any integer N. For example (5, 3, 4, 2, 270) would be identical to (5, 3, 4, 2, -90), because rotating the shape 270 degrees CCW is equivalent to rotating the same shape 90 degrees CW. We could rotate further to get (5, 3, 4, 2, 180), or (5, 3, 4, 2, -180): .. code:: none O--------> x | | C---D | | E | | B---A | v y .. math:: A = (7, 4), B = (3, 4), C = (3, 2), D = (7, 2), width = |AB| = |CD| = 7 - 3 = 4, height = |AD| = |BC| = 4 - 2 = 2. Finally, this is a very inaccurate (heavily quantized) illustration of how (5, 3, 4, 2, 60) looks like in case anyone wonders: .. code:: none O--------> x | B\ | / C | /E / | A / | `D v y It's still a rectangle with center of (5, 3), width of 4 and height of 2, but its angle (and thus orientation) is somewhere between (5, 3, 4, 2, 0) and (5, 3, 4, 2, 90). """ device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu") tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) if tensor.numel() == 0: # Use reshape, so we don't end up creating a new tensor that does not depend on # the inputs (and consequently confuses jit) tensor = tensor.reshape((0, 5)).to(dtype=torch.float32, device=device) assert tensor.dim() == 2 and tensor.size(-1) == 5, tensor.size() self.tensor = tensor def clone(self) -> "RotatedBoxes": """ Clone the RotatedBoxes. Returns: RotatedBoxes """ return RotatedBoxes(self.tensor.clone()) def to(self, device: torch.device): # Boxes are assumed float32 and does not support to(dtype) return RotatedBoxes(self.tensor.to(device=device)) def area(self) -> torch.Tensor: """ Computes the area of all the boxes. Returns: torch.Tensor: a vector with areas of each box. """ box = self.tensor area = box[:, 2] * box[:, 3] return area # Avoid in-place operations so that we can torchscript; NOTE: this creates a new tensor def normalize_angles(self) -> None: """ Restrict angles to the range of [-180, 180) degrees """ angle_tensor = (self.tensor[:, 4] + 180.0) % 360.0 - 180.0 self.tensor = torch.cat((self.tensor[:, :4], angle_tensor[:, None]), dim=1) def clip(self, box_size: Tuple[int, int], clip_angle_threshold: float = 1.0) -> None: """ Clip (in place) the boxes by limiting x coordinates to the range [0, width] and y coordinates to the range [0, height]. For RRPN: Only clip boxes that are almost horizontal with a tolerance of clip_angle_threshold to maintain backward compatibility. Rotated boxes beyond this threshold are not clipped for two reasons: 1. There are potentially multiple ways to clip a rotated box to make it fit within the image. 2. It's tricky to make the entire rectangular box fit within the image and still be able to not leave out pixels of interest. Therefore we rely on ops like RoIAlignRotated to safely handle this. Args: box_size (height, width): The clipping box's size. clip_angle_threshold: Iff. abs(normalized(angle)) <= clip_angle_threshold (in degrees), we do the clipping as horizontal boxes. """ h, w = box_size # normalize angles to be within (-180, 180] degrees self.normalize_angles() idx = torch.where(torch.abs(self.tensor[:, 4]) <= clip_angle_threshold)[0] # convert to (x1, y1, x2, y2) x1 = self.tensor[idx, 0] - self.tensor[idx, 2] / 2.0 y1 = self.tensor[idx, 1] - self.tensor[idx, 3] / 2.0 x2 = self.tensor[idx, 0] + self.tensor[idx, 2] / 2.0 y2 = self.tensor[idx, 1] + self.tensor[idx, 3] / 2.0 # clip x1.clamp_(min=0, max=w) y1.clamp_(min=0, max=h) x2.clamp_(min=0, max=w) y2.clamp_(min=0, max=h) # convert back to (xc, yc, w, h) self.tensor[idx, 0] = (x1 + x2) / 2.0 self.tensor[idx, 1] = (y1 + y2) / 2.0 # make sure widths and heights do not increase due to numerical errors self.tensor[idx, 2] = torch.min(self.tensor[idx, 2], x2 - x1) self.tensor[idx, 3] = torch.min(self.tensor[idx, 3], y2 - y1) def nonempty(self, threshold: float = 0.0) -> torch.Tensor: """ Find boxes that are non-empty. A box is considered empty, if either of its side is no larger than threshold. Returns: Tensor: a binary vector which represents whether each box is empty (False) or non-empty (True). """ box = self.tensor widths = box[:, 2] heights = box[:, 3] keep = (widths > threshold) & (heights > threshold) return keep def __getitem__(self, item) -> "RotatedBoxes": """ Returns: RotatedBoxes: Create a new :class:`RotatedBoxes` by indexing. The following usage are allowed: 1. `new_boxes = boxes[3]`: return a `RotatedBoxes` which contains only one box. 2. `new_boxes = boxes[2:10]`: return a slice of boxes. 3. `new_boxes = boxes[vector]`, where vector is a torch.ByteTensor with `length = len(boxes)`. Nonzero elements in the vector will be selected. Note that the returned RotatedBoxes might share storage with this RotatedBoxes, subject to Pytorch's indexing semantics. """ if isinstance(item, int): return RotatedBoxes(self.tensor[item].view(1, -1)) b = self.tensor[item] assert b.dim() == 2, "Indexing on RotatedBoxes with {} failed to return a matrix!".format( item ) return RotatedBoxes(b) def __len__(self) -> int: return self.tensor.shape[0] def __repr__(self) -> str: return "RotatedBoxes(" + str(self.tensor) + ")" def inside_box(self, box_size: Tuple[int, int], boundary_threshold: int = 0) -> torch.Tensor: """ Args: box_size (height, width): Size of the reference box covering [0, width] x [0, height] boundary_threshold (int): Boxes that extend beyond the reference box boundary by more than boundary_threshold are considered "outside". For RRPN, it might not be necessary to call this function since it's common for rotated box to extend to outside of the image boundaries (the clip function only clips the near-horizontal boxes) Returns: a binary vector, indicating whether each box is inside the reference box. """ height, width = box_size cnt_x = self.tensor[..., 0] cnt_y = self.tensor[..., 1] half_w = self.tensor[..., 2] / 2.0 half_h = self.tensor[..., 3] / 2.0 a = self.tensor[..., 4] c = torch.abs(torch.cos(a * math.pi / 180.0)) s = torch.abs(torch.sin(a * math.pi / 180.0)) # This basically computes the horizontal bounding rectangle of the rotated box max_rect_dx = c * half_w + s * half_h max_rect_dy = c * half_h + s * half_w inds_inside = ( (cnt_x - max_rect_dx >= -boundary_threshold) & (cnt_y - max_rect_dy >= -boundary_threshold) & (cnt_x + max_rect_dx < width + boundary_threshold) & (cnt_y + max_rect_dy < height + boundary_threshold) ) return inds_inside def get_centers(self) -> torch.Tensor: """ Returns: The box centers in a Nx2 array of (x, y). """ return self.tensor[:, :2] def scale(self, scale_x: float, scale_y: float) -> None: """ Scale the rotated box with horizontal and vertical scaling factors Note: when scale_factor_x != scale_factor_y, the rotated box does not preserve the rectangular shape when the angle is not a multiple of 90 degrees under resize transformation. Instead, the shape is a parallelogram (that has skew) Here we make an approximation by fitting a rotated rectangle to the parallelogram. """ self.tensor[:, 0] *= scale_x self.tensor[:, 1] *= scale_y theta = self.tensor[:, 4] * math.pi / 180.0 c = torch.cos(theta) s = torch.sin(theta) # In image space, y is top->down and x is left->right # Consider the local coordintate system for the rotated box, # where the box center is located at (0, 0), and the four vertices ABCD are # A(-w / 2, -h / 2), B(w / 2, -h / 2), C(w / 2, h / 2), D(-w / 2, h / 2) # the midpoint of the left edge AD of the rotated box E is: # E = (A+D)/2 = (-w / 2, 0) # the midpoint of the top edge AB of the rotated box F is: # F(0, -h / 2) # To get the old coordinates in the global system, apply the rotation transformation # (Note: the right-handed coordinate system for image space is yOx): # (old_x, old_y) = (s * y + c * x, c * y - s * x) # E(old) = (s * 0 + c * (-w/2), c * 0 - s * (-w/2)) = (-c * w / 2, s * w / 2) # F(old) = (s * (-h / 2) + c * 0, c * (-h / 2) - s * 0) = (-s * h / 2, -c * h / 2) # After applying the scaling factor (sfx, sfy): # E(new) = (-sfx * c * w / 2, sfy * s * w / 2) # F(new) = (-sfx * s * h / 2, -sfy * c * h / 2) # The new width after scaling tranformation becomes: # w(new) = |E(new) - O| * 2 # = sqrt[(sfx * c * w / 2)^2 + (sfy * s * w / 2)^2] * 2 # = sqrt[(sfx * c)^2 + (sfy * s)^2] * w # i.e., scale_factor_w = sqrt[(sfx * c)^2 + (sfy * s)^2] # # For example, # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_w == scale_factor_x; # when |angle| = 90, c = 0, |s| = 1, scale_factor_w == scale_factor_y self.tensor[:, 2] *= torch.sqrt((scale_x * c) ** 2 + (scale_y * s) ** 2) # h(new) = |F(new) - O| * 2 # = sqrt[(sfx * s * h / 2)^2 + (sfy * c * h / 2)^2] * 2 # = sqrt[(sfx * s)^2 + (sfy * c)^2] * h # i.e., scale_factor_h = sqrt[(sfx * s)^2 + (sfy * c)^2] # # For example, # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_h == scale_factor_y; # when |angle| = 90, c = 0, |s| = 1, scale_factor_h == scale_factor_x self.tensor[:, 3] *= torch.sqrt((scale_x * s) ** 2 + (scale_y * c) ** 2) # The angle is the rotation angle from y-axis in image space to the height # vector (top->down in the box's local coordinate system) of the box in CCW. # # angle(new) = angle_yOx(O - F(new)) # = angle_yOx( (sfx * s * h / 2, sfy * c * h / 2) ) # = atan2(sfx * s * h / 2, sfy * c * h / 2) # = atan2(sfx * s, sfy * c) # # For example, # when sfx == sfy, angle(new) == atan2(s, c) == angle(old) self.tensor[:, 4] = torch.atan2(scale_x * s, scale_y * c) * 180 / math.pi @classmethod def cat(cls, boxes_list: List["RotatedBoxes"]) -> "RotatedBoxes": """ Concatenates a list of RotatedBoxes into a single RotatedBoxes Arguments: boxes_list (list[RotatedBoxes]) Returns: RotatedBoxes: the concatenated RotatedBoxes """ assert isinstance(boxes_list, (list, tuple)) if len(boxes_list) == 0: return cls(torch.empty(0)) assert all([isinstance(box, RotatedBoxes) for box in boxes_list]) # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0)) return cat_boxes @property def device(self) -> torch.device: return self.tensor.device @torch.jit.unused def __iter__(self): """ Yield a box as a Tensor of shape (5,) at a time. """ yield from self.tensor def pairwise_iou(boxes1: RotatedBoxes, boxes2: RotatedBoxes) -> None: """ Given two lists of rotated boxes of size N and M, compute the IoU (intersection over union) between **all** N x M pairs of boxes. The box order must be (x_center, y_center, width, height, angle). Args: boxes1, boxes2 (RotatedBoxes): two `RotatedBoxes`. Contains N & M rotated boxes, respectively. Returns: Tensor: IoU, sized [N,M]. """ return pairwise_iou_rotated(boxes1.tensor, boxes2.tensor) ================================================ FILE: annotator/oneformer/detectron2/tracking/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .base_tracker import ( # noqa BaseTracker, build_tracker_head, TRACKER_HEADS_REGISTRY, ) from .bbox_iou_tracker import BBoxIOUTracker # noqa from .hungarian_tracker import BaseHungarianTracker # noqa from .iou_weighted_hungarian_bbox_iou_tracker import ( # noqa IOUWeightedHungarianBBoxIOUTracker, ) from .utils import create_prediction_pairs # noqa from .vanilla_hungarian_bbox_iou_tracker import VanillaHungarianBBoxIOUTracker # noqa __all__ = [k for k in globals().keys() if not k.startswith("_")] ================================================ FILE: annotator/oneformer/detectron2/tracking/base_tracker.py ================================================ #!/usr/bin/env python3 # Copyright 2004-present Facebook. All Rights Reserved. from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.utils.registry import Registry from ..config.config import CfgNode as CfgNode_ from ..structures import Instances TRACKER_HEADS_REGISTRY = Registry("TRACKER_HEADS") TRACKER_HEADS_REGISTRY.__doc__ = """ Registry for tracking classes. """ class BaseTracker(object): """ A parent class for all trackers """ @configurable def __init__(self, **kwargs): self._prev_instances = None # (D2)instances for previous frame self._matched_idx = set() # indices in prev_instances found matching self._matched_ID = set() # idendities in prev_instances found matching self._untracked_prev_idx = set() # indices in prev_instances not found matching self._id_count = 0 # used to assign new id @classmethod def from_config(cls, cfg: CfgNode_): raise NotImplementedError("Calling BaseTracker::from_config") def update(self, predictions: Instances) -> Instances: """ Args: predictions: D2 Instances for predictions of the current frame Return: D2 Instances for predictions of the current frame with ID assigned _prev_instances and instances will have the following fields: .pred_boxes (shape=[N, 4]) .scores (shape=[N,]) .pred_classes (shape=[N,]) .pred_keypoints (shape=[N, M, 3], Optional) .pred_masks (shape=List[2D_MASK], Optional) 2D_MASK: shape=[H, W] .ID (shape=[N,]) N: # of detected bboxes H and W: height and width of 2D mask """ raise NotImplementedError("Calling BaseTracker::update") def build_tracker_head(cfg: CfgNode_) -> BaseTracker: """ Build a tracker head from `cfg.TRACKER_HEADS.TRACKER_NAME`. Args: cfg: D2 CfgNode, config file with tracker information Return: tracker object """ name = cfg.TRACKER_HEADS.TRACKER_NAME tracker_class = TRACKER_HEADS_REGISTRY.get(name) return tracker_class(cfg) ================================================ FILE: annotator/oneformer/detectron2/tracking/bbox_iou_tracker.py ================================================ #!/usr/bin/env python3 # Copyright 2004-present Facebook. All Rights Reserved. import copy import numpy as np from typing import List import torch from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.structures import Boxes, Instances from annotator.oneformer.detectron2.structures.boxes import pairwise_iou from ..config.config import CfgNode as CfgNode_ from .base_tracker import TRACKER_HEADS_REGISTRY, BaseTracker @TRACKER_HEADS_REGISTRY.register() class BBoxIOUTracker(BaseTracker): """ A bounding box tracker to assign ID based on IoU between current and previous instances """ @configurable def __init__( self, *, video_height: int, video_width: int, max_num_instances: int = 200, max_lost_frame_count: int = 0, min_box_rel_dim: float = 0.02, min_instance_period: int = 1, track_iou_threshold: float = 0.5, **kwargs, ): """ Args: video_height: height the video frame video_width: width of the video frame max_num_instances: maximum number of id allowed to be tracked max_lost_frame_count: maximum number of frame an id can lost tracking exceed this number, an id is considered as lost forever min_box_rel_dim: a percentage, smaller than this dimension, a bbox is removed from tracking min_instance_period: an instance will be shown after this number of period since its first showing up in the video track_iou_threshold: iou threshold, below this number a bbox pair is removed from tracking """ super().__init__(**kwargs) self._video_height = video_height self._video_width = video_width self._max_num_instances = max_num_instances self._max_lost_frame_count = max_lost_frame_count self._min_box_rel_dim = min_box_rel_dim self._min_instance_period = min_instance_period self._track_iou_threshold = track_iou_threshold @classmethod def from_config(cls, cfg: CfgNode_): """ Old style initialization using CfgNode Args: cfg: D2 CfgNode, config file Return: dictionary storing arguments for __init__ method """ assert "VIDEO_HEIGHT" in cfg.TRACKER_HEADS assert "VIDEO_WIDTH" in cfg.TRACKER_HEADS video_height = cfg.TRACKER_HEADS.get("VIDEO_HEIGHT") video_width = cfg.TRACKER_HEADS.get("VIDEO_WIDTH") max_num_instances = cfg.TRACKER_HEADS.get("MAX_NUM_INSTANCES", 200) max_lost_frame_count = cfg.TRACKER_HEADS.get("MAX_LOST_FRAME_COUNT", 0) min_box_rel_dim = cfg.TRACKER_HEADS.get("MIN_BOX_REL_DIM", 0.02) min_instance_period = cfg.TRACKER_HEADS.get("MIN_INSTANCE_PERIOD", 1) track_iou_threshold = cfg.TRACKER_HEADS.get("TRACK_IOU_THRESHOLD", 0.5) return { "_target_": "detectron2.tracking.bbox_iou_tracker.BBoxIOUTracker", "video_height": video_height, "video_width": video_width, "max_num_instances": max_num_instances, "max_lost_frame_count": max_lost_frame_count, "min_box_rel_dim": min_box_rel_dim, "min_instance_period": min_instance_period, "track_iou_threshold": track_iou_threshold, } def update(self, instances: Instances) -> Instances: """ See BaseTracker description """ instances = self._initialize_extra_fields(instances) if self._prev_instances is not None: # calculate IoU of all bbox pairs iou_all = pairwise_iou( boxes1=instances.pred_boxes, boxes2=self._prev_instances.pred_boxes, ) # sort IoU in descending order bbox_pairs = self._create_prediction_pairs(instances, iou_all) # assign previous ID to current bbox if IoU > track_iou_threshold self._reset_fields() for bbox_pair in bbox_pairs: idx = bbox_pair["idx"] prev_id = bbox_pair["prev_id"] if ( idx in self._matched_idx or prev_id in self._matched_ID or bbox_pair["IoU"] < self._track_iou_threshold ): continue instances.ID[idx] = prev_id instances.ID_period[idx] = bbox_pair["prev_period"] + 1 instances.lost_frame_count[idx] = 0 self._matched_idx.add(idx) self._matched_ID.add(prev_id) self._untracked_prev_idx.remove(bbox_pair["prev_idx"]) instances = self._assign_new_id(instances) instances = self._merge_untracked_instances(instances) self._prev_instances = copy.deepcopy(instances) return instances def _create_prediction_pairs(self, instances: Instances, iou_all: np.ndarray) -> List: """ For all instances in previous and current frames, create pairs. For each pair, store index of the instance in current frame predcitions, index in previous predictions, ID in previous predictions, IoU of the bboxes in this pair, period in previous predictions. Args: instances: D2 Instances, for predictions of the current frame iou_all: IoU for all bboxes pairs Return: A list of IoU for all pairs """ bbox_pairs = [] for i in range(len(instances)): for j in range(len(self._prev_instances)): bbox_pairs.append( { "idx": i, "prev_idx": j, "prev_id": self._prev_instances.ID[j], "IoU": iou_all[i, j], "prev_period": self._prev_instances.ID_period[j], } ) return bbox_pairs def _initialize_extra_fields(self, instances: Instances) -> Instances: """ If input instances don't have ID, ID_period, lost_frame_count fields, this method is used to initialize these fields. Args: instances: D2 Instances, for predictions of the current frame Return: D2 Instances with extra fields added """ if not instances.has("ID"): instances.set("ID", [None] * len(instances)) if not instances.has("ID_period"): instances.set("ID_period", [None] * len(instances)) if not instances.has("lost_frame_count"): instances.set("lost_frame_count", [None] * len(instances)) if self._prev_instances is None: instances.ID = list(range(len(instances))) self._id_count += len(instances) instances.ID_period = [1] * len(instances) instances.lost_frame_count = [0] * len(instances) return instances def _reset_fields(self): """ Before each uodate call, reset fields first """ self._matched_idx = set() self._matched_ID = set() self._untracked_prev_idx = set(range(len(self._prev_instances))) def _assign_new_id(self, instances: Instances) -> Instances: """ For each untracked instance, assign a new id Args: instances: D2 Instances, for predictions of the current frame Return: D2 Instances with new ID assigned """ untracked_idx = set(range(len(instances))).difference(self._matched_idx) for idx in untracked_idx: instances.ID[idx] = self._id_count self._id_count += 1 instances.ID_period[idx] = 1 instances.lost_frame_count[idx] = 0 return instances def _merge_untracked_instances(self, instances: Instances) -> Instances: """ For untracked previous instances, under certain condition, still keep them in tracking and merge with the current instances. Args: instances: D2 Instances, for predictions of the current frame Return: D2 Instances merging current instances and instances from previous frame decided to keep tracking """ untracked_instances = Instances( image_size=instances.image_size, pred_boxes=[], pred_classes=[], scores=[], ID=[], ID_period=[], lost_frame_count=[], ) prev_bboxes = list(self._prev_instances.pred_boxes) prev_classes = list(self._prev_instances.pred_classes) prev_scores = list(self._prev_instances.scores) prev_ID_period = self._prev_instances.ID_period if instances.has("pred_masks"): untracked_instances.set("pred_masks", []) prev_masks = list(self._prev_instances.pred_masks) if instances.has("pred_keypoints"): untracked_instances.set("pred_keypoints", []) prev_keypoints = list(self._prev_instances.pred_keypoints) if instances.has("pred_keypoint_heatmaps"): untracked_instances.set("pred_keypoint_heatmaps", []) prev_keypoint_heatmaps = list(self._prev_instances.pred_keypoint_heatmaps) for idx in self._untracked_prev_idx: x_left, y_top, x_right, y_bot = prev_bboxes[idx] if ( (1.0 * (x_right - x_left) / self._video_width < self._min_box_rel_dim) or (1.0 * (y_bot - y_top) / self._video_height < self._min_box_rel_dim) or self._prev_instances.lost_frame_count[idx] >= self._max_lost_frame_count or prev_ID_period[idx] <= self._min_instance_period ): continue untracked_instances.pred_boxes.append(list(prev_bboxes[idx].numpy())) untracked_instances.pred_classes.append(int(prev_classes[idx])) untracked_instances.scores.append(float(prev_scores[idx])) untracked_instances.ID.append(self._prev_instances.ID[idx]) untracked_instances.ID_period.append(self._prev_instances.ID_period[idx]) untracked_instances.lost_frame_count.append( self._prev_instances.lost_frame_count[idx] + 1 ) if instances.has("pred_masks"): untracked_instances.pred_masks.append(prev_masks[idx].numpy().astype(np.uint8)) if instances.has("pred_keypoints"): untracked_instances.pred_keypoints.append( prev_keypoints[idx].numpy().astype(np.uint8) ) if instances.has("pred_keypoint_heatmaps"): untracked_instances.pred_keypoint_heatmaps.append( prev_keypoint_heatmaps[idx].numpy().astype(np.float32) ) untracked_instances.pred_boxes = Boxes(torch.FloatTensor(untracked_instances.pred_boxes)) untracked_instances.pred_classes = torch.IntTensor(untracked_instances.pred_classes) untracked_instances.scores = torch.FloatTensor(untracked_instances.scores) if instances.has("pred_masks"): untracked_instances.pred_masks = torch.IntTensor(untracked_instances.pred_masks) if instances.has("pred_keypoints"): untracked_instances.pred_keypoints = torch.IntTensor(untracked_instances.pred_keypoints) if instances.has("pred_keypoint_heatmaps"): untracked_instances.pred_keypoint_heatmaps = torch.FloatTensor( untracked_instances.pred_keypoint_heatmaps ) return Instances.cat( [ instances, untracked_instances, ] ) ================================================ FILE: annotator/oneformer/detectron2/tracking/hungarian_tracker.py ================================================ #!/usr/bin/env python3 # Copyright 2004-present Facebook. All Rights Reserved. import copy import numpy as np from typing import Dict import torch from scipy.optimize import linear_sum_assignment from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.structures import Boxes, Instances from ..config.config import CfgNode as CfgNode_ from .base_tracker import BaseTracker class BaseHungarianTracker(BaseTracker): """ A base class for all Hungarian trackers """ @configurable def __init__( self, video_height: int, video_width: int, max_num_instances: int = 200, max_lost_frame_count: int = 0, min_box_rel_dim: float = 0.02, min_instance_period: int = 1, **kwargs ): """ Args: video_height: height the video frame video_width: width of the video frame max_num_instances: maximum number of id allowed to be tracked max_lost_frame_count: maximum number of frame an id can lost tracking exceed this number, an id is considered as lost forever min_box_rel_dim: a percentage, smaller than this dimension, a bbox is removed from tracking min_instance_period: an instance will be shown after this number of period since its first showing up in the video """ super().__init__(**kwargs) self._video_height = video_height self._video_width = video_width self._max_num_instances = max_num_instances self._max_lost_frame_count = max_lost_frame_count self._min_box_rel_dim = min_box_rel_dim self._min_instance_period = min_instance_period @classmethod def from_config(cls, cfg: CfgNode_) -> Dict: raise NotImplementedError("Calling HungarianTracker::from_config") def build_cost_matrix(self, instances: Instances, prev_instances: Instances) -> np.ndarray: raise NotImplementedError("Calling HungarianTracker::build_matrix") def update(self, instances: Instances) -> Instances: if instances.has("pred_keypoints"): raise NotImplementedError("Need to add support for keypoints") instances = self._initialize_extra_fields(instances) if self._prev_instances is not None: self._untracked_prev_idx = set(range(len(self._prev_instances))) cost_matrix = self.build_cost_matrix(instances, self._prev_instances) matched_idx, matched_prev_idx = linear_sum_assignment(cost_matrix) instances = self._process_matched_idx(instances, matched_idx, matched_prev_idx) instances = self._process_unmatched_idx(instances, matched_idx) instances = self._process_unmatched_prev_idx(instances, matched_prev_idx) self._prev_instances = copy.deepcopy(instances) return instances def _initialize_extra_fields(self, instances: Instances) -> Instances: """ If input instances don't have ID, ID_period, lost_frame_count fields, this method is used to initialize these fields. Args: instances: D2 Instances, for predictions of the current frame Return: D2 Instances with extra fields added """ if not instances.has("ID"): instances.set("ID", [None] * len(instances)) if not instances.has("ID_period"): instances.set("ID_period", [None] * len(instances)) if not instances.has("lost_frame_count"): instances.set("lost_frame_count", [None] * len(instances)) if self._prev_instances is None: instances.ID = list(range(len(instances))) self._id_count += len(instances) instances.ID_period = [1] * len(instances) instances.lost_frame_count = [0] * len(instances) return instances def _process_matched_idx( self, instances: Instances, matched_idx: np.ndarray, matched_prev_idx: np.ndarray ) -> Instances: assert matched_idx.size == matched_prev_idx.size for i in range(matched_idx.size): instances.ID[matched_idx[i]] = self._prev_instances.ID[matched_prev_idx[i]] instances.ID_period[matched_idx[i]] = ( self._prev_instances.ID_period[matched_prev_idx[i]] + 1 ) instances.lost_frame_count[matched_idx[i]] = 0 return instances def _process_unmatched_idx(self, instances: Instances, matched_idx: np.ndarray) -> Instances: untracked_idx = set(range(len(instances))).difference(set(matched_idx)) for idx in untracked_idx: instances.ID[idx] = self._id_count self._id_count += 1 instances.ID_period[idx] = 1 instances.lost_frame_count[idx] = 0 return instances def _process_unmatched_prev_idx( self, instances: Instances, matched_prev_idx: np.ndarray ) -> Instances: untracked_instances = Instances( image_size=instances.image_size, pred_boxes=[], pred_masks=[], pred_classes=[], scores=[], ID=[], ID_period=[], lost_frame_count=[], ) prev_bboxes = list(self._prev_instances.pred_boxes) prev_classes = list(self._prev_instances.pred_classes) prev_scores = list(self._prev_instances.scores) prev_ID_period = self._prev_instances.ID_period if instances.has("pred_masks"): prev_masks = list(self._prev_instances.pred_masks) untracked_prev_idx = set(range(len(self._prev_instances))).difference(set(matched_prev_idx)) for idx in untracked_prev_idx: x_left, y_top, x_right, y_bot = prev_bboxes[idx] if ( (1.0 * (x_right - x_left) / self._video_width < self._min_box_rel_dim) or (1.0 * (y_bot - y_top) / self._video_height < self._min_box_rel_dim) or self._prev_instances.lost_frame_count[idx] >= self._max_lost_frame_count or prev_ID_period[idx] <= self._min_instance_period ): continue untracked_instances.pred_boxes.append(list(prev_bboxes[idx].numpy())) untracked_instances.pred_classes.append(int(prev_classes[idx])) untracked_instances.scores.append(float(prev_scores[idx])) untracked_instances.ID.append(self._prev_instances.ID[idx]) untracked_instances.ID_period.append(self._prev_instances.ID_period[idx]) untracked_instances.lost_frame_count.append( self._prev_instances.lost_frame_count[idx] + 1 ) if instances.has("pred_masks"): untracked_instances.pred_masks.append(prev_masks[idx].numpy().astype(np.uint8)) untracked_instances.pred_boxes = Boxes(torch.FloatTensor(untracked_instances.pred_boxes)) untracked_instances.pred_classes = torch.IntTensor(untracked_instances.pred_classes) untracked_instances.scores = torch.FloatTensor(untracked_instances.scores) if instances.has("pred_masks"): untracked_instances.pred_masks = torch.IntTensor(untracked_instances.pred_masks) else: untracked_instances.remove("pred_masks") return Instances.cat( [ instances, untracked_instances, ] ) ================================================ FILE: annotator/oneformer/detectron2/tracking/iou_weighted_hungarian_bbox_iou_tracker.py ================================================ #!/usr/bin/env python3 # Copyright 2004-present Facebook. All Rights Reserved. import numpy as np from typing import List from annotator.oneformer.detectron2.config import CfgNode as CfgNode_ from annotator.oneformer.detectron2.config import configurable from .base_tracker import TRACKER_HEADS_REGISTRY from .vanilla_hungarian_bbox_iou_tracker import VanillaHungarianBBoxIOUTracker @TRACKER_HEADS_REGISTRY.register() class IOUWeightedHungarianBBoxIOUTracker(VanillaHungarianBBoxIOUTracker): """ A tracker using IoU as weight in Hungarian algorithm, also known as Munkres or Kuhn-Munkres algorithm """ @configurable def __init__( self, *, video_height: int, video_width: int, max_num_instances: int = 200, max_lost_frame_count: int = 0, min_box_rel_dim: float = 0.02, min_instance_period: int = 1, track_iou_threshold: float = 0.5, **kwargs, ): """ Args: video_height: height the video frame video_width: width of the video frame max_num_instances: maximum number of id allowed to be tracked max_lost_frame_count: maximum number of frame an id can lost tracking exceed this number, an id is considered as lost forever min_box_rel_dim: a percentage, smaller than this dimension, a bbox is removed from tracking min_instance_period: an instance will be shown after this number of period since its first showing up in the video track_iou_threshold: iou threshold, below this number a bbox pair is removed from tracking """ super().__init__( video_height=video_height, video_width=video_width, max_num_instances=max_num_instances, max_lost_frame_count=max_lost_frame_count, min_box_rel_dim=min_box_rel_dim, min_instance_period=min_instance_period, track_iou_threshold=track_iou_threshold, ) @classmethod def from_config(cls, cfg: CfgNode_): """ Old style initialization using CfgNode Args: cfg: D2 CfgNode, config file Return: dictionary storing arguments for __init__ method """ assert "VIDEO_HEIGHT" in cfg.TRACKER_HEADS assert "VIDEO_WIDTH" in cfg.TRACKER_HEADS video_height = cfg.TRACKER_HEADS.get("VIDEO_HEIGHT") video_width = cfg.TRACKER_HEADS.get("VIDEO_WIDTH") max_num_instances = cfg.TRACKER_HEADS.get("MAX_NUM_INSTANCES", 200) max_lost_frame_count = cfg.TRACKER_HEADS.get("MAX_LOST_FRAME_COUNT", 0) min_box_rel_dim = cfg.TRACKER_HEADS.get("MIN_BOX_REL_DIM", 0.02) min_instance_period = cfg.TRACKER_HEADS.get("MIN_INSTANCE_PERIOD", 1) track_iou_threshold = cfg.TRACKER_HEADS.get("TRACK_IOU_THRESHOLD", 0.5) return { "_target_": "detectron2.tracking.iou_weighted_hungarian_bbox_iou_tracker.IOUWeightedHungarianBBoxIOUTracker", # noqa "video_height": video_height, "video_width": video_width, "max_num_instances": max_num_instances, "max_lost_frame_count": max_lost_frame_count, "min_box_rel_dim": min_box_rel_dim, "min_instance_period": min_instance_period, "track_iou_threshold": track_iou_threshold, } def assign_cost_matrix_values(self, cost_matrix: np.ndarray, bbox_pairs: List) -> np.ndarray: """ Based on IoU for each pair of bbox, assign the associated value in cost matrix Args: cost_matrix: np.ndarray, initialized 2D array with target dimensions bbox_pairs: list of bbox pair, in each pair, iou value is stored Return: np.ndarray, cost_matrix with assigned values """ for pair in bbox_pairs: # assign (-1 * IoU) for above threshold pairs, algorithms will minimize cost cost_matrix[pair["idx"]][pair["prev_idx"]] = -1 * pair["IoU"] return cost_matrix ================================================ FILE: annotator/oneformer/detectron2/tracking/utils.py ================================================ #!/usr/bin/env python3 import numpy as np from typing import List from annotator.oneformer.detectron2.structures import Instances def create_prediction_pairs( instances: Instances, prev_instances: Instances, iou_all: np.ndarray, threshold: float = 0.5, ) -> List: """ Args: instances: predictions from current frame prev_instances: predictions from previous frame iou_all: 2D numpy array containing iou for each bbox pair threshold: below the threshold, doesn't consider the pair of bbox is valid Return: List of bbox pairs """ bbox_pairs = [] for i in range(len(instances)): for j in range(len(prev_instances)): if iou_all[i, j] < threshold: continue bbox_pairs.append( { "idx": i, "prev_idx": j, "prev_id": prev_instances.ID[j], "IoU": iou_all[i, j], "prev_period": prev_instances.ID_period[j], } ) return bbox_pairs LARGE_COST_VALUE = 100000 ================================================ FILE: annotator/oneformer/detectron2/tracking/vanilla_hungarian_bbox_iou_tracker.py ================================================ #!/usr/bin/env python3 # Copyright 2004-present Facebook. All Rights Reserved. import numpy as np from typing import List from annotator.oneformer.detectron2.config import CfgNode as CfgNode_ from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.structures import Instances from annotator.oneformer.detectron2.structures.boxes import pairwise_iou from annotator.oneformer.detectron2.tracking.utils import LARGE_COST_VALUE, create_prediction_pairs from .base_tracker import TRACKER_HEADS_REGISTRY from .hungarian_tracker import BaseHungarianTracker @TRACKER_HEADS_REGISTRY.register() class VanillaHungarianBBoxIOUTracker(BaseHungarianTracker): """ Hungarian algo based tracker using bbox iou as metric """ @configurable def __init__( self, *, video_height: int, video_width: int, max_num_instances: int = 200, max_lost_frame_count: int = 0, min_box_rel_dim: float = 0.02, min_instance_period: int = 1, track_iou_threshold: float = 0.5, **kwargs, ): """ Args: video_height: height the video frame video_width: width of the video frame max_num_instances: maximum number of id allowed to be tracked max_lost_frame_count: maximum number of frame an id can lost tracking exceed this number, an id is considered as lost forever min_box_rel_dim: a percentage, smaller than this dimension, a bbox is removed from tracking min_instance_period: an instance will be shown after this number of period since its first showing up in the video track_iou_threshold: iou threshold, below this number a bbox pair is removed from tracking """ super().__init__( video_height=video_height, video_width=video_width, max_num_instances=max_num_instances, max_lost_frame_count=max_lost_frame_count, min_box_rel_dim=min_box_rel_dim, min_instance_period=min_instance_period, ) self._track_iou_threshold = track_iou_threshold @classmethod def from_config(cls, cfg: CfgNode_): """ Old style initialization using CfgNode Args: cfg: D2 CfgNode, config file Return: dictionary storing arguments for __init__ method """ assert "VIDEO_HEIGHT" in cfg.TRACKER_HEADS assert "VIDEO_WIDTH" in cfg.TRACKER_HEADS video_height = cfg.TRACKER_HEADS.get("VIDEO_HEIGHT") video_width = cfg.TRACKER_HEADS.get("VIDEO_WIDTH") max_num_instances = cfg.TRACKER_HEADS.get("MAX_NUM_INSTANCES", 200) max_lost_frame_count = cfg.TRACKER_HEADS.get("MAX_LOST_FRAME_COUNT", 0) min_box_rel_dim = cfg.TRACKER_HEADS.get("MIN_BOX_REL_DIM", 0.02) min_instance_period = cfg.TRACKER_HEADS.get("MIN_INSTANCE_PERIOD", 1) track_iou_threshold = cfg.TRACKER_HEADS.get("TRACK_IOU_THRESHOLD", 0.5) return { "_target_": "detectron2.tracking.vanilla_hungarian_bbox_iou_tracker.VanillaHungarianBBoxIOUTracker", # noqa "video_height": video_height, "video_width": video_width, "max_num_instances": max_num_instances, "max_lost_frame_count": max_lost_frame_count, "min_box_rel_dim": min_box_rel_dim, "min_instance_period": min_instance_period, "track_iou_threshold": track_iou_threshold, } def build_cost_matrix(self, instances: Instances, prev_instances: Instances) -> np.ndarray: """ Build the cost matrix for assignment problem (https://en.wikipedia.org/wiki/Assignment_problem) Args: instances: D2 Instances, for current frame predictions prev_instances: D2 Instances, for previous frame predictions Return: the cost matrix in numpy array """ assert instances is not None and prev_instances is not None # calculate IoU of all bbox pairs iou_all = pairwise_iou( boxes1=instances.pred_boxes, boxes2=self._prev_instances.pred_boxes, ) bbox_pairs = create_prediction_pairs( instances, self._prev_instances, iou_all, threshold=self._track_iou_threshold ) # assign large cost value to make sure pair below IoU threshold won't be matched cost_matrix = np.full((len(instances), len(prev_instances)), LARGE_COST_VALUE) return self.assign_cost_matrix_values(cost_matrix, bbox_pairs) def assign_cost_matrix_values(self, cost_matrix: np.ndarray, bbox_pairs: List) -> np.ndarray: """ Based on IoU for each pair of bbox, assign the associated value in cost matrix Args: cost_matrix: np.ndarray, initialized 2D array with target dimensions bbox_pairs: list of bbox pair, in each pair, iou value is stored Return: np.ndarray, cost_matrix with assigned values """ for pair in bbox_pairs: # assign -1 for IoU above threshold pairs, algorithms will minimize cost cost_matrix[pair["idx"]][pair["prev_idx"]] = -1 return cost_matrix ================================================ FILE: annotator/oneformer/detectron2/utils/README.md ================================================ # Utility functions This folder contain utility functions that are not used in the core library, but are useful for building models or training code using the config system. ================================================ FILE: annotator/oneformer/detectron2/utils/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: annotator/oneformer/detectron2/utils/analysis.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # -*- coding: utf-8 -*- import typing from typing import Any, List import fvcore from fvcore.nn import activation_count, flop_count, parameter_count, parameter_count_table from torch import nn from annotator.oneformer.detectron2.export import TracingAdapter __all__ = [ "activation_count_operators", "flop_count_operators", "parameter_count_table", "parameter_count", "FlopCountAnalysis", ] FLOPS_MODE = "flops" ACTIVATIONS_MODE = "activations" # Some extra ops to ignore from counting, including elementwise and reduction ops _IGNORED_OPS = { "aten::add", "aten::add_", "aten::argmax", "aten::argsort", "aten::batch_norm", "aten::constant_pad_nd", "aten::div", "aten::div_", "aten::exp", "aten::log2", "aten::max_pool2d", "aten::meshgrid", "aten::mul", "aten::mul_", "aten::neg", "aten::nonzero_numpy", "aten::reciprocal", "aten::repeat_interleave", "aten::rsub", "aten::sigmoid", "aten::sigmoid_", "aten::softmax", "aten::sort", "aten::sqrt", "aten::sub", "torchvision::nms", # TODO estimate flop for nms } class FlopCountAnalysis(fvcore.nn.FlopCountAnalysis): """ Same as :class:`fvcore.nn.FlopCountAnalysis`, but supports detectron2 models. """ def __init__(self, model, inputs): """ Args: model (nn.Module): inputs (Any): inputs of the given model. Does not have to be tuple of tensors. """ wrapper = TracingAdapter(model, inputs, allow_non_tensor=True) super().__init__(wrapper, wrapper.flattened_inputs) self.set_op_handle(**{k: None for k in _IGNORED_OPS}) def flop_count_operators(model: nn.Module, inputs: list) -> typing.DefaultDict[str, float]: """ Implement operator-level flops counting using jit. This is a wrapper of :func:`fvcore.nn.flop_count` and adds supports for standard detection models in detectron2. Please use :class:`FlopCountAnalysis` for more advanced functionalities. Note: The function runs the input through the model to compute flops. The flops of a detection model is often input-dependent, for example, the flops of box & mask head depends on the number of proposals & the number of detected objects. Therefore, the flops counting using a single input may not accurately reflect the computation cost of a model. It's recommended to average across a number of inputs. Args: model: a detectron2 model that takes `list[dict]` as input. inputs (list[dict]): inputs to model, in detectron2's standard format. Only "image" key will be used. supported_ops (dict[str, Handle]): see documentation of :func:`fvcore.nn.flop_count` Returns: Counter: Gflop count per operator """ old_train = model.training model.eval() ret = FlopCountAnalysis(model, inputs).by_operator() model.train(old_train) return {k: v / 1e9 for k, v in ret.items()} def activation_count_operators( model: nn.Module, inputs: list, **kwargs ) -> typing.DefaultDict[str, float]: """ Implement operator-level activations counting using jit. This is a wrapper of fvcore.nn.activation_count, that supports standard detection models in detectron2. Note: The function runs the input through the model to compute activations. The activations of a detection model is often input-dependent, for example, the activations of box & mask head depends on the number of proposals & the number of detected objects. Args: model: a detectron2 model that takes `list[dict]` as input. inputs (list[dict]): inputs to model, in detectron2's standard format. Only "image" key will be used. Returns: Counter: activation count per operator """ return _wrapper_count_operators(model=model, inputs=inputs, mode=ACTIVATIONS_MODE, **kwargs) def _wrapper_count_operators( model: nn.Module, inputs: list, mode: str, **kwargs ) -> typing.DefaultDict[str, float]: # ignore some ops supported_ops = {k: lambda *args, **kwargs: {} for k in _IGNORED_OPS} supported_ops.update(kwargs.pop("supported_ops", {})) kwargs["supported_ops"] = supported_ops assert len(inputs) == 1, "Please use batch size=1" tensor_input = inputs[0]["image"] inputs = [{"image": tensor_input}] # remove other keys, in case there are any old_train = model.training if isinstance(model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel)): model = model.module wrapper = TracingAdapter(model, inputs) wrapper.eval() if mode == FLOPS_MODE: ret = flop_count(wrapper, (tensor_input,), **kwargs) elif mode == ACTIVATIONS_MODE: ret = activation_count(wrapper, (tensor_input,), **kwargs) else: raise NotImplementedError("Count for mode {} is not supported yet.".format(mode)) # compatible with change in fvcore if isinstance(ret, tuple): ret = ret[0] model.train(old_train) return ret def find_unused_parameters(model: nn.Module, inputs: Any) -> List[str]: """ Given a model, find parameters that do not contribute to the loss. Args: model: a model in training mode that returns losses inputs: argument or a tuple of arguments. Inputs of the model Returns: list[str]: the name of unused parameters """ assert model.training for _, prm in model.named_parameters(): prm.grad = None if isinstance(inputs, tuple): losses = model(*inputs) else: losses = model(inputs) if isinstance(losses, dict): losses = sum(losses.values()) losses.backward() unused: List[str] = [] for name, prm in model.named_parameters(): if prm.grad is None: unused.append(name) prm.grad = None return unused ================================================ FILE: annotator/oneformer/detectron2/utils/collect_env.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import importlib import numpy as np import os import re import subprocess import sys from collections import defaultdict import PIL import torch import torchvision from tabulate import tabulate __all__ = ["collect_env_info"] def collect_torch_env(): try: import torch.__config__ return torch.__config__.show() except ImportError: # compatible with older versions of pytorch from torch.utils.collect_env import get_pretty_env_info return get_pretty_env_info() def get_env_module(): var_name = "DETECTRON2_ENV_MODULE" return var_name, os.environ.get(var_name, "") def detect_compute_compatibility(CUDA_HOME, so_file): try: cuobjdump = os.path.join(CUDA_HOME, "bin", "cuobjdump") if os.path.isfile(cuobjdump): output = subprocess.check_output( "'{}' --list-elf '{}'".format(cuobjdump, so_file), shell=True ) output = output.decode("utf-8").strip().split("\n") arch = [] for line in output: line = re.findall(r"\.sm_([0-9]*)\.", line)[0] arch.append(".".join(line)) arch = sorted(set(arch)) return ", ".join(arch) else: return so_file + "; cannot find cuobjdump" except Exception: # unhandled failure return so_file def collect_env_info(): has_gpu = torch.cuda.is_available() # true for both CUDA & ROCM torch_version = torch.__version__ # NOTE that CUDA_HOME/ROCM_HOME could be None even when CUDA runtime libs are functional from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME has_rocm = False if (getattr(torch.version, "hip", None) is not None) and (ROCM_HOME is not None): has_rocm = True has_cuda = has_gpu and (not has_rocm) data = [] data.append(("sys.platform", sys.platform)) # check-template.yml depends on it data.append(("Python", sys.version.replace("\n", ""))) data.append(("numpy", np.__version__)) try: import annotator.oneformer.detectron2 # noqa data.append( ("detectron2", detectron2.__version__ + " @" + os.path.dirname(detectron2.__file__)) ) except ImportError: data.append(("detectron2", "failed to import")) except AttributeError: data.append(("detectron2", "imported a wrong installation")) try: import annotator.oneformer.detectron2._C as _C except ImportError as e: data.append(("detectron2._C", f"not built correctly: {e}")) # print system compilers when extension fails to build if sys.platform != "win32": # don't know what to do for windows try: # this is how torch/utils/cpp_extensions.py choose compiler cxx = os.environ.get("CXX", "c++") cxx = subprocess.check_output("'{}' --version".format(cxx), shell=True) cxx = cxx.decode("utf-8").strip().split("\n")[0] except subprocess.SubprocessError: cxx = "Not found" data.append(("Compiler ($CXX)", cxx)) if has_cuda and CUDA_HOME is not None: try: nvcc = os.path.join(CUDA_HOME, "bin", "nvcc") nvcc = subprocess.check_output("'{}' -V".format(nvcc), shell=True) nvcc = nvcc.decode("utf-8").strip().split("\n")[-1] except subprocess.SubprocessError: nvcc = "Not found" data.append(("CUDA compiler", nvcc)) if has_cuda and sys.platform != "win32": try: so_file = importlib.util.find_spec("detectron2._C").origin except (ImportError, AttributeError): pass else: data.append( ("detectron2 arch flags", detect_compute_compatibility(CUDA_HOME, so_file)) ) else: # print compilers that are used to build extension data.append(("Compiler", _C.get_compiler_version())) data.append(("CUDA compiler", _C.get_cuda_version())) # cuda or hip if has_cuda and getattr(_C, "has_cuda", lambda: True)(): data.append( ("detectron2 arch flags", detect_compute_compatibility(CUDA_HOME, _C.__file__)) ) data.append(get_env_module()) data.append(("PyTorch", torch_version + " @" + os.path.dirname(torch.__file__))) data.append(("PyTorch debug build", torch.version.debug)) try: data.append(("torch._C._GLIBCXX_USE_CXX11_ABI", torch._C._GLIBCXX_USE_CXX11_ABI)) except Exception: pass if not has_gpu: has_gpu_text = "No: torch.cuda.is_available() == False" else: has_gpu_text = "Yes" data.append(("GPU available", has_gpu_text)) if has_gpu: devices = defaultdict(list) for k in range(torch.cuda.device_count()): cap = ".".join((str(x) for x in torch.cuda.get_device_capability(k))) name = torch.cuda.get_device_name(k) + f" (arch={cap})" devices[name].append(str(k)) for name, devids in devices.items(): data.append(("GPU " + ",".join(devids), name)) if has_rocm: msg = " - invalid!" if not (ROCM_HOME and os.path.isdir(ROCM_HOME)) else "" data.append(("ROCM_HOME", str(ROCM_HOME) + msg)) else: try: from torch.utils.collect_env import get_nvidia_driver_version, run as _run data.append(("Driver version", get_nvidia_driver_version(_run))) except Exception: pass msg = " - invalid!" if not (CUDA_HOME and os.path.isdir(CUDA_HOME)) else "" data.append(("CUDA_HOME", str(CUDA_HOME) + msg)) cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) if cuda_arch_list: data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list)) data.append(("Pillow", PIL.__version__)) try: data.append( ( "torchvision", str(torchvision.__version__) + " @" + os.path.dirname(torchvision.__file__), ) ) if has_cuda: try: torchvision_C = importlib.util.find_spec("torchvision._C").origin msg = detect_compute_compatibility(CUDA_HOME, torchvision_C) data.append(("torchvision arch flags", msg)) except (ImportError, AttributeError): data.append(("torchvision._C", "Not found")) except AttributeError: data.append(("torchvision", "unknown")) try: import fvcore data.append(("fvcore", fvcore.__version__)) except (ImportError, AttributeError): pass try: import iopath data.append(("iopath", iopath.__version__)) except (ImportError, AttributeError): pass try: import cv2 data.append(("cv2", cv2.__version__)) except (ImportError, AttributeError): data.append(("cv2", "Not found")) env_str = tabulate(data) + "\n" env_str += collect_torch_env() return env_str def test_nccl_ops(): num_gpu = torch.cuda.device_count() if os.access("/tmp", os.W_OK): import torch.multiprocessing as mp dist_url = "file:///tmp/nccl_tmp_file" print("Testing NCCL connectivity ... this should not hang.") mp.spawn(_test_nccl_worker, nprocs=num_gpu, args=(num_gpu, dist_url), daemon=False) print("NCCL succeeded.") def _test_nccl_worker(rank, num_gpu, dist_url): import torch.distributed as dist dist.init_process_group(backend="NCCL", init_method=dist_url, rank=rank, world_size=num_gpu) dist.barrier(device_ids=[rank]) if __name__ == "__main__": try: from annotator.oneformer.detectron2.utils.collect_env import collect_env_info as f print(f()) except ImportError: print(collect_env_info()) if torch.cuda.is_available(): num_gpu = torch.cuda.device_count() for k in range(num_gpu): device = f"cuda:{k}" try: x = torch.tensor([1, 2.0], dtype=torch.float32) x = x.to(device) except Exception as e: print( f"Unable to copy tensor to device={device}: {e}. " "Your CUDA environment is broken." ) if num_gpu > 1: test_nccl_ops() ================================================ FILE: annotator/oneformer/detectron2/utils/colormap.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. """ An awesome colormap for really neat visualizations. Copied from Detectron, and removed gray colors. """ import numpy as np import random __all__ = ["colormap", "random_color", "random_colors"] # fmt: off # RGB: _COLORS = np.array( [ 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 ] ).astype(np.float32).reshape(-1, 3) # fmt: on def colormap(rgb=False, maximum=255): """ Args: rgb (bool): whether to return RGB colors or BGR colors. maximum (int): either 255 or 1 Returns: ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1] """ assert maximum in [255, 1], maximum c = _COLORS * maximum if not rgb: c = c[:, ::-1] return c def random_color(rgb=False, maximum=255): """ Args: rgb (bool): whether to return RGB colors or BGR colors. maximum (int): either 255 or 1 Returns: ndarray: a vector of 3 numbers """ idx = np.random.randint(0, len(_COLORS)) ret = _COLORS[idx] * maximum if not rgb: ret = ret[::-1] return ret def random_colors(N, rgb=False, maximum=255): """ Args: N (int): number of unique colors needed rgb (bool): whether to return RGB colors or BGR colors. maximum (int): either 255 or 1 Returns: ndarray: a list of random_color """ indices = random.sample(range(len(_COLORS)), N) ret = [_COLORS[i] * maximum for i in indices] if not rgb: ret = [x[::-1] for x in ret] return ret if __name__ == "__main__": import cv2 size = 100 H, W = 10, 10 canvas = np.random.rand(H * size, W * size, 3).astype("float32") for h in range(H): for w in range(W): idx = h * W + w if idx >= len(_COLORS): break canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx] cv2.imshow("a", canvas) cv2.waitKey(0) ================================================ FILE: annotator/oneformer/detectron2/utils/comm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. """ This file contains primitives for multi-gpu communication. This is useful when doing distributed training. """ import functools import numpy as np import torch import torch.distributed as dist _LOCAL_PROCESS_GROUP = None _MISSING_LOCAL_PG_ERROR = ( "Local process group is not yet created! Please use detectron2's `launch()` " "to start processes and initialize pytorch process group. If you need to start " "processes in other ways, please call comm.create_local_process_group(" "num_workers_per_machine) after calling torch.distributed.init_process_group()." ) def get_world_size() -> int: if not dist.is_available(): return 1 if not dist.is_initialized(): return 1 return dist.get_world_size() def get_rank() -> int: if not dist.is_available(): return 0 if not dist.is_initialized(): return 0 return dist.get_rank() @functools.lru_cache() def create_local_process_group(num_workers_per_machine: int) -> None: """ Create a process group that contains ranks within the same machine. Detectron2's launch() in engine/launch.py will call this function. If you start workers without launch(), you'll have to also call this. Otherwise utilities like `get_local_rank()` will not work. This function contains a barrier. All processes must call it together. Args: num_workers_per_machine: the number of worker processes per machine. Typically the number of GPUs. """ global _LOCAL_PROCESS_GROUP assert _LOCAL_PROCESS_GROUP is None assert get_world_size() % num_workers_per_machine == 0 num_machines = get_world_size() // num_workers_per_machine machine_rank = get_rank() // num_workers_per_machine for i in range(num_machines): ranks_on_i = list(range(i * num_workers_per_machine, (i + 1) * num_workers_per_machine)) pg = dist.new_group(ranks_on_i) if i == machine_rank: _LOCAL_PROCESS_GROUP = pg def get_local_process_group(): """ Returns: A torch process group which only includes processes that are on the same machine as the current process. This group can be useful for communication within a machine, e.g. a per-machine SyncBN. """ assert _LOCAL_PROCESS_GROUP is not None, _MISSING_LOCAL_PG_ERROR return _LOCAL_PROCESS_GROUP def get_local_rank() -> int: """ Returns: The rank of the current process within the local (per-machine) process group. """ if not dist.is_available(): return 0 if not dist.is_initialized(): return 0 assert _LOCAL_PROCESS_GROUP is not None, _MISSING_LOCAL_PG_ERROR return dist.get_rank(group=_LOCAL_PROCESS_GROUP) def get_local_size() -> int: """ Returns: The size of the per-machine process group, i.e. the number of processes per machine. """ if not dist.is_available(): return 1 if not dist.is_initialized(): return 1 assert _LOCAL_PROCESS_GROUP is not None, _MISSING_LOCAL_PG_ERROR return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) def is_main_process() -> bool: return get_rank() == 0 def synchronize(): """ Helper function to synchronize (barrier) among all processes when using distributed training """ if not dist.is_available(): return if not dist.is_initialized(): return world_size = dist.get_world_size() if world_size == 1: return if dist.get_backend() == dist.Backend.NCCL: # This argument is needed to avoid warnings. # It's valid only for NCCL backend. dist.barrier(device_ids=[torch.cuda.current_device()]) else: dist.barrier() @functools.lru_cache() def _get_global_gloo_group(): """ Return a process group based on gloo backend, containing all the ranks The result is cached. """ if dist.get_backend() == "nccl": return dist.new_group(backend="gloo") else: return dist.group.WORLD def all_gather(data, group=None): """ Run all_gather on arbitrary picklable data (not necessarily tensors). Args: data: any picklable object group: a torch process group. By default, will use a group which contains all ranks on gloo backend. Returns: list[data]: list of data gathered from each rank """ if get_world_size() == 1: return [data] if group is None: group = _get_global_gloo_group() # use CPU group by default, to reduce GPU RAM usage. world_size = dist.get_world_size(group) if world_size == 1: return [data] output = [None for _ in range(world_size)] dist.all_gather_object(output, data, group=group) return output def gather(data, dst=0, group=None): """ Run gather on arbitrary picklable data (not necessarily tensors). Args: data: any picklable object dst (int): destination rank group: a torch process group. By default, will use a group which contains all ranks on gloo backend. Returns: list[data]: on dst, a list of data gathered from each rank. Otherwise, an empty list. """ if get_world_size() == 1: return [data] if group is None: group = _get_global_gloo_group() world_size = dist.get_world_size(group=group) if world_size == 1: return [data] rank = dist.get_rank(group=group) if rank == dst: output = [None for _ in range(world_size)] dist.gather_object(data, output, dst=dst, group=group) return output else: dist.gather_object(data, None, dst=dst, group=group) return [] def shared_random_seed(): """ Returns: int: a random number that is the same across all workers. If workers need a shared RNG, they can use this shared seed to create one. All workers must call this function, otherwise it will deadlock. """ ints = np.random.randint(2**31) all_ints = all_gather(ints) return all_ints[0] def reduce_dict(input_dict, average=True): """ Reduce the values in the dictionary from all processes so that process with rank 0 has the reduced results. Args: input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor. average (bool): whether to do average or sum Returns: a dict with the same keys as input_dict, after reduction. """ world_size = get_world_size() if world_size < 2: return input_dict with torch.no_grad(): names = [] values = [] # sort the keys so that they are consistent across processes for k in sorted(input_dict.keys()): names.append(k) values.append(input_dict[k]) values = torch.stack(values, dim=0) dist.reduce(values, dst=0) if dist.get_rank() == 0 and average: # only main process gets accumulated, so only divide by # world_size in this case values /= world_size reduced_dict = {k: v for k, v in zip(names, values)} return reduced_dict ================================================ FILE: annotator/oneformer/detectron2/utils/develop.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. """ Utilities for developers only. These are not visible to users (not automatically imported). And should not appeared in docs.""" # adapted from https://github.com/tensorpack/tensorpack/blob/master/tensorpack/utils/develop.py def create_dummy_class(klass, dependency, message=""): """ When a dependency of a class is not available, create a dummy class which throws ImportError when used. Args: klass (str): name of the class. dependency (str): name of the dependency. message: extra message to print Returns: class: a class object """ err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass) if message: err = err + " " + message class _DummyMetaClass(type): # throw error on class attribute access def __getattr__(_, __): # noqa: B902 raise ImportError(err) class _Dummy(object, metaclass=_DummyMetaClass): # throw error on constructor def __init__(self, *args, **kwargs): raise ImportError(err) return _Dummy def create_dummy_func(func, dependency, message=""): """ When a dependency of a function is not available, create a dummy function which throws ImportError when used. Args: func (str): name of the function. dependency (str or list[str]): name(s) of the dependency. message: extra message to print Returns: function: a function object """ err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func) if message: err = err + " " + message if isinstance(dependency, (list, tuple)): dependency = ",".join(dependency) def _dummy(*args, **kwargs): raise ImportError(err) return _dummy ================================================ FILE: annotator/oneformer/detectron2/utils/env.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import importlib import importlib.util import logging import numpy as np import os import random import sys from datetime import datetime import torch __all__ = ["seed_all_rng"] TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2]) """ PyTorch version as a tuple of 2 ints. Useful for comparison. """ DOC_BUILDING = os.getenv("_DOC_BUILDING", False) # set in docs/conf.py """ Whether we're building documentation. """ def seed_all_rng(seed=None): """ Set the random seed for the RNG in torch, numpy and python. Args: seed (int): if None, will use a strong random seed. """ if seed is None: seed = ( os.getpid() + int(datetime.now().strftime("%S%f")) + int.from_bytes(os.urandom(2), "big") ) logger = logging.getLogger(__name__) logger.info("Using a generated random seed {}".format(seed)) np.random.seed(seed) torch.manual_seed(seed) random.seed(seed) os.environ["PYTHONHASHSEED"] = str(seed) # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path def _import_file(module_name, file_path, make_importable=False): spec = importlib.util.spec_from_file_location(module_name, file_path) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) if make_importable: sys.modules[module_name] = module return module def _configure_libraries(): """ Configurations for some libraries. """ # An environment option to disable `import cv2` globally, # in case it leads to negative performance impact disable_cv2 = int(os.environ.get("DETECTRON2_DISABLE_CV2", False)) if disable_cv2: sys.modules["cv2"] = None else: # Disable opencl in opencv since its interaction with cuda often has negative effects # This envvar is supported after OpenCV 3.4.0 os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled" try: import cv2 if int(cv2.__version__.split(".")[0]) >= 3: cv2.ocl.setUseOpenCL(False) except ModuleNotFoundError: # Other types of ImportError, if happened, should not be ignored. # Because a failed opencv import could mess up address space # https://github.com/skvark/opencv-python/issues/381 pass def get_version(module, digit=2): return tuple(map(int, module.__version__.split(".")[:digit])) # fmt: off assert get_version(torch) >= (1, 4), "Requires torch>=1.4" import fvcore assert get_version(fvcore, 3) >= (0, 1, 2), "Requires fvcore>=0.1.2" import yaml assert get_version(yaml) >= (5, 1), "Requires pyyaml>=5.1" # fmt: on _ENV_SETUP_DONE = False def setup_environment(): """Perform environment setup work. The default setup is a no-op, but this function allows the user to specify a Python source file or a module in the $DETECTRON2_ENV_MODULE environment variable, that performs custom setup work that may be necessary to their computing environment. """ global _ENV_SETUP_DONE if _ENV_SETUP_DONE: return _ENV_SETUP_DONE = True _configure_libraries() custom_module_path = os.environ.get("DETECTRON2_ENV_MODULE") if custom_module_path: setup_custom_environment(custom_module_path) else: # The default setup is a no-op pass def setup_custom_environment(custom_module): """ Load custom environment setup by importing a Python source file or a module, and run the setup function. """ if custom_module.endswith(".py"): module = _import_file("detectron2.utils.env.custom_module", custom_module) else: module = importlib.import_module(custom_module) assert hasattr(module, "setup_environment") and callable(module.setup_environment), ( "Custom environment module defined in {} does not have the " "required callable attribute 'setup_environment'." ).format(custom_module) module.setup_environment() def fixup_module_metadata(module_name, namespace, keys=None): """ Fix the __qualname__ of module members to be their exported api name, so when they are referenced in docs, sphinx can find them. Reference: https://github.com/python-trio/trio/blob/6754c74eacfad9cc5c92d5c24727a2f3b620624e/trio/_util.py#L216-L241 """ if not DOC_BUILDING: return seen_ids = set() def fix_one(qualname, name, obj): # avoid infinite recursion (relevant when using # typing.Generic, for example) if id(obj) in seen_ids: return seen_ids.add(id(obj)) mod = getattr(obj, "__module__", None) if mod is not None and (mod.startswith(module_name) or mod.startswith("fvcore.")): obj.__module__ = module_name # Modules, unlike everything else in Python, put fully-qualitied # names into their __name__ attribute. We check for "." to avoid # rewriting these. if hasattr(obj, "__name__") and "." not in obj.__name__: obj.__name__ = name obj.__qualname__ = qualname if isinstance(obj, type): for attr_name, attr_value in obj.__dict__.items(): fix_one(objname + "." + attr_name, attr_name, attr_value) if keys is None: keys = namespace.keys() for objname in keys: if not objname.startswith("_"): obj = namespace[objname] fix_one(objname, objname, obj) ================================================ FILE: annotator/oneformer/detectron2/utils/events.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import datetime import json import logging import os import time from collections import defaultdict from contextlib import contextmanager from typing import Optional import torch from fvcore.common.history_buffer import HistoryBuffer from annotator.oneformer.detectron2.utils.file_io import PathManager __all__ = [ "get_event_storage", "JSONWriter", "TensorboardXWriter", "CommonMetricPrinter", "EventStorage", ] _CURRENT_STORAGE_STACK = [] def get_event_storage(): """ Returns: The :class:`EventStorage` object that's currently being used. Throws an error if no :class:`EventStorage` is currently enabled. """ assert len( _CURRENT_STORAGE_STACK ), "get_event_storage() has to be called inside a 'with EventStorage(...)' context!" return _CURRENT_STORAGE_STACK[-1] class EventWriter: """ Base class for writers that obtain events from :class:`EventStorage` and process them. """ def write(self): raise NotImplementedError def close(self): pass class JSONWriter(EventWriter): """ Write scalars to a json file. It saves scalars as one json per line (instead of a big json) for easy parsing. Examples parsing such a json file: :: $ cat metrics.json | jq -s '.[0:2]' [ { "data_time": 0.008433341979980469, "iteration": 19, "loss": 1.9228371381759644, "loss_box_reg": 0.050025828182697296, "loss_classifier": 0.5316952466964722, "loss_mask": 0.7236229181289673, "loss_rpn_box": 0.0856662318110466, "loss_rpn_cls": 0.48198649287223816, "lr": 0.007173333333333333, "time": 0.25401854515075684 }, { "data_time": 0.007216215133666992, "iteration": 39, "loss": 1.282649278640747, "loss_box_reg": 0.06222952902317047, "loss_classifier": 0.30682939291000366, "loss_mask": 0.6970193982124329, "loss_rpn_box": 0.038663312792778015, "loss_rpn_cls": 0.1471673548221588, "lr": 0.007706666666666667, "time": 0.2490077018737793 } ] $ cat metrics.json | jq '.loss_mask' 0.7126231789588928 0.689423680305481 0.6776131987571716 ... """ def __init__(self, json_file, window_size=20): """ Args: json_file (str): path to the json file. New data will be appended if the file exists. window_size (int): the window size of median smoothing for the scalars whose `smoothing_hint` are True. """ self._file_handle = PathManager.open(json_file, "a") self._window_size = window_size self._last_write = -1 def write(self): storage = get_event_storage() to_save = defaultdict(dict) for k, (v, iter) in storage.latest_with_smoothing_hint(self._window_size).items(): # keep scalars that have not been written if iter <= self._last_write: continue to_save[iter][k] = v if len(to_save): all_iters = sorted(to_save.keys()) self._last_write = max(all_iters) for itr, scalars_per_iter in to_save.items(): scalars_per_iter["iteration"] = itr self._file_handle.write(json.dumps(scalars_per_iter, sort_keys=True) + "\n") self._file_handle.flush() try: os.fsync(self._file_handle.fileno()) except AttributeError: pass def close(self): self._file_handle.close() class TensorboardXWriter(EventWriter): """ Write all scalars to a tensorboard file. """ def __init__(self, log_dir: str, window_size: int = 20, **kwargs): """ Args: log_dir (str): the directory to save the output events window_size (int): the scalars will be median-smoothed by this window size kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)` """ self._window_size = window_size from torch.utils.tensorboard import SummaryWriter self._writer = SummaryWriter(log_dir, **kwargs) self._last_write = -1 def write(self): storage = get_event_storage() new_last_write = self._last_write for k, (v, iter) in storage.latest_with_smoothing_hint(self._window_size).items(): if iter > self._last_write: self._writer.add_scalar(k, v, iter) new_last_write = max(new_last_write, iter) self._last_write = new_last_write # storage.put_{image,histogram} is only meant to be used by # tensorboard writer. So we access its internal fields directly from here. if len(storage._vis_data) >= 1: for img_name, img, step_num in storage._vis_data: self._writer.add_image(img_name, img, step_num) # Storage stores all image data and rely on this writer to clear them. # As a result it assumes only one writer will use its image data. # An alternative design is to let storage store limited recent # data (e.g. only the most recent image) that all writers can access. # In that case a writer may not see all image data if its period is long. storage.clear_images() if len(storage._histograms) >= 1: for params in storage._histograms: self._writer.add_histogram_raw(**params) storage.clear_histograms() def close(self): if hasattr(self, "_writer"): # doesn't exist when the code fails at import self._writer.close() class CommonMetricPrinter(EventWriter): """ Print **common** metrics to the terminal, including iteration time, ETA, memory, all losses, and the learning rate. It also applies smoothing using a window of 20 elements. It's meant to print common metrics in common ways. To print something in more customized ways, please implement a similar printer by yourself. """ def __init__(self, max_iter: Optional[int] = None, window_size: int = 20): """ Args: max_iter: the maximum number of iterations to train. Used to compute ETA. If not given, ETA will not be printed. window_size (int): the losses will be median-smoothed by this window size """ self.logger = logging.getLogger(__name__) self._max_iter = max_iter self._window_size = window_size self._last_write = None # (step, time) of last call to write(). Used to compute ETA def _get_eta(self, storage) -> Optional[str]: if self._max_iter is None: return "" iteration = storage.iter try: eta_seconds = storage.history("time").median(1000) * (self._max_iter - iteration - 1) storage.put_scalar("eta_seconds", eta_seconds, smoothing_hint=False) return str(datetime.timedelta(seconds=int(eta_seconds))) except KeyError: # estimate eta on our own - more noisy eta_string = None if self._last_write is not None: estimate_iter_time = (time.perf_counter() - self._last_write[1]) / ( iteration - self._last_write[0] ) eta_seconds = estimate_iter_time * (self._max_iter - iteration - 1) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) self._last_write = (iteration, time.perf_counter()) return eta_string def write(self): storage = get_event_storage() iteration = storage.iter if iteration == self._max_iter: # This hook only reports training progress (loss, ETA, etc) but not other data, # therefore do not write anything after training succeeds, even if this method # is called. return try: avg_data_time = storage.history("data_time").avg( storage.count_samples("data_time", self._window_size) ) last_data_time = storage.history("data_time").latest() except KeyError: # they may not exist in the first few iterations (due to warmup) # or when SimpleTrainer is not used avg_data_time = None last_data_time = None try: avg_iter_time = storage.history("time").global_avg() last_iter_time = storage.history("time").latest() except KeyError: avg_iter_time = None last_iter_time = None try: lr = "{:.5g}".format(storage.history("lr").latest()) except KeyError: lr = "N/A" eta_string = self._get_eta(storage) if torch.cuda.is_available(): max_mem_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0 else: max_mem_mb = None # NOTE: max_mem is parsed by grep in "dev/parse_results.sh" self.logger.info( str.format( " {eta}iter: {iter} {losses} {non_losses} {avg_time}{last_time}" + "{avg_data_time}{last_data_time} lr: {lr} {memory}", eta=f"eta: {eta_string} " if eta_string else "", iter=iteration, losses=" ".join( [ "{}: {:.4g}".format( k, v.median(storage.count_samples(k, self._window_size)) ) for k, v in storage.histories().items() if "loss" in k ] ), non_losses=" ".join( [ "{}: {:.4g}".format( k, v.median(storage.count_samples(k, self._window_size)) ) for k, v in storage.histories().items() if "[metric]" in k ] ), avg_time="time: {:.4f} ".format(avg_iter_time) if avg_iter_time is not None else "", last_time="last_time: {:.4f} ".format(last_iter_time) if last_iter_time is not None else "", avg_data_time="data_time: {:.4f} ".format(avg_data_time) if avg_data_time is not None else "", last_data_time="last_data_time: {:.4f} ".format(last_data_time) if last_data_time is not None else "", lr=lr, memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else "", ) ) class EventStorage: """ The user-facing class that provides metric storage functionalities. In the future we may add support for storing / logging other types of data if needed. """ def __init__(self, start_iter=0): """ Args: start_iter (int): the iteration number to start with """ self._history = defaultdict(HistoryBuffer) self._smoothing_hints = {} self._latest_scalars = {} self._iter = start_iter self._current_prefix = "" self._vis_data = [] self._histograms = [] def put_image(self, img_name, img_tensor): """ Add an `img_tensor` associated with `img_name`, to be shown on tensorboard. Args: img_name (str): The name of the image to put into tensorboard. img_tensor (torch.Tensor or numpy.array): An `uint8` or `float` Tensor of shape `[channel, height, width]` where `channel` is 3. The image format should be RGB. The elements in img_tensor can either have values in [0, 1] (float32) or [0, 255] (uint8). The `img_tensor` will be visualized in tensorboard. """ self._vis_data.append((img_name, img_tensor, self._iter)) def put_scalar(self, name, value, smoothing_hint=True): """ Add a scalar `value` to the `HistoryBuffer` associated with `name`. Args: smoothing_hint (bool): a 'hint' on whether this scalar is noisy and should be smoothed when logged. The hint will be accessible through :meth:`EventStorage.smoothing_hints`. A writer may ignore the hint and apply custom smoothing rule. It defaults to True because most scalars we save need to be smoothed to provide any useful signal. """ name = self._current_prefix + name history = self._history[name] value = float(value) history.update(value, self._iter) self._latest_scalars[name] = (value, self._iter) existing_hint = self._smoothing_hints.get(name) if existing_hint is not None: assert ( existing_hint == smoothing_hint ), "Scalar {} was put with a different smoothing_hint!".format(name) else: self._smoothing_hints[name] = smoothing_hint def put_scalars(self, *, smoothing_hint=True, **kwargs): """ Put multiple scalars from keyword arguments. Examples: storage.put_scalars(loss=my_loss, accuracy=my_accuracy, smoothing_hint=True) """ for k, v in kwargs.items(): self.put_scalar(k, v, smoothing_hint=smoothing_hint) def put_histogram(self, hist_name, hist_tensor, bins=1000): """ Create a histogram from a tensor. Args: hist_name (str): The name of the histogram to put into tensorboard. hist_tensor (torch.Tensor): A Tensor of arbitrary shape to be converted into a histogram. bins (int): Number of histogram bins. """ ht_min, ht_max = hist_tensor.min().item(), hist_tensor.max().item() # Create a histogram with PyTorch hist_counts = torch.histc(hist_tensor, bins=bins) hist_edges = torch.linspace(start=ht_min, end=ht_max, steps=bins + 1, dtype=torch.float32) # Parameter for the add_histogram_raw function of SummaryWriter hist_params = dict( tag=hist_name, min=ht_min, max=ht_max, num=len(hist_tensor), sum=float(hist_tensor.sum()), sum_squares=float(torch.sum(hist_tensor**2)), bucket_limits=hist_edges[1:].tolist(), bucket_counts=hist_counts.tolist(), global_step=self._iter, ) self._histograms.append(hist_params) def history(self, name): """ Returns: HistoryBuffer: the scalar history for name """ ret = self._history.get(name, None) if ret is None: raise KeyError("No history metric available for {}!".format(name)) return ret def histories(self): """ Returns: dict[name -> HistoryBuffer]: the HistoryBuffer for all scalars """ return self._history def latest(self): """ Returns: dict[str -> (float, int)]: mapping from the name of each scalar to the most recent value and the iteration number its added. """ return self._latest_scalars def latest_with_smoothing_hint(self, window_size=20): """ Similar to :meth:`latest`, but the returned values are either the un-smoothed original latest value, or a median of the given window_size, depend on whether the smoothing_hint is True. This provides a default behavior that other writers can use. Note: All scalars saved in the past `window_size` iterations are used for smoothing. This is different from the `window_size` definition in HistoryBuffer. Use :meth:`get_history_window_size` to get the `window_size` used in HistoryBuffer. """ result = {} for k, (v, itr) in self._latest_scalars.items(): result[k] = ( self._history[k].median(self.count_samples(k, window_size)) if self._smoothing_hints[k] else v, itr, ) return result def count_samples(self, name, window_size=20): """ Return the number of samples logged in the past `window_size` iterations. """ samples = 0 data = self._history[name].values() for _, iter_ in reversed(data): if iter_ > data[-1][1] - window_size: samples += 1 else: break return samples def smoothing_hints(self): """ Returns: dict[name -> bool]: the user-provided hint on whether the scalar is noisy and needs smoothing. """ return self._smoothing_hints def step(self): """ User should either: (1) Call this function to increment storage.iter when needed. Or (2) Set `storage.iter` to the correct iteration number before each iteration. The storage will then be able to associate the new data with an iteration number. """ self._iter += 1 @property def iter(self): """ Returns: int: The current iteration number. When used together with a trainer, this is ensured to be the same as trainer.iter. """ return self._iter @iter.setter def iter(self, val): self._iter = int(val) @property def iteration(self): # for backward compatibility return self._iter def __enter__(self): _CURRENT_STORAGE_STACK.append(self) return self def __exit__(self, exc_type, exc_val, exc_tb): assert _CURRENT_STORAGE_STACK[-1] == self _CURRENT_STORAGE_STACK.pop() @contextmanager def name_scope(self, name): """ Yields: A context within which all the events added to this storage will be prefixed by the name scope. """ old_prefix = self._current_prefix self._current_prefix = name.rstrip("/") + "/" yield self._current_prefix = old_prefix def clear_images(self): """ Delete all the stored images for visualization. This should be called after images are written to tensorboard. """ self._vis_data = [] def clear_histograms(self): """ Delete all the stored histograms for visualization. This should be called after histograms are written to tensorboard. """ self._histograms = [] ================================================ FILE: annotator/oneformer/detectron2/utils/file_io.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from iopath.common.file_io import HTTPURLHandler, OneDrivePathHandler, PathHandler from iopath.common.file_io import PathManager as PathManagerBase __all__ = ["PathManager", "PathHandler"] PathManager = PathManagerBase() """ This is a detectron2 project-specific PathManager. We try to stay away from global PathManager in fvcore as it introduces potential conflicts among other libraries. """ class Detectron2Handler(PathHandler): """ Resolve anything that's hosted under detectron2's namespace. """ PREFIX = "detectron2://" S3_DETECTRON2_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" def _get_supported_prefixes(self): return [self.PREFIX] def _get_local_path(self, path, **kwargs): name = path[len(self.PREFIX) :] return PathManager.get_local_path(self.S3_DETECTRON2_PREFIX + name, **kwargs) def _open(self, path, mode="r", **kwargs): return PathManager.open( self.S3_DETECTRON2_PREFIX + path[len(self.PREFIX) :], mode, **kwargs ) PathManager.register_handler(HTTPURLHandler()) PathManager.register_handler(OneDrivePathHandler()) PathManager.register_handler(Detectron2Handler()) ================================================ FILE: annotator/oneformer/detectron2/utils/logger.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import atexit import functools import logging import os import sys import time from collections import Counter import torch from tabulate import tabulate from termcolor import colored from annotator.oneformer.detectron2.utils.file_io import PathManager __all__ = ["setup_logger", "log_first_n", "log_every_n", "log_every_n_seconds"] class _ColorfulFormatter(logging.Formatter): def __init__(self, *args, **kwargs): self._root_name = kwargs.pop("root_name") + "." self._abbrev_name = kwargs.pop("abbrev_name", "") if len(self._abbrev_name): self._abbrev_name = self._abbrev_name + "." super(_ColorfulFormatter, self).__init__(*args, **kwargs) def formatMessage(self, record): record.name = record.name.replace(self._root_name, self._abbrev_name) log = super(_ColorfulFormatter, self).formatMessage(record) if record.levelno == logging.WARNING: prefix = colored("WARNING", "red", attrs=["blink"]) elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: prefix = colored("ERROR", "red", attrs=["blink", "underline"]) else: return log return prefix + " " + log @functools.lru_cache() # so that calling setup_logger multiple times won't add many handlers def setup_logger( output=None, distributed_rank=0, *, color=True, name="detectron2", abbrev_name=None ): """ Initialize the detectron2 logger and set its verbosity level to "DEBUG". Args: output (str): a file name or a directory to save log. If None, will not save log file. If ends with ".txt" or ".log", assumed to be a file name. Otherwise, logs will be saved to `output/log.txt`. name (str): the root module name of this logger abbrev_name (str): an abbreviation of the module, to avoid long names in logs. Set to "" to not log the root module in logs. By default, will abbreviate "detectron2" to "d2" and leave other modules unchanged. Returns: logging.Logger: a logger """ logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) logger.propagate = False if abbrev_name is None: abbrev_name = "d2" if name == "detectron2" else name plain_formatter = logging.Formatter( "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S" ) # stdout logging: master only if distributed_rank == 0: ch = logging.StreamHandler(stream=sys.stdout) ch.setLevel(logging.DEBUG) if color: formatter = _ColorfulFormatter( colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s", datefmt="%m/%d %H:%M:%S", root_name=name, abbrev_name=str(abbrev_name), ) else: formatter = plain_formatter ch.setFormatter(formatter) logger.addHandler(ch) # file logging: all workers if output is not None: if output.endswith(".txt") or output.endswith(".log"): filename = output else: filename = os.path.join(output, "log.txt") if distributed_rank > 0: filename = filename + ".rank{}".format(distributed_rank) PathManager.mkdirs(os.path.dirname(filename)) fh = logging.StreamHandler(_cached_log_stream(filename)) fh.setLevel(logging.DEBUG) fh.setFormatter(plain_formatter) logger.addHandler(fh) return logger # cache the opened file object, so that different calls to `setup_logger` # with the same file name can safely write to the same file. @functools.lru_cache(maxsize=None) def _cached_log_stream(filename): # use 1K buffer if writing to cloud storage io = PathManager.open(filename, "a", buffering=1024 if "://" in filename else -1) atexit.register(io.close) return io """ Below are some other convenient logging methods. They are mainly adopted from https://github.com/abseil/abseil-py/blob/master/absl/logging/__init__.py """ def _find_caller(): """ Returns: str: module name of the caller tuple: a hashable key to be used to identify different callers """ frame = sys._getframe(2) while frame: code = frame.f_code if os.path.join("utils", "logger.") not in code.co_filename: mod_name = frame.f_globals["__name__"] if mod_name == "__main__": mod_name = "detectron2" return mod_name, (code.co_filename, frame.f_lineno, code.co_name) frame = frame.f_back _LOG_COUNTER = Counter() _LOG_TIMER = {} def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): """ Log only for the first n times. Args: lvl (int): the logging level msg (str): n (int): name (str): name of the logger to use. Will use the caller's module by default. key (str or tuple[str]): the string(s) can be one of "caller" or "message", which defines how to identify duplicated logs. For example, if called with `n=1, key="caller"`, this function will only log the first call from the same caller, regardless of the message content. If called with `n=1, key="message"`, this function will log the same content only once, even if they are called from different places. If called with `n=1, key=("caller", "message")`, this function will not log only if the same caller has logged the same message before. """ if isinstance(key, str): key = (key,) assert len(key) > 0 caller_module, caller_key = _find_caller() hash_key = () if "caller" in key: hash_key = hash_key + caller_key if "message" in key: hash_key = hash_key + (msg,) _LOG_COUNTER[hash_key] += 1 if _LOG_COUNTER[hash_key] <= n: logging.getLogger(name or caller_module).log(lvl, msg) def log_every_n(lvl, msg, n=1, *, name=None): """ Log once per n times. Args: lvl (int): the logging level msg (str): n (int): name (str): name of the logger to use. Will use the caller's module by default. """ caller_module, key = _find_caller() _LOG_COUNTER[key] += 1 if n == 1 or _LOG_COUNTER[key] % n == 1: logging.getLogger(name or caller_module).log(lvl, msg) def log_every_n_seconds(lvl, msg, n=1, *, name=None): """ Log no more than once per n seconds. Args: lvl (int): the logging level msg (str): n (int): name (str): name of the logger to use. Will use the caller's module by default. """ caller_module, key = _find_caller() last_logged = _LOG_TIMER.get(key, None) current_time = time.time() if last_logged is None or current_time - last_logged >= n: logging.getLogger(name or caller_module).log(lvl, msg) _LOG_TIMER[key] = current_time def create_small_table(small_dict): """ Create a small table using the keys of small_dict as headers. This is only suitable for small dictionaries. Args: small_dict (dict): a result dictionary of only a few items. Returns: str: the table as a string. """ keys, values = tuple(zip(*small_dict.items())) table = tabulate( [values], headers=keys, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center", ) return table def _log_api_usage(identifier: str): """ Internal function used to log the usage of different detectron2 components inside facebook's infra. """ torch._C._log_api_usage_once("detectron2." + identifier) ================================================ FILE: annotator/oneformer/detectron2/utils/memory.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging from contextlib import contextmanager from functools import wraps import torch __all__ = ["retry_if_cuda_oom"] @contextmanager def _ignore_torch_cuda_oom(): """ A context which ignores CUDA OOM exception from pytorch. """ try: yield except RuntimeError as e: # NOTE: the string may change? if "CUDA out of memory. " in str(e): pass else: raise def retry_if_cuda_oom(func): """ Makes a function retry itself after encountering pytorch's CUDA OOM error. It will first retry after calling `torch.cuda.empty_cache()`. If that still fails, it will then retry by trying to convert inputs to CPUs. In this case, it expects the function to dispatch to CPU implementation. The return values may become CPU tensors as well and it's user's responsibility to convert it back to CUDA tensor if needed. Args: func: a stateless callable that takes tensor-like objects as arguments Returns: a callable which retries `func` if OOM is encountered. Examples: :: output = retry_if_cuda_oom(some_torch_function)(input1, input2) # output may be on CPU even if inputs are on GPU Note: 1. When converting inputs to CPU, it will only look at each argument and check if it has `.device` and `.to` for conversion. Nested structures of tensors are not supported. 2. Since the function might be called more than once, it has to be stateless. """ def maybe_to_cpu(x): try: like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") except AttributeError: like_gpu_tensor = False if like_gpu_tensor: return x.to(device="cpu") else: return x @wraps(func) def wrapped(*args, **kwargs): with _ignore_torch_cuda_oom(): return func(*args, **kwargs) # Clear cache and retry torch.cuda.empty_cache() with _ignore_torch_cuda_oom(): return func(*args, **kwargs) # Try on CPU. This slows down the code significantly, therefore print a notice. logger = logging.getLogger(__name__) logger.info("Attempting to copy inputs of {} to CPU due to CUDA OOM".format(str(func))) new_args = (maybe_to_cpu(x) for x in args) new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} return func(*new_args, **new_kwargs) return wrapped ================================================ FILE: annotator/oneformer/detectron2/utils/registry.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from typing import Any import pydoc from fvcore.common.registry import Registry # for backward compatibility. """ ``Registry`` and `locate` provide ways to map a string (typically found in config files) to callable objects. """ __all__ = ["Registry", "locate"] def _convert_target_to_string(t: Any) -> str: """ Inverse of ``locate()``. Args: t: any object with ``__module__`` and ``__qualname__`` """ module, qualname = t.__module__, t.__qualname__ # Compress the path to this object, e.g. ``module.submodule._impl.class`` # may become ``module.submodule.class``, if the later also resolves to the same # object. This simplifies the string, and also is less affected by moving the # class implementation. module_parts = module.split(".") for k in range(1, len(module_parts)): prefix = ".".join(module_parts[:k]) candidate = f"{prefix}.{qualname}" try: if locate(candidate) is t: return candidate except ImportError: pass return f"{module}.{qualname}" def locate(name: str) -> Any: """ Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``, such as "module.submodule.class_name". Raise Exception if it cannot be found. """ obj = pydoc.locate(name) # Some cases (e.g. torch.optim.sgd.SGD) not handled correctly # by pydoc.locate. Try a private function from hydra. if obj is None: try: # from hydra.utils import get_method - will print many errors from hydra.utils import _locate except ImportError as e: raise ImportError(f"Cannot dynamically locate object {name}!") from e else: obj = _locate(name) # it raises if fails return obj ================================================ FILE: annotator/oneformer/detectron2/utils/serialize.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # import cloudpickle class PicklableWrapper(object): """ Wrap an object to make it more picklable, note that it uses heavy weight serialization libraries that are slower than pickle. It's best to use it only on closures (which are usually not picklable). This is a simplified version of https://github.com/joblib/joblib/blob/master/joblib/externals/loky/cloudpickle_wrapper.py """ def __init__(self, obj): while isinstance(obj, PicklableWrapper): # Wrapping an object twice is no-op obj = obj._obj self._obj = obj # def __reduce__(self): # s = cloudpickle.dumps(self._obj) # return cloudpickle.loads, (s,) def __call__(self, *args, **kwargs): return self._obj(*args, **kwargs) def __getattr__(self, attr): # Ensure that the wrapped object can be used seamlessly as the previous object. if attr not in ["_obj"]: return getattr(self._obj, attr) return getattr(self, attr) ================================================ FILE: annotator/oneformer/detectron2/utils/testing.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import io import numpy as np import os import re import tempfile import unittest from typing import Callable import torch import torch.onnx.symbolic_helper as sym_help from packaging import version from torch._C import ListType from torch.onnx import register_custom_op_symbolic from annotator.oneformer.detectron2 import model_zoo from annotator.oneformer.detectron2.config import CfgNode, LazyConfig, instantiate from annotator.oneformer.detectron2.data import DatasetCatalog from annotator.oneformer.detectron2.data.detection_utils import read_image from annotator.oneformer.detectron2.modeling import build_model from annotator.oneformer.detectron2.structures import Boxes, Instances, ROIMasks from annotator.oneformer.detectron2.utils.file_io import PathManager """ Internal utilities for tests. Don't use except for writing tests. """ def get_model_no_weights(config_path): """ Like model_zoo.get, but do not load any weights (even pretrained) """ cfg = model_zoo.get_config(config_path) if isinstance(cfg, CfgNode): if not torch.cuda.is_available(): cfg.MODEL.DEVICE = "cpu" return build_model(cfg) else: return instantiate(cfg.model) def random_boxes(num_boxes, max_coord=100, device="cpu"): """ Create a random Nx4 boxes tensor, with coordinates < max_coord. """ boxes = torch.rand(num_boxes, 4, device=device) * (max_coord * 0.5) boxes.clamp_(min=1.0) # tiny boxes cause numerical instability in box regression # Note: the implementation of this function in torchvision is: # boxes[:, 2:] += torch.rand(N, 2) * 100 # but it does not guarantee non-negative widths/heights constraints: # boxes[:, 2] >= boxes[:, 0] and boxes[:, 3] >= boxes[:, 1]: boxes[:, 2:] += boxes[:, :2] return boxes def get_sample_coco_image(tensor=True): """ Args: tensor (bool): if True, returns 3xHxW tensor. else, returns a HxWx3 numpy array. Returns: an image, in BGR color. """ try: file_name = DatasetCatalog.get("coco_2017_val_100")[0]["file_name"] if not PathManager.exists(file_name): raise FileNotFoundError() except IOError: # for public CI to run file_name = PathManager.get_local_path( "http://images.cocodataset.org/train2017/000000000009.jpg" ) ret = read_image(file_name, format="BGR") if tensor: ret = torch.from_numpy(np.ascontiguousarray(ret.transpose(2, 0, 1))) return ret def convert_scripted_instances(instances): """ Convert a scripted Instances object to a regular :class:`Instances` object """ assert hasattr( instances, "image_size" ), f"Expect an Instances object, but got {type(instances)}!" ret = Instances(instances.image_size) for name in instances._field_names: val = getattr(instances, "_" + name, None) if val is not None: ret.set(name, val) return ret def assert_instances_allclose(input, other, *, rtol=1e-5, msg="", size_as_tensor=False): """ Args: input, other (Instances): size_as_tensor: compare image_size of the Instances as tensors (instead of tuples). Useful for comparing outputs of tracing. """ if not isinstance(input, Instances): input = convert_scripted_instances(input) if not isinstance(other, Instances): other = convert_scripted_instances(other) if not msg: msg = "Two Instances are different! " else: msg = msg.rstrip() + " " size_error_msg = msg + f"image_size is {input.image_size} vs. {other.image_size}!" if size_as_tensor: assert torch.equal( torch.tensor(input.image_size), torch.tensor(other.image_size) ), size_error_msg else: assert input.image_size == other.image_size, size_error_msg fields = sorted(input.get_fields().keys()) fields_other = sorted(other.get_fields().keys()) assert fields == fields_other, msg + f"Fields are {fields} vs {fields_other}!" for f in fields: val1, val2 = input.get(f), other.get(f) if isinstance(val1, (Boxes, ROIMasks)): # boxes in the range of O(100) and can have a larger tolerance assert torch.allclose(val1.tensor, val2.tensor, atol=100 * rtol), ( msg + f"Field {f} differs too much!" ) elif isinstance(val1, torch.Tensor): if val1.dtype.is_floating_point: mag = torch.abs(val1).max().cpu().item() assert torch.allclose(val1, val2, atol=mag * rtol), ( msg + f"Field {f} differs too much!" ) else: assert torch.equal(val1, val2), msg + f"Field {f} is different!" else: raise ValueError(f"Don't know how to compare type {type(val1)}") def reload_script_model(module): """ Save a jit module and load it back. Similar to the `getExportImportCopy` function in torch/testing/ """ buffer = io.BytesIO() torch.jit.save(module, buffer) buffer.seek(0) return torch.jit.load(buffer) def reload_lazy_config(cfg): """ Save an object by LazyConfig.save and load it back. This is used to test that a config still works the same after serialization/deserialization. """ with tempfile.TemporaryDirectory(prefix="detectron2") as d: fname = os.path.join(d, "d2_cfg_test.yaml") LazyConfig.save(cfg, fname) return LazyConfig.load(fname) def min_torch_version(min_version: str) -> bool: """ Returns True when torch's version is at least `min_version`. """ try: import torch except ImportError: return False installed_version = version.parse(torch.__version__.split("+")[0]) min_version = version.parse(min_version) return installed_version >= min_version def has_dynamic_axes(onnx_model): """ Return True when all ONNX input/output have only dynamic axes for all ranks """ return all( not dim.dim_param.isnumeric() for inp in onnx_model.graph.input for dim in inp.type.tensor_type.shape.dim ) and all( not dim.dim_param.isnumeric() for out in onnx_model.graph.output for dim in out.type.tensor_type.shape.dim ) def register_custom_op_onnx_export( opname: str, symbolic_fn: Callable, opset_version: int, min_version: str ) -> None: """ Register `symbolic_fn` as PyTorch's symbolic `opname`-`opset_version` for ONNX export. The registration is performed only when current PyTorch's version is < `min_version.` IMPORTANT: symbolic must be manually unregistered after the caller function returns """ if min_torch_version(min_version): return register_custom_op_symbolic(opname, symbolic_fn, opset_version) print(f"_register_custom_op_onnx_export({opname}, {opset_version}) succeeded.") def unregister_custom_op_onnx_export(opname: str, opset_version: int, min_version: str) -> None: """ Unregister PyTorch's symbolic `opname`-`opset_version` for ONNX export. The un-registration is performed only when PyTorch's version is < `min_version` IMPORTANT: The symbolic must have been manually registered by the caller, otherwise the incorrect symbolic may be unregistered instead. """ # TODO: _unregister_custom_op_symbolic is introduced PyTorch>=1.10 # Remove after PyTorch 1.10+ is used by ALL detectron2's CI try: from torch.onnx import unregister_custom_op_symbolic as _unregister_custom_op_symbolic except ImportError: def _unregister_custom_op_symbolic(symbolic_name, opset_version): import torch.onnx.symbolic_registry as sym_registry from torch.onnx.symbolic_helper import _onnx_main_opset, _onnx_stable_opsets def _get_ns_op_name_from_custom_op(symbolic_name): try: from torch.onnx.utils import get_ns_op_name_from_custom_op ns, op_name = get_ns_op_name_from_custom_op(symbolic_name) except ImportError as import_error: if not bool( re.match(r"^[a-zA-Z0-9-_]*::[a-zA-Z-_]+[a-zA-Z0-9-_]*$", symbolic_name) ): raise ValueError( f"Invalid symbolic name {symbolic_name}. Must be `domain::name`" ) from import_error ns, op_name = symbolic_name.split("::") if ns == "onnx": raise ValueError(f"{ns} domain cannot be modified.") from import_error if ns == "aten": ns = "" return ns, op_name def _unregister_op(opname: str, domain: str, version: int): try: sym_registry.unregister_op(op_name, ns, ver) except AttributeError as attribute_error: if sym_registry.is_registered_op(opname, domain, version): del sym_registry._registry[(domain, version)][opname] if not sym_registry._registry[(domain, version)]: del sym_registry._registry[(domain, version)] else: raise RuntimeError( f"The opname {opname} is not registered." ) from attribute_error ns, op_name = _get_ns_op_name_from_custom_op(symbolic_name) for ver in _onnx_stable_opsets + [_onnx_main_opset]: if ver >= opset_version: _unregister_op(op_name, ns, ver) if min_torch_version(min_version): return _unregister_custom_op_symbolic(opname, opset_version) print(f"_unregister_custom_op_onnx_export({opname}, {opset_version}) succeeded.") skipIfOnCPUCI = unittest.skipIf( os.environ.get("CI") and not torch.cuda.is_available(), "The test is too slow on CPUs and will be executed on CircleCI's GPU jobs.", ) def skipIfUnsupportedMinOpsetVersion(min_opset_version, current_opset_version=None): """ Skips tests for ONNX Opset versions older than min_opset_version. """ def skip_dec(func): def wrapper(self): try: opset_version = self.opset_version except AttributeError: opset_version = current_opset_version if opset_version < min_opset_version: raise unittest.SkipTest( f"Unsupported opset_version {opset_version}" f", required is {min_opset_version}" ) return func(self) return wrapper return skip_dec def skipIfUnsupportedMinTorchVersion(min_version): """ Skips tests for PyTorch versions older than min_version. """ reason = f"module 'torch' has __version__ {torch.__version__}" f", required is: {min_version}" return unittest.skipIf(not min_torch_version(min_version), reason) # TODO: Remove after PyTorch 1.11.1+ is used by detectron2's CI def _pytorch1111_symbolic_opset9_to(g, self, *args): """aten::to() symbolic that must be used for testing with PyTorch < 1.11.1.""" def is_aten_to_device_only(args): if len(args) == 4: # aten::to(Tensor, Device, bool, bool, memory_format) return ( args[0].node().kind() == "prim::device" or args[0].type().isSubtypeOf(ListType.ofInts()) or ( sym_help._is_value(args[0]) and args[0].node().kind() == "onnx::Constant" and isinstance(args[0].node()["value"], str) ) ) elif len(args) == 5: # aten::to(Tensor, Device, ScalarType, bool, bool, memory_format) # When dtype is None, this is a aten::to(device) call dtype = sym_help._get_const(args[1], "i", "dtype") return dtype is None elif len(args) in (6, 7): # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, memory_format) # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, bool, memory_format) # When dtype is None, this is a aten::to(device) call dtype = sym_help._get_const(args[0], "i", "dtype") return dtype is None return False # ONNX doesn't have a concept of a device, so we ignore device-only casts if is_aten_to_device_only(args): return self if len(args) == 4: # TestONNXRuntime::test_ones_bool shows args[0] of aten::to can be onnx::Constant[Tensor] # In this case, the constant value is a tensor not int, # so sym_help._maybe_get_const(args[0], 'i') would not work. dtype = args[0] if sym_help._is_value(args[0]) and args[0].node().kind() == "onnx::Constant": tval = args[0].node()["value"] if isinstance(tval, torch.Tensor): if len(tval.shape) == 0: tval = tval.item() dtype = int(tval) else: dtype = tval if sym_help._is_value(dtype) or isinstance(dtype, torch.Tensor): # aten::to(Tensor, Tensor, bool, bool, memory_format) dtype = args[0].type().scalarType() return g.op("Cast", self, to_i=sym_help.cast_pytorch_to_onnx[dtype]) else: # aten::to(Tensor, ScalarType, bool, bool, memory_format) # memory_format is ignored return g.op("Cast", self, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 5: # aten::to(Tensor, Device, ScalarType, bool, bool, memory_format) dtype = sym_help._get_const(args[1], "i", "dtype") # memory_format is ignored return g.op("Cast", self, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 6: # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, memory_format) dtype = sym_help._get_const(args[0], "i", "dtype") # Layout, device and memory_format are ignored return g.op("Cast", self, to_i=sym_help.scalar_type_to_onnx[dtype]) elif len(args) == 7: # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, bool, memory_format) dtype = sym_help._get_const(args[0], "i", "dtype") # Layout, device and memory_format are ignored return g.op("Cast", self, to_i=sym_help.scalar_type_to_onnx[dtype]) else: return sym_help._onnx_unsupported("Unknown aten::to signature") # TODO: Remove after PyTorch 1.11.1+ is used by detectron2's CI def _pytorch1111_symbolic_opset9_repeat_interleave(g, self, repeats, dim=None, output_size=None): # from torch.onnx.symbolic_helper import ScalarType from torch.onnx.symbolic_opset9 import expand, unsqueeze input = self # if dim is None flatten # By default, use the flattened input array, and return a flat output array if sym_help._is_none(dim): input = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1]))) dim = 0 else: dim = sym_help._maybe_get_scalar(dim) repeats_dim = sym_help._get_tensor_rank(repeats) repeats_sizes = sym_help._get_tensor_sizes(repeats) input_sizes = sym_help._get_tensor_sizes(input) if repeats_dim is None: raise RuntimeError( "Unsupported: ONNX export of repeat_interleave for unknown " "repeats rank." ) if repeats_sizes is None: raise RuntimeError( "Unsupported: ONNX export of repeat_interleave for unknown " "repeats size." ) if input_sizes is None: raise RuntimeError( "Unsupported: ONNX export of repeat_interleave for unknown " "input size." ) input_sizes_temp = input_sizes.copy() for idx, input_size in enumerate(input_sizes): if input_size is None: input_sizes[idx], input_sizes_temp[idx] = 0, -1 # Cases where repeats is an int or single value tensor if repeats_dim == 0 or (repeats_dim == 1 and repeats_sizes[0] == 1): if not sym_help._is_tensor(repeats): repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) if input_sizes[dim] == 0: return sym_help._onnx_opset_unsupported_detailed( "repeat_interleave", 9, 13, "Unsupported along dimension with unknown input size", ) else: reps = input_sizes[dim] repeats = expand(g, repeats, g.op("Constant", value_t=torch.tensor([reps])), None) # Cases where repeats is a 1 dim Tensor elif repeats_dim == 1: if input_sizes[dim] == 0: return sym_help._onnx_opset_unsupported_detailed( "repeat_interleave", 9, 13, "Unsupported along dimension with unknown input size", ) if repeats_sizes[0] is None: return sym_help._onnx_opset_unsupported_detailed( "repeat_interleave", 9, 13, "Unsupported for cases with dynamic repeats" ) assert ( repeats_sizes[0] == input_sizes[dim] ), "repeats must have the same size as input along dim" reps = repeats_sizes[0] else: raise RuntimeError("repeats must be 0-dim or 1-dim tensor") final_splits = list() r_splits = sym_help._repeat_interleave_split_helper(g, repeats, reps, 0) if isinstance(r_splits, torch._C.Value): r_splits = [r_splits] i_splits = sym_help._repeat_interleave_split_helper(g, input, reps, dim) if isinstance(i_splits, torch._C.Value): i_splits = [i_splits] input_sizes[dim], input_sizes_temp[dim] = -1, 1 for idx, r_split in enumerate(r_splits): i_split = unsqueeze(g, i_splits[idx], dim + 1) r_concat = [ g.op("Constant", value_t=torch.LongTensor(input_sizes_temp[: dim + 1])), r_split, g.op("Constant", value_t=torch.LongTensor(input_sizes_temp[dim + 1 :])), ] r_concat = g.op("Concat", *r_concat, axis_i=0) i_split = expand(g, i_split, r_concat, None) i_split = sym_help._reshape_helper( g, i_split, g.op("Constant", value_t=torch.LongTensor(input_sizes)), allowzero=0, ) final_splits.append(i_split) return g.op("Concat", *final_splits, axis_i=dim) ================================================ FILE: annotator/oneformer/detectron2/utils/tracing.py ================================================ import inspect import torch from annotator.oneformer.detectron2.utils.env import TORCH_VERSION try: from torch.fx._symbolic_trace import is_fx_tracing as is_fx_tracing_current tracing_current_exists = True except ImportError: tracing_current_exists = False try: from torch.fx._symbolic_trace import _orig_module_call tracing_legacy_exists = True except ImportError: tracing_legacy_exists = False @torch.jit.ignore def is_fx_tracing_legacy() -> bool: """ Returns a bool indicating whether torch.fx is currently symbolically tracing a module. Can be useful for gating module logic that is incompatible with symbolic tracing. """ return torch.nn.Module.__call__ is not _orig_module_call @torch.jit.ignore def is_fx_tracing() -> bool: """Returns whether execution is currently in Torch FX tracing mode""" if TORCH_VERSION >= (1, 10) and tracing_current_exists: return is_fx_tracing_current() elif tracing_legacy_exists: return is_fx_tracing_legacy() else: # Can't find either current or legacy tracing indication code. # Enabling this assert_fx_safe() call regardless of tracing status. return False @torch.jit.ignore def assert_fx_safe(condition: bool, message: str) -> torch.Tensor: """An FX-tracing safe version of assert. Avoids erroneous type assertion triggering when types are masked inside an fx.proxy.Proxy object during tracing. Args: condition - either a boolean expression or a string representing the condition to test. If this assert triggers an exception when tracing due to dynamic control flow, try encasing the expression in quotation marks and supplying it as a string.""" # Must return a concrete tensor for compatibility with PyTorch <=1.8. # If <=1.8 compatibility is not needed, return type can be converted to None if not is_fx_tracing(): try: if isinstance(condition, str): caller_frame = inspect.currentframe().f_back torch._assert( eval(condition, caller_frame.f_globals, caller_frame.f_locals), message ) return torch.ones(1) else: torch._assert(condition, message) return torch.ones(1) except torch.fx.proxy.TraceError as e: print( "Found a non-FX compatible assertion. Skipping the check. Failure is shown below" + str(e) ) return torch.zeros(1) ================================================ FILE: annotator/oneformer/detectron2/utils/video_visualizer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import numpy as np from typing import List import annotator.oneformer.pycocotools.mask as mask_util from annotator.oneformer.detectron2.structures import Instances from annotator.oneformer.detectron2.utils.visualizer import ( ColorMode, Visualizer, _create_text_labels, _PanopticPrediction, ) from .colormap import random_color, random_colors class _DetectedInstance: """ Used to store data about detected objects in video frame, in order to transfer color to objects in the future frames. Attributes: label (int): bbox (tuple[float]): mask_rle (dict): color (tuple[float]): RGB colors in range (0, 1) ttl (int): time-to-live for the instance. For example, if ttl=2, the instance color can be transferred to objects in the next two frames. """ __slots__ = ["label", "bbox", "mask_rle", "color", "ttl"] def __init__(self, label, bbox, mask_rle, color, ttl): self.label = label self.bbox = bbox self.mask_rle = mask_rle self.color = color self.ttl = ttl class VideoVisualizer: def __init__(self, metadata, instance_mode=ColorMode.IMAGE): """ Args: metadata (MetadataCatalog): image metadata. """ self.metadata = metadata self._old_instances = [] assert instance_mode in [ ColorMode.IMAGE, ColorMode.IMAGE_BW, ], "Other mode not supported yet." self._instance_mode = instance_mode self._max_num_instances = self.metadata.get("max_num_instances", 74) self._assigned_colors = {} self._color_pool = random_colors(self._max_num_instances, rgb=True, maximum=1) self._color_idx_set = set(range(len(self._color_pool))) def draw_instance_predictions(self, frame, predictions): """ Draw instance-level prediction results on an image. Args: frame (ndarray): an RGB image of shape (H, W, C), in the range [0, 255]. predictions (Instances): the output of an instance detection/segmentation model. Following fields will be used to draw: "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). Returns: output (VisImage): image object with visualizations. """ frame_visualizer = Visualizer(frame, self.metadata) num_instances = len(predictions) if num_instances == 0: return frame_visualizer.output boxes = predictions.pred_boxes.tensor.numpy() if predictions.has("pred_boxes") else None scores = predictions.scores if predictions.has("scores") else None classes = predictions.pred_classes.numpy() if predictions.has("pred_classes") else None keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None colors = predictions.COLOR if predictions.has("COLOR") else [None] * len(predictions) periods = predictions.ID_period if predictions.has("ID_period") else None period_threshold = self.metadata.get("period_threshold", 0) visibilities = ( [True] * len(predictions) if periods is None else [x > period_threshold for x in periods] ) if predictions.has("pred_masks"): masks = predictions.pred_masks # mask IOU is not yet enabled # masks_rles = mask_util.encode(np.asarray(masks.permute(1, 2, 0), order="F")) # assert len(masks_rles) == num_instances else: masks = None if not predictions.has("COLOR"): if predictions.has("ID"): colors = self._assign_colors_by_id(predictions) else: # ToDo: clean old assign color method and use a default tracker to assign id detected = [ _DetectedInstance(classes[i], boxes[i], mask_rle=None, color=colors[i], ttl=8) for i in range(num_instances) ] colors = self._assign_colors(detected) labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) if self._instance_mode == ColorMode.IMAGE_BW: # any() returns uint8 tensor frame_visualizer.output.reset_image( frame_visualizer._create_grayscale_image( (masks.any(dim=0) > 0).numpy() if masks is not None else None ) ) alpha = 0.3 else: alpha = 0.5 labels = ( None if labels is None else [y[0] for y in filter(lambda x: x[1], zip(labels, visibilities))] ) # noqa assigned_colors = ( None if colors is None else [y[0] for y in filter(lambda x: x[1], zip(colors, visibilities))] ) # noqa frame_visualizer.overlay_instances( boxes=None if masks is not None else boxes[visibilities], # boxes are a bit distracting masks=None if masks is None else masks[visibilities], labels=labels, keypoints=None if keypoints is None else keypoints[visibilities], assigned_colors=assigned_colors, alpha=alpha, ) return frame_visualizer.output def draw_sem_seg(self, frame, sem_seg, area_threshold=None): """ Args: sem_seg (ndarray or Tensor): semantic segmentation of shape (H, W), each value is the integer label. area_threshold (Optional[int]): only draw segmentations larger than the threshold """ # don't need to do anything special frame_visualizer = Visualizer(frame, self.metadata) frame_visualizer.draw_sem_seg(sem_seg, area_threshold=None) return frame_visualizer.output def draw_panoptic_seg_predictions( self, frame, panoptic_seg, segments_info, area_threshold=None, alpha=0.5 ): frame_visualizer = Visualizer(frame, self.metadata) pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata) if self._instance_mode == ColorMode.IMAGE_BW: frame_visualizer.output.reset_image( frame_visualizer._create_grayscale_image(pred.non_empty_mask()) ) # draw mask for all semantic segments first i.e. "stuff" for mask, sinfo in pred.semantic_masks(): category_idx = sinfo["category_id"] try: mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] except AttributeError: mask_color = None frame_visualizer.draw_binary_mask( mask, color=mask_color, text=self.metadata.stuff_classes[category_idx], alpha=alpha, area_threshold=area_threshold, ) all_instances = list(pred.instance_masks()) if len(all_instances) == 0: return frame_visualizer.output # draw mask for all instances second masks, sinfo = list(zip(*all_instances)) num_instances = len(masks) masks_rles = mask_util.encode( np.asarray(np.asarray(masks).transpose(1, 2, 0), dtype=np.uint8, order="F") ) assert len(masks_rles) == num_instances category_ids = [x["category_id"] for x in sinfo] detected = [ _DetectedInstance(category_ids[i], bbox=None, mask_rle=masks_rles[i], color=None, ttl=8) for i in range(num_instances) ] colors = self._assign_colors(detected) labels = [self.metadata.thing_classes[k] for k in category_ids] frame_visualizer.overlay_instances( boxes=None, masks=masks, labels=labels, keypoints=None, assigned_colors=colors, alpha=alpha, ) return frame_visualizer.output def _assign_colors(self, instances): """ Naive tracking heuristics to assign same color to the same instance, will update the internal state of tracked instances. Returns: list[tuple[float]]: list of colors. """ # Compute iou with either boxes or masks: is_crowd = np.zeros((len(instances),), dtype=bool) if instances[0].bbox is None: assert instances[0].mask_rle is not None # use mask iou only when box iou is None # because box seems good enough rles_old = [x.mask_rle for x in self._old_instances] rles_new = [x.mask_rle for x in instances] ious = mask_util.iou(rles_old, rles_new, is_crowd) threshold = 0.5 else: boxes_old = [x.bbox for x in self._old_instances] boxes_new = [x.bbox for x in instances] ious = mask_util.iou(boxes_old, boxes_new, is_crowd) threshold = 0.6 if len(ious) == 0: ious = np.zeros((len(self._old_instances), len(instances)), dtype="float32") # Only allow matching instances of the same label: for old_idx, old in enumerate(self._old_instances): for new_idx, new in enumerate(instances): if old.label != new.label: ious[old_idx, new_idx] = 0 matched_new_per_old = np.asarray(ious).argmax(axis=1) max_iou_per_old = np.asarray(ious).max(axis=1) # Try to find match for each old instance: extra_instances = [] for idx, inst in enumerate(self._old_instances): if max_iou_per_old[idx] > threshold: newidx = matched_new_per_old[idx] if instances[newidx].color is None: instances[newidx].color = inst.color continue # If an old instance does not match any new instances, # keep it for the next frame in case it is just missed by the detector inst.ttl -= 1 if inst.ttl > 0: extra_instances.append(inst) # Assign random color to newly-detected instances: for inst in instances: if inst.color is None: inst.color = random_color(rgb=True, maximum=1) self._old_instances = instances[:] + extra_instances return [d.color for d in instances] def _assign_colors_by_id(self, instances: Instances) -> List: colors = [] untracked_ids = set(self._assigned_colors.keys()) for id in instances.ID: if id in self._assigned_colors: colors.append(self._color_pool[self._assigned_colors[id]]) untracked_ids.remove(id) else: assert ( len(self._color_idx_set) >= 1 ), f"Number of id exceeded maximum, \ max = {self._max_num_instances}" idx = self._color_idx_set.pop() color = self._color_pool[idx] self._assigned_colors[id] = idx colors.append(color) for id in untracked_ids: self._color_idx_set.add(self._assigned_colors[id]) del self._assigned_colors[id] return colors ================================================ FILE: annotator/oneformer/detectron2/utils/visualizer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import colorsys import logging import math import numpy as np from enum import Enum, unique import cv2 import matplotlib as mpl import matplotlib.colors as mplc import matplotlib.figure as mplfigure import annotator.oneformer.pycocotools.mask as mask_util import torch from matplotlib.backends.backend_agg import FigureCanvasAgg from PIL import Image from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes from annotator.oneformer.detectron2.utils.file_io import PathManager from .colormap import random_color logger = logging.getLogger(__name__) __all__ = ["ColorMode", "VisImage", "Visualizer"] _SMALL_OBJECT_AREA_THRESH = 1000 _LARGE_MASK_AREA_THRESH = 120000 _OFF_WHITE = (1.0, 1.0, 240.0 / 255) _BLACK = (0, 0, 0) _RED = (1.0, 0, 0) _KEYPOINT_THRESHOLD = 0.05 @unique class ColorMode(Enum): """ Enum of different color modes to use for instance visualizations. """ IMAGE = 0 """ Picks a random color for every instance and overlay segmentations with low opacity. """ SEGMENTATION = 1 """ Let instances of the same category have similar colors (from metadata.thing_colors), and overlay them with high opacity. This provides more attention on the quality of segmentation. """ IMAGE_BW = 2 """ Same as IMAGE, but convert all areas without masks to gray-scale. Only available for drawing per-instance mask predictions. """ class GenericMask: """ Attribute: polygons (list[ndarray]): list[ndarray]: polygons for this mask. Each ndarray has format [x, y, x, y, ...] mask (ndarray): a binary mask """ def __init__(self, mask_or_polygons, height, width): self._mask = self._polygons = self._has_holes = None self.height = height self.width = width m = mask_or_polygons if isinstance(m, dict): # RLEs assert "counts" in m and "size" in m if isinstance(m["counts"], list): # uncompressed RLEs h, w = m["size"] assert h == height and w == width m = mask_util.frPyObjects(m, h, w) self._mask = mask_util.decode(m)[:, :] return if isinstance(m, list): # list[ndarray] self._polygons = [np.asarray(x).reshape(-1) for x in m] return if isinstance(m, np.ndarray): # assumed to be a binary mask assert m.shape[1] != 2, m.shape assert m.shape == ( height, width, ), f"mask shape: {m.shape}, target dims: {height}, {width}" self._mask = m.astype("uint8") return raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m))) @property def mask(self): if self._mask is None: self._mask = self.polygons_to_mask(self._polygons) return self._mask @property def polygons(self): if self._polygons is None: self._polygons, self._has_holes = self.mask_to_polygons(self._mask) return self._polygons @property def has_holes(self): if self._has_holes is None: if self._mask is not None: self._polygons, self._has_holes = self.mask_to_polygons(self._mask) else: self._has_holes = False # if original format is polygon, does not have holes return self._has_holes def mask_to_polygons(self, mask): # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level # hierarchy. External contours (boundary) of the object are placed in hierarchy-1. # Internal contours (holes) are placed in hierarchy-2. # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours. mask = np.ascontiguousarray(mask) # some versions of cv2 does not support incontiguous arr res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) hierarchy = res[-1] if hierarchy is None: # empty mask return [], False has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0 res = res[-2] res = [x.flatten() for x in res] # These coordinates from OpenCV are integers in range [0, W-1 or H-1]. # We add 0.5 to turn them into real-value coordinate space. A better solution # would be to first +0.5 and then dilate the returned polygon by 0.5. res = [x + 0.5 for x in res if len(x) >= 6] return res, has_holes def polygons_to_mask(self, polygons): rle = mask_util.frPyObjects(polygons, self.height, self.width) rle = mask_util.merge(rle) return mask_util.decode(rle)[:, :] def area(self): return self.mask.sum() def bbox(self): p = mask_util.frPyObjects(self.polygons, self.height, self.width) p = mask_util.merge(p) bbox = mask_util.toBbox(p) bbox[2] += bbox[0] bbox[3] += bbox[1] return bbox class _PanopticPrediction: """ Unify different panoptic annotation/prediction formats """ def __init__(self, panoptic_seg, segments_info, metadata=None): if segments_info is None: assert metadata is not None # If "segments_info" is None, we assume "panoptic_img" is a # H*W int32 image storing the panoptic_id in the format of # category_id * label_divisor + instance_id. We reserve -1 for # VOID label. label_divisor = metadata.label_divisor segments_info = [] for panoptic_label in np.unique(panoptic_seg.numpy()): if panoptic_label == -1: # VOID region. continue pred_class = panoptic_label // label_divisor isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values() segments_info.append( { "id": int(panoptic_label), "category_id": int(pred_class), "isthing": bool(isthing), } ) del metadata self._seg = panoptic_seg self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True) areas = areas.numpy() sorted_idxs = np.argsort(-areas) self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs] self._seg_ids = self._seg_ids.tolist() for sid, area in zip(self._seg_ids, self._seg_areas): if sid in self._sinfo: self._sinfo[sid]["area"] = float(area) def non_empty_mask(self): """ Returns: (H, W) array, a mask for all pixels that have a prediction """ empty_ids = [] for id in self._seg_ids: if id not in self._sinfo: empty_ids.append(id) if len(empty_ids) == 0: return np.zeros(self._seg.shape, dtype=np.uint8) assert ( len(empty_ids) == 1 ), ">1 ids corresponds to no labels. This is currently not supported" return (self._seg != empty_ids[0]).numpy().astype(bool) def semantic_masks(self): for sid in self._seg_ids: sinfo = self._sinfo.get(sid) if sinfo is None or sinfo["isthing"]: # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions. continue yield (self._seg == sid).numpy().astype(bool), sinfo def instance_masks(self): for sid in self._seg_ids: sinfo = self._sinfo.get(sid) if sinfo is None or not sinfo["isthing"]: continue mask = (self._seg == sid).numpy().astype(bool) if mask.sum() > 0: yield mask, sinfo def _create_text_labels(classes, scores, class_names, is_crowd=None): """ Args: classes (list[int] or None): scores (list[float] or None): class_names (list[str] or None): is_crowd (list[bool] or None): Returns: list[str] or None """ labels = None if classes is not None: if class_names is not None and len(class_names) > 0: labels = [class_names[i] for i in classes] else: labels = [str(i) for i in classes] if scores is not None: if labels is None: labels = ["{:.0f}%".format(s * 100) for s in scores] else: labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] if labels is not None and is_crowd is not None: labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)] return labels class VisImage: def __init__(self, img, scale=1.0): """ Args: img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255]. scale (float): scale the input image """ self.img = img self.scale = scale self.width, self.height = img.shape[1], img.shape[0] self._setup_figure(img) def _setup_figure(self, img): """ Args: Same as in :meth:`__init__()`. Returns: fig (matplotlib.pyplot.figure): top level container for all the image plot elements. ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system. """ fig = mplfigure.Figure(frameon=False) self.dpi = fig.get_dpi() # add a small 1e-2 to avoid precision lost due to matplotlib's truncation # (https://github.com/matplotlib/matplotlib/issues/15363) fig.set_size_inches( (self.width * self.scale + 1e-2) / self.dpi, (self.height * self.scale + 1e-2) / self.dpi, ) self.canvas = FigureCanvasAgg(fig) # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig) ax = fig.add_axes([0.0, 0.0, 1.0, 1.0]) ax.axis("off") self.fig = fig self.ax = ax self.reset_image(img) def reset_image(self, img): """ Args: img: same as in __init__ """ img = img.astype("uint8") self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest") def save(self, filepath): """ Args: filepath (str): a string that contains the absolute path, including the file name, where the visualized image will be saved. """ self.fig.savefig(filepath) def get_image(self): """ Returns: ndarray: the visualized image of shape (H, W, 3) (RGB) in uint8 type. The shape is scaled w.r.t the input image using the given `scale` argument. """ canvas = self.canvas s, (width, height) = canvas.print_to_buffer() # buf = io.BytesIO() # works for cairo backend # canvas.print_rgba(buf) # width, height = self.width, self.height # s = buf.getvalue() buffer = np.frombuffer(s, dtype="uint8") img_rgba = buffer.reshape(height, width, 4) rgb, alpha = np.split(img_rgba, [3], axis=2) return rgb.astype("uint8") class Visualizer: """ Visualizer that draws data about detection/segmentation on images. It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}` that draw primitive objects to images, as well as high-level wrappers like `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}` that draw composite data in some pre-defined style. Note that the exact visualization style for the high-level wrappers are subject to change. Style such as color, opacity, label contents, visibility of labels, or even the visibility of objects themselves (e.g. when the object is too small) may change according to different heuristics, as long as the results still look visually reasonable. To obtain a consistent style, you can implement custom drawing functions with the abovementioned primitive methods instead. If you need more customized visualization styles, you can process the data yourself following their format documented in tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not intend to satisfy everyone's preference on drawing styles. This visualizer focuses on high rendering quality rather than performance. It is not designed to be used for real-time applications. """ # TODO implement a fast, rasterized version using OpenCV def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): """ Args: img_rgb: a numpy array of shape (H, W, C), where H and W correspond to the height and width of the image respectively. C is the number of color channels. The image is required to be in RGB format since that is a requirement of the Matplotlib library. The image is also expected to be in the range [0, 255]. metadata (Metadata): dataset metadata (e.g. class names and colors) instance_mode (ColorMode): defines one of the pre-defined style for drawing instances on an image. """ self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) if metadata is None: metadata = MetadataCatalog.get("__nonexist__") self.metadata = metadata self.output = VisImage(self.img, scale=scale) self.cpu_device = torch.device("cpu") # too small texts are useless, therefore clamp to 9 self._default_font_size = max( np.sqrt(self.output.height * self.output.width) // 90, 10 // scale ) self._instance_mode = instance_mode self.keypoint_threshold = _KEYPOINT_THRESHOLD def draw_instance_predictions(self, predictions): """ Draw instance-level prediction results on an image. Args: predictions (Instances): the output of an instance detection/segmentation model. Following fields will be used to draw: "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). Returns: output (VisImage): image object with visualizations. """ boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None scores = predictions.scores if predictions.has("scores") else None classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None if predictions.has("pred_masks"): masks = np.asarray(predictions.pred_masks) masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] else: masks = None if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): colors = [ self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes ] alpha = 0.8 else: colors = None alpha = 0.5 if self._instance_mode == ColorMode.IMAGE_BW: self.output.reset_image( self._create_grayscale_image( (predictions.pred_masks.any(dim=0) > 0).numpy() if predictions.has("pred_masks") else None ) ) alpha = 0.3 self.overlay_instances( masks=masks, boxes=boxes, labels=labels, keypoints=keypoints, assigned_colors=colors, alpha=alpha, ) return self.output def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8): """ Draw semantic segmentation predictions/labels. Args: sem_seg (Tensor or ndarray): the segmentation of shape (H, W). Each value is the integer label of the pixel. area_threshold (int): segments with less than `area_threshold` are not drawn. alpha (float): the larger it is, the more opaque the segmentations are. Returns: output (VisImage): image object with visualizations. """ if isinstance(sem_seg, torch.Tensor): sem_seg = sem_seg.numpy() labels, areas = np.unique(sem_seg, return_counts=True) sorted_idxs = np.argsort(-areas).tolist() labels = labels[sorted_idxs] for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels): try: mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] except (AttributeError, IndexError): mask_color = None binary_mask = (sem_seg == label).astype(np.uint8) text = self.metadata.stuff_classes[label] self.draw_binary_mask( binary_mask, color=mask_color, edge_color=_OFF_WHITE, text=text, alpha=alpha, area_threshold=area_threshold, ) return self.output def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7): """ Draw panoptic prediction annotations or results. Args: panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. segments_info (list[dict] or None): Describe each segment in `panoptic_seg`. If it is a ``list[dict]``, each dict contains keys "id", "category_id". If None, category id of each pixel is computed by ``pixel // metadata.label_divisor``. area_threshold (int): stuff segments with less than `area_threshold` are not drawn. Returns: output (VisImage): image object with visualizations. """ pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata) if self._instance_mode == ColorMode.IMAGE_BW: self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask())) # draw mask for all semantic segments first i.e. "stuff" for mask, sinfo in pred.semantic_masks(): category_idx = sinfo["category_id"] try: mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] except AttributeError: mask_color = None text = self.metadata.stuff_classes[category_idx] self.draw_binary_mask( mask, color=mask_color, edge_color=_OFF_WHITE, text=text, alpha=alpha, area_threshold=area_threshold, ) # draw mask for all instances second all_instances = list(pred.instance_masks()) if len(all_instances) == 0: return self.output masks, sinfo = list(zip(*all_instances)) category_ids = [x["category_id"] for x in sinfo] try: scores = [x["score"] for x in sinfo] except KeyError: scores = None labels = _create_text_labels( category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo] ) try: colors = [ self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids ] except AttributeError: colors = None self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha) return self.output draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility def draw_dataset_dict(self, dic): """ Draw annotations/segmentations in Detectron2 Dataset format. Args: dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. Returns: output (VisImage): image object with visualizations. """ annos = dic.get("annotations", None) if annos: if "segmentation" in annos[0]: masks = [x["segmentation"] for x in annos] else: masks = None if "keypoints" in annos[0]: keypts = [x["keypoints"] for x in annos] keypts = np.array(keypts).reshape(len(annos), -1, 3) else: keypts = None boxes = [ BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) if len(x["bbox"]) == 4 else x["bbox"] for x in annos ] colors = None category_ids = [x["category_id"] for x in annos] if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): colors = [ self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids ] names = self.metadata.get("thing_classes", None) labels = _create_text_labels( category_ids, scores=None, class_names=names, is_crowd=[x.get("iscrowd", 0) for x in annos], ) self.overlay_instances( labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors ) sem_seg = dic.get("sem_seg", None) if sem_seg is None and "sem_seg_file_name" in dic: with PathManager.open(dic["sem_seg_file_name"], "rb") as f: sem_seg = Image.open(f) sem_seg = np.asarray(sem_seg, dtype="uint8") if sem_seg is not None: self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5) pan_seg = dic.get("pan_seg", None) if pan_seg is None and "pan_seg_file_name" in dic: with PathManager.open(dic["pan_seg_file_name"], "rb") as f: pan_seg = Image.open(f) pan_seg = np.asarray(pan_seg) from panopticapi.utils import rgb2id pan_seg = rgb2id(pan_seg) if pan_seg is not None: segments_info = dic["segments_info"] pan_seg = torch.tensor(pan_seg) self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5) return self.output def overlay_instances( self, *, boxes=None, labels=None, masks=None, keypoints=None, assigned_colors=None, alpha=0.5, ): """ Args: boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`, or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, or a :class:`RotatedBoxes`, or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format for the N objects in a single image, labels (list[str]): the text to be displayed for each instance. masks (masks-like object): Supported types are: * :class:`detectron2.structures.PolygonMasks`, :class:`detectron2.structures.BitMasks`. * list[list[ndarray]]: contains the segmentation masks for all objects in one image. The first level of the list corresponds to individual instances. The second level to all the polygon that compose the instance, and the third level to the polygon coordinates. The third level should have the format of [x0, y0, x1, y1, ..., xn, yn] (n >= 3). * list[ndarray]: each ndarray is a binary mask of shape (H, W). * list[dict]: each dict is a COCO-style RLE. keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), where the N is the number of instances and K is the number of keypoints. The last dimension corresponds to (x, y, visibility or score). assigned_colors (list[matplotlib.colors]): a list of colors, where each color corresponds to each mask or box in the image. Refer to 'matplotlib.colors' for full list of formats that the colors are accepted in. Returns: output (VisImage): image object with visualizations. """ num_instances = 0 if boxes is not None: boxes = self._convert_boxes(boxes) num_instances = len(boxes) if masks is not None: masks = self._convert_masks(masks) if num_instances: assert len(masks) == num_instances else: num_instances = len(masks) if keypoints is not None: if num_instances: assert len(keypoints) == num_instances else: num_instances = len(keypoints) keypoints = self._convert_keypoints(keypoints) if labels is not None: assert len(labels) == num_instances if assigned_colors is None: assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] if num_instances == 0: return self.output if boxes is not None and boxes.shape[1] == 5: return self.overlay_rotated_instances( boxes=boxes, labels=labels, assigned_colors=assigned_colors ) # Display in largest to smallest order to reduce occlusion. areas = None if boxes is not None: areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) elif masks is not None: areas = np.asarray([x.area() for x in masks]) if areas is not None: sorted_idxs = np.argsort(-areas).tolist() # Re-order overlapped instances in descending order. boxes = boxes[sorted_idxs] if boxes is not None else None labels = [labels[k] for k in sorted_idxs] if labels is not None else None masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] keypoints = keypoints[sorted_idxs] if keypoints is not None else None for i in range(num_instances): color = assigned_colors[i] if boxes is not None: self.draw_box(boxes[i], edge_color=color) if masks is not None: for segment in masks[i].polygons: self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) if labels is not None: # first get a box if boxes is not None: x0, y0, x1, y1 = boxes[i] text_pos = (x0, y0) # if drawing boxes, put text on the box corner. horiz_align = "left" elif masks is not None: # skip small mask without polygon if len(masks[i].polygons) == 0: continue x0, y0, x1, y1 = masks[i].bbox() # draw text in the center (defined by median) when box is not drawn # median is less sensitive to outliers. text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] horiz_align = "center" else: continue # drawing the box confidence for keypoints isn't very useful. # for small objects, draw text at the side to avoid occlusion instance_area = (y1 - y0) * (x1 - x0) if ( instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale or y1 - y0 < 40 * self.output.scale ): if y1 >= self.output.height - 5: text_pos = (x1, y0) else: text_pos = (x0, y1) height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) lighter_color = self._change_color_brightness(color, brightness_factor=0.7) font_size = ( np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size ) self.draw_text( labels[i], text_pos, color=lighter_color, horizontal_alignment=horiz_align, font_size=font_size, ) # draw keypoints if keypoints is not None: for keypoints_per_instance in keypoints: self.draw_and_connect_keypoints(keypoints_per_instance) return self.output def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None): """ Args: boxes (ndarray): an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format for the N objects in a single image. labels (list[str]): the text to be displayed for each instance. assigned_colors (list[matplotlib.colors]): a list of colors, where each color corresponds to each mask or box in the image. Refer to 'matplotlib.colors' for full list of formats that the colors are accepted in. Returns: output (VisImage): image object with visualizations. """ num_instances = len(boxes) if assigned_colors is None: assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] if num_instances == 0: return self.output # Display in largest to smallest order to reduce occlusion. if boxes is not None: areas = boxes[:, 2] * boxes[:, 3] sorted_idxs = np.argsort(-areas).tolist() # Re-order overlapped instances in descending order. boxes = boxes[sorted_idxs] labels = [labels[k] for k in sorted_idxs] if labels is not None else None colors = [assigned_colors[idx] for idx in sorted_idxs] for i in range(num_instances): self.draw_rotated_box_with_label( boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None ) return self.output def draw_and_connect_keypoints(self, keypoints): """ Draws keypoints of an instance and follows the rules for keypoint connections to draw lines between appropriate keypoints. This follows color heuristics for line color. Args: keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints and the last dimension corresponds to (x, y, probability). Returns: output (VisImage): image object with visualizations. """ visible = {} keypoint_names = self.metadata.get("keypoint_names") for idx, keypoint in enumerate(keypoints): # draw keypoint x, y, prob = keypoint if prob > self.keypoint_threshold: self.draw_circle((x, y), color=_RED) if keypoint_names: keypoint_name = keypoint_names[idx] visible[keypoint_name] = (x, y) if self.metadata.get("keypoint_connection_rules"): for kp0, kp1, color in self.metadata.keypoint_connection_rules: if kp0 in visible and kp1 in visible: x0, y0 = visible[kp0] x1, y1 = visible[kp1] color = tuple(x / 255.0 for x in color) self.draw_line([x0, x1], [y0, y1], color=color) # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip # Note that this strategy is specific to person keypoints. # For other keypoints, it should just do nothing try: ls_x, ls_y = visible["left_shoulder"] rs_x, rs_y = visible["right_shoulder"] mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2 except KeyError: pass else: # draw line from nose to mid-shoulder nose_x, nose_y = visible.get("nose", (None, None)) if nose_x is not None: self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED) try: # draw line from mid-shoulder to mid-hip lh_x, lh_y = visible["left_hip"] rh_x, rh_y = visible["right_hip"] except KeyError: pass else: mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2 self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED) return self.output """ Primitive drawing functions: """ def draw_text( self, text, position, *, font_size=None, color="g", horizontal_alignment="center", rotation=0, ): """ Args: text (str): class label position (tuple): a tuple of the x and y coordinates to place text on image. font_size (int, optional): font of the text. If not provided, a font size proportional to the image width is calculated and used. color: color of the text. Refer to `matplotlib.colors` for full list of formats that are accepted. horizontal_alignment (str): see `matplotlib.text.Text` rotation: rotation angle in degrees CCW Returns: output (VisImage): image object with text drawn. """ if not font_size: font_size = self._default_font_size # since the text background is dark, we don't want the text to be dark color = np.maximum(list(mplc.to_rgb(color)), 0.2) color[np.argmax(color)] = max(0.8, np.max(color)) x, y = position self.output.ax.text( x, y, text, size=font_size * self.output.scale, family="sans-serif", bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}, verticalalignment="top", horizontalalignment=horizontal_alignment, color=color, zorder=10, rotation=rotation, ) return self.output def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"): """ Args: box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0 are the coordinates of the image's top left corner. x1 and y1 are the coordinates of the image's bottom right corner. alpha (float): blending efficient. Smaller values lead to more transparent masks. edge_color: color of the outline of the box. Refer to `matplotlib.colors` for full list of formats that are accepted. line_style (string): the string to use to create the outline of the boxes. Returns: output (VisImage): image object with box drawn. """ x0, y0, x1, y1 = box_coord width = x1 - x0 height = y1 - y0 linewidth = max(self._default_font_size / 4, 1) self.output.ax.add_patch( mpl.patches.Rectangle( (x0, y0), width, height, fill=False, edgecolor=edge_color, linewidth=linewidth * self.output.scale, alpha=alpha, linestyle=line_style, ) ) return self.output def draw_rotated_box_with_label( self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None ): """ Draw a rotated box with label on its top-left corner. Args: rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle), where cnt_x and cnt_y are the center coordinates of the box. w and h are the width and height of the box. angle represents how many degrees the box is rotated CCW with regard to the 0-degree box. alpha (float): blending efficient. Smaller values lead to more transparent masks. edge_color: color of the outline of the box. Refer to `matplotlib.colors` for full list of formats that are accepted. line_style (string): the string to use to create the outline of the boxes. label (string): label for rotated box. It will not be rendered when set to None. Returns: output (VisImage): image object with box drawn. """ cnt_x, cnt_y, w, h, angle = rotated_box area = w * h # use thinner lines when the box is small linewidth = self._default_font_size / ( 6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3 ) theta = angle * math.pi / 180.0 c = math.cos(theta) s = math.sin(theta) rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)] # x: left->right ; y: top->down rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect] for k in range(4): j = (k + 1) % 4 self.draw_line( [rotated_rect[k][0], rotated_rect[j][0]], [rotated_rect[k][1], rotated_rect[j][1]], color=edge_color, linestyle="--" if k == 1 else line_style, linewidth=linewidth, ) if label is not None: text_pos = rotated_rect[1] # topleft corner height_ratio = h / np.sqrt(self.output.height * self.output.width) label_color = self._change_color_brightness(edge_color, brightness_factor=0.7) font_size = ( np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size ) self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle) return self.output def draw_circle(self, circle_coord, color, radius=3): """ Args: circle_coord (list(int) or tuple(int)): contains the x and y coordinates of the center of the circle. color: color of the polygon. Refer to `matplotlib.colors` for a full list of formats that are accepted. radius (int): radius of the circle. Returns: output (VisImage): image object with box drawn. """ x, y = circle_coord self.output.ax.add_patch( mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color) ) return self.output def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None): """ Args: x_data (list[int]): a list containing x values of all the points being drawn. Length of list should match the length of y_data. y_data (list[int]): a list containing y values of all the points being drawn. Length of list should match the length of x_data. color: color of the line. Refer to `matplotlib.colors` for a full list of formats that are accepted. linestyle: style of the line. Refer to `matplotlib.lines.Line2D` for a full list of formats that are accepted. linewidth (float or None): width of the line. When it's None, a default value will be computed and used. Returns: output (VisImage): image object with line drawn. """ if linewidth is None: linewidth = self._default_font_size / 3 linewidth = max(linewidth, 1) self.output.ax.add_line( mpl.lines.Line2D( x_data, y_data, linewidth=linewidth * self.output.scale, color=color, linestyle=linestyle, ) ) return self.output def draw_binary_mask( self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=10 ): """ Args: binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and W is the image width. Each value in the array is either a 0 or 1 value of uint8 type. color: color of the mask. Refer to `matplotlib.colors` for a full list of formats that are accepted. If None, will pick a random color. edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a full list of formats that are accepted. text (str): if None, will be drawn on the object alpha (float): blending efficient. Smaller values lead to more transparent masks. area_threshold (float): a connected component smaller than this area will not be shown. Returns: output (VisImage): image object with mask drawn. """ if color is None: color = random_color(rgb=True, maximum=1) color = mplc.to_rgb(color) has_valid_segment = False binary_mask = binary_mask.astype("uint8") # opencv needs uint8 mask = GenericMask(binary_mask, self.output.height, self.output.width) shape2d = (binary_mask.shape[0], binary_mask.shape[1]) if not mask.has_holes: # draw polygons for regular masks for segment in mask.polygons: area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) if area < (area_threshold or 0): continue has_valid_segment = True segment = segment.reshape(-1, 2) self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) else: # TODO: Use Path/PathPatch to draw vector graphics: # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon rgba = np.zeros(shape2d + (4,), dtype="float32") rgba[:, :, :3] = color rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha has_valid_segment = True self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) if text is not None and has_valid_segment: lighter_color = self._change_color_brightness(color, brightness_factor=0.7) self._draw_text_in_mask(binary_mask, text, lighter_color) return self.output def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5): """ Args: soft_mask (ndarray): float array of shape (H, W), each value in [0, 1]. color: color of the mask. Refer to `matplotlib.colors` for a full list of formats that are accepted. If None, will pick a random color. text (str): if None, will be drawn on the object alpha (float): blending efficient. Smaller values lead to more transparent masks. Returns: output (VisImage): image object with mask drawn. """ if color is None: color = random_color(rgb=True, maximum=1) color = mplc.to_rgb(color) shape2d = (soft_mask.shape[0], soft_mask.shape[1]) rgba = np.zeros(shape2d + (4,), dtype="float32") rgba[:, :, :3] = color rgba[:, :, 3] = soft_mask * alpha self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) if text is not None: lighter_color = self._change_color_brightness(color, brightness_factor=0.7) binary_mask = (soft_mask > 0.5).astype("uint8") self._draw_text_in_mask(binary_mask, text, lighter_color) return self.output def draw_polygon(self, segment, color, edge_color=None, alpha=0.5): """ Args: segment: numpy array of shape Nx2, containing all the points in the polygon. color: color of the polygon. Refer to `matplotlib.colors` for a full list of formats that are accepted. edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a full list of formats that are accepted. If not provided, a darker shade of the polygon color will be used instead. alpha (float): blending efficient. Smaller values lead to more transparent masks. Returns: output (VisImage): image object with polygon drawn. """ if edge_color is None: # make edge color darker than the polygon color if alpha > 0.8: edge_color = self._change_color_brightness(color, brightness_factor=-0.7) else: edge_color = color edge_color = mplc.to_rgb(edge_color) + (1,) polygon = mpl.patches.Polygon( segment, fill=True, facecolor=mplc.to_rgb(color) + (alpha,), edgecolor=edge_color, linewidth=max(self._default_font_size // 15 * self.output.scale, 1), ) self.output.ax.add_patch(polygon) return self.output """ Internal methods: """ def _jitter(self, color): """ Randomly modifies given color to produce a slightly different color than the color given. Args: color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color picked. The values in the list are in the [0.0, 1.0] range. Returns: jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color after being jittered. The values in the list are in the [0.0, 1.0] range. """ color = mplc.to_rgb(color) vec = np.random.rand(3) # better to do it in another color space vec = vec / np.linalg.norm(vec) * 0.5 res = np.clip(vec + color, 0, 1) return tuple(res) def _create_grayscale_image(self, mask=None): """ Create a grayscale version of the original image. The colors in masked area, if given, will be kept. """ img_bw = self.img.astype("f4").mean(axis=2) img_bw = np.stack([img_bw] * 3, axis=2) if mask is not None: img_bw[mask] = self.img[mask] return img_bw def _change_color_brightness(self, color, brightness_factor): """ Depending on the brightness_factor, gives a lighter or darker color i.e. a color with less or more saturation than the original color. Args: color: color of the polygon. Refer to `matplotlib.colors` for a full list of formats that are accepted. brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of 0 will correspond to no change, a factor in [-1.0, 0) range will result in a darker color and a factor in (0, 1.0] range will result in a lighter color. Returns: modified_color (tuple[double]): a tuple containing the RGB values of the modified color. Each value in the tuple is in the [0.0, 1.0] range. """ assert brightness_factor >= -1.0 and brightness_factor <= 1.0 color = mplc.to_rgb(color) polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) return tuple(np.clip(modified_color, 0.0, 1.0)) def _convert_boxes(self, boxes): """ Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. """ if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): return boxes.tensor.detach().numpy() else: return np.asarray(boxes) def _convert_masks(self, masks_or_polygons): """ Convert different format of masks or polygons to a tuple of masks and polygons. Returns: list[GenericMask]: """ m = masks_or_polygons if isinstance(m, PolygonMasks): m = m.polygons if isinstance(m, BitMasks): m = m.tensor.numpy() if isinstance(m, torch.Tensor): m = m.numpy() ret = [] for x in m: if isinstance(x, GenericMask): ret.append(x) else: ret.append(GenericMask(x, self.output.height, self.output.width)) return ret def _draw_text_in_mask(self, binary_mask, text, color): """ Find proper places to draw text given a binary mask. """ # TODO sometimes drawn on wrong objects. the heuristics here can improve. _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) if stats[1:, -1].size == 0: return largest_component_id = np.argmax(stats[1:, -1]) + 1 # draw text on the largest component, as well as other very large components. for cid in range(1, _num_cc): if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: # median is more stable than centroid # center = centroids[largest_component_id] center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] self.draw_text(text, center, color=color) def _convert_keypoints(self, keypoints): if isinstance(keypoints, Keypoints): keypoints = keypoints.tensor keypoints = np.asarray(keypoints) return keypoints def get_output(self): """ Returns: output (VisImage): the image output containing the visualizations added to the image. """ return self.output ================================================ FILE: annotator/oneformer/oneformer/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from . import data # register all new datasets from . import modeling # config from .config import * # models from .oneformer_model import OneFormer ================================================ FILE: annotator/oneformer/oneformer/config.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. from annotator.oneformer.detectron2.config import CfgNode as CN __all__ = ["add_common_config", "add_oneformer_config", "add_swin_config", "add_dinat_config", "add_beit_adapter_config", "add_convnext_config"] def add_common_config(cfg): """ Add config for common configuration """ # data config # select the dataset mapper cfg.INPUT.DATASET_MAPPER_NAME = "oneformer_unified" # Color augmentation cfg.INPUT.COLOR_AUG_SSD = False # We retry random cropping until no single category in semantic segmentation GT occupies more # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 # Pad image and segmentation GT in dataset mapper. cfg.INPUT.SIZE_DIVISIBILITY = -1 cfg.INPUT.TASK_SEQ_LEN = 77 cfg.INPUT.MAX_SEQ_LEN = 77 cfg.INPUT.TASK_PROB = CN() cfg.INPUT.TASK_PROB.SEMANTIC = 0.33 cfg.INPUT.TASK_PROB.INSTANCE = 0.66 # test dataset cfg.DATASETS.TEST_PANOPTIC = ("",) cfg.DATASETS.TEST_INSTANCE = ("",) cfg.DATASETS.TEST_SEMANTIC = ("",) # solver config # weight decay on embedding cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 # optimizer cfg.SOLVER.OPTIMIZER = "ADAMW" cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 # wandb cfg.WANDB = CN() cfg.WANDB.PROJECT = "unified_dense_recognition" cfg.WANDB.NAME = None cfg.MODEL.IS_TRAIN = False cfg.MODEL.IS_DEMO = True # text encoder config cfg.MODEL.TEXT_ENCODER = CN() cfg.MODEL.TEXT_ENCODER.WIDTH = 256 cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH = 77 cfg.MODEL.TEXT_ENCODER.NUM_LAYERS = 12 cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE = 49408 cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS = 2 cfg.MODEL.TEXT_ENCODER.N_CTX = 16 # mask_former inference config cfg.MODEL.TEST = CN() cfg.MODEL.TEST.SEMANTIC_ON = True cfg.MODEL.TEST.INSTANCE_ON = False cfg.MODEL.TEST.PANOPTIC_ON = False cfg.MODEL.TEST.DETECTION_ON = False cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD = 0.0 cfg.MODEL.TEST.OVERLAP_THRESHOLD = 0.0 cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False cfg.MODEL.TEST.TASK = "panoptic" # TEST AUG Slide cfg.TEST.AUG.IS_SLIDE = False cfg.TEST.AUG.CROP_SIZE = (640, 640) cfg.TEST.AUG.STRIDE = (426, 426) cfg.TEST.AUG.SCALE = (2048, 640) cfg.TEST.AUG.SETR_MULTI_SCALE = True cfg.TEST.AUG.KEEP_RATIO = True cfg.TEST.AUG.SIZE_DIVISOR = 32 # pixel decoder config cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 # adding transformer in pixel decoder cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 # pixel decoder cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" cfg.MODEL.SEM_SEG_HEAD.SEM_EMBED_DIM = 256 cfg.MODEL.SEM_SEG_HEAD.INST_EMBED_DIM = 256 # LSJ aug cfg.INPUT.IMAGE_SIZE = 1024 cfg.INPUT.MIN_SCALE = 0.1 cfg.INPUT.MAX_SCALE = 2.0 # MSDeformAttn encoder configs cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 def add_oneformer_config(cfg): """ Add config for ONE_FORMER. """ # mask_former model config cfg.MODEL.ONE_FORMER = CN() # loss cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION = True cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT = 0.1 cfg.MODEL.ONE_FORMER.CLASS_WEIGHT = 1.0 cfg.MODEL.ONE_FORMER.DICE_WEIGHT = 1.0 cfg.MODEL.ONE_FORMER.MASK_WEIGHT = 20.0 cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT = 0.5 cfg.MODEL.ONE_FORMER.CONTRASTIVE_TEMPERATURE = 0.07 # transformer config cfg.MODEL.ONE_FORMER.NHEADS = 8 cfg.MODEL.ONE_FORMER.DROPOUT = 0.1 cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD = 2048 cfg.MODEL.ONE_FORMER.ENC_LAYERS = 0 cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS = 2 cfg.MODEL.ONE_FORMER.DEC_LAYERS = 6 cfg.MODEL.ONE_FORMER.PRE_NORM = False cfg.MODEL.ONE_FORMER.HIDDEN_DIM = 256 cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES = 120 cfg.MODEL.ONE_FORMER.NUM_OBJECT_CTX = 16 cfg.MODEL.ONE_FORMER.USE_TASK_NORM = True cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE = "res5" cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ = False # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) # you can use this config to override cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY = 32 # transformer module cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME = "ContrastiveMultiScaleMaskedTransformerDecoder" # point loss configs # Number of points sampled during training for a mask point head. cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS = 112 * 112 # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the # original paper. cfg.MODEL.ONE_FORMER.OVERSAMPLE_RATIO = 3.0 # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in # the original paper. cfg.MODEL.ONE_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 def add_swin_config(cfg): """ Add config forSWIN Backbone. """ # swin transformer backbone cfg.MODEL.SWIN = CN() cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 cfg.MODEL.SWIN.PATCH_SIZE = 4 cfg.MODEL.SWIN.EMBED_DIM = 96 cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] cfg.MODEL.SWIN.WINDOW_SIZE = 7 cfg.MODEL.SWIN.MLP_RATIO = 4.0 cfg.MODEL.SWIN.QKV_BIAS = True cfg.MODEL.SWIN.QK_SCALE = None cfg.MODEL.SWIN.DROP_RATE = 0.0 cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 cfg.MODEL.SWIN.APE = False cfg.MODEL.SWIN.PATCH_NORM = True cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] cfg.MODEL.SWIN.USE_CHECKPOINT = False ## Semask additions cfg.MODEL.SWIN.SEM_WINDOW_SIZE = 7 cfg.MODEL.SWIN.NUM_SEM_BLOCKS = 1 def add_dinat_config(cfg): """ Add config for NAT Backbone. """ # DINAT transformer backbone cfg.MODEL.DiNAT = CN() cfg.MODEL.DiNAT.DEPTHS = [3, 4, 18, 5] cfg.MODEL.DiNAT.OUT_FEATURES = ["res2", "res3", "res4", "res5"] cfg.MODEL.DiNAT.EMBED_DIM = 64 cfg.MODEL.DiNAT.MLP_RATIO = 3.0 cfg.MODEL.DiNAT.NUM_HEADS = [2, 4, 8, 16] cfg.MODEL.DiNAT.DROP_PATH_RATE = 0.2 cfg.MODEL.DiNAT.KERNEL_SIZE = 7 cfg.MODEL.DiNAT.DILATIONS = [[1, 16, 1], [1, 4, 1, 8], [1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]] cfg.MODEL.DiNAT.OUT_INDICES = (0, 1, 2, 3) cfg.MODEL.DiNAT.QKV_BIAS = True cfg.MODEL.DiNAT.QK_SCALE = None cfg.MODEL.DiNAT.DROP_RATE = 0 cfg.MODEL.DiNAT.ATTN_DROP_RATE = 0. cfg.MODEL.DiNAT.IN_PATCH_SIZE = 4 def add_convnext_config(cfg): """ Add config for ConvNeXt Backbone. """ # swin transformer backbone cfg.MODEL.CONVNEXT = CN() cfg.MODEL.CONVNEXT.IN_CHANNELS = 3 cfg.MODEL.CONVNEXT.DEPTHS = [3, 3, 27, 3] cfg.MODEL.CONVNEXT.DIMS = [192, 384, 768, 1536] cfg.MODEL.CONVNEXT.DROP_PATH_RATE = 0.4 cfg.MODEL.CONVNEXT.LSIT = 1.0 cfg.MODEL.CONVNEXT.OUT_INDICES = [0, 1, 2, 3] cfg.MODEL.CONVNEXT.OUT_FEATURES = ["res2", "res3", "res4", "res5"] def add_beit_adapter_config(cfg): """ Add config for BEiT Adapter Backbone. """ # beit adapter backbone cfg.MODEL.BEiTAdapter = CN() cfg.MODEL.BEiTAdapter.IMG_SIZE = 640 cfg.MODEL.BEiTAdapter.PATCH_SIZE = 16 cfg.MODEL.BEiTAdapter.EMBED_DIM = 1024 cfg.MODEL.BEiTAdapter.DEPTH = 24 cfg.MODEL.BEiTAdapter.NUM_HEADS = 16 cfg.MODEL.BEiTAdapter.MLP_RATIO = 4 cfg.MODEL.BEiTAdapter.QKV_BIAS = True cfg.MODEL.BEiTAdapter.USE_ABS_POS_EMB = False cfg.MODEL.BEiTAdapter.USE_REL_POS_BIAS = True cfg.MODEL.BEiTAdapter.INIT_VALUES = 1e-6 cfg.MODEL.BEiTAdapter.DROP_PATH_RATE = 0.3 cfg.MODEL.BEiTAdapter.CONV_INPLANE = 64 cfg.MODEL.BEiTAdapter.N_POINTS = 4 cfg.MODEL.BEiTAdapter.DEFORM_NUM_HEADS = 16 cfg.MODEL.BEiTAdapter.CFFN_RATIO = 0.25 cfg.MODEL.BEiTAdapter.DEFORM_RATIO = 0.5 cfg.MODEL.BEiTAdapter.WITH_CP = True cfg.MODEL.BEiTAdapter.INTERACTION_INDEXES=[[0, 5], [6, 11], [12, 17], [18, 23]] cfg.MODEL.BEiTAdapter.OUT_FEATURES = ["res2", "res3", "res4", "res5"] ================================================ FILE: annotator/oneformer/oneformer/data/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from . import datasets ================================================ FILE: annotator/oneformer/oneformer/data/build.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from typing import Any, Callable, Dict, List, Optional, Union import torch.utils.data as torchdata from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.data.common import DatasetFromList, MapDataset from annotator.oneformer.detectron2.data.dataset_mapper import DatasetMapper from annotator.oneformer.detectron2.data.samplers import ( InferenceSampler, ) from annotator.oneformer.detectron2.data.build import ( get_detection_dataset_dicts, trivial_batch_collator ) """ This file contains the default logic to build a dataloader for training or testing. """ __all__ = [ "build_detection_test_loader", ] def _test_loader_from_config(cfg, dataset_name, mapper=None): """ Uses the given `dataset_name` argument (instead of the names in cfg), because the standard practice is to evaluate each test set individually (not combining them). """ if isinstance(dataset_name, str): dataset_name = [dataset_name] dataset = get_detection_dataset_dicts( dataset_name, filter_empty=False, proposal_files=[ cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name ] if cfg.MODEL.LOAD_PROPOSALS else None, ) if mapper is None: mapper = DatasetMapper(cfg, False) return { "dataset": dataset, "mapper": mapper, "num_workers": cfg.DATALOADER.NUM_WORKERS, "sampler": InferenceSampler(len(dataset)) if not isinstance(dataset, torchdata.IterableDataset) else None, } @configurable(from_config=_test_loader_from_config) def build_detection_test_loader( dataset: Union[List[Any], torchdata.Dataset], *, mapper: Callable[[Dict[str, Any]], Any], sampler: Optional[torchdata.Sampler] = None, batch_size: int = 1, num_workers: int = 0, collate_fn: Optional[Callable[[List[Any]], Any]] = None, ) -> torchdata.DataLoader: """ Similar to `build_detection_train_loader`, with default batch size = 1, and sampler = :class:`InferenceSampler`. This sampler coordinates all workers to produce the exact set of all samples. Args: dataset: a list of dataset dicts, or a pytorch dataset (either map-style or iterable). They can be obtained by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. mapper: a callable which takes a sample (dict) from dataset and returns the format to be consumed by the model. When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. sampler: a sampler that produces indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, which splits the dataset across all workers. Sampler must be None if `dataset` is iterable. batch_size: the batch size of the data loader to be created. Default to 1 image per worker since this is the standard when reporting inference time in papers. num_workers: number of parallel data loading workers collate_fn: same as the argument of `torch.utils.data.DataLoader`. Defaults to do no collation and return a list of data. Returns: DataLoader: a torch DataLoader, that loads the given detection dataset, with test-time transformation and batching. Examples: :: data_loader = build_detection_test_loader( DatasetRegistry.get("my_test"), mapper=DatasetMapper(...)) # or, instantiate with a CfgNode: data_loader = build_detection_test_loader(cfg, "my_test") """ if isinstance(dataset, list): dataset = DatasetFromList(dataset, copy=False) if mapper is not None: dataset = MapDataset(dataset, mapper) if isinstance(dataset, torchdata.IterableDataset): assert sampler is None, "sampler must be None if dataset is IterableDataset" else: if sampler is None: sampler = InferenceSampler(len(dataset)) return torchdata.DataLoader( dataset, batch_size=batch_size, sampler=sampler, drop_last=False, num_workers=num_workers, collate_fn=trivial_batch_collator if collate_fn is None else collate_fn, ) ================================================ FILE: annotator/oneformer/oneformer/data/dataset_mappers/__init__.py ================================================ ================================================ FILE: annotator/oneformer/oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import copy import logging import numpy as np import torch from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.data import detection_utils as utils from annotator.oneformer.detectron2.data import transforms as T from annotator.oneformer.detectron2.structures import BitMasks, Instances from annotator.oneformer.oneformer.utils.box_ops import masks_to_boxes from annotator.oneformer.oneformer.data.tokenizer import SimpleTokenizer, Tokenize __all__ = ["COCOUnifiedNewBaselineDatasetMapper"] def build_transform_gen(cfg, is_train): """ Create a list of default :class:`Augmentation` from config. Now it includes resizing and flipping. Returns: list[Augmentation] """ assert is_train, "Only support training augmentation" image_size = cfg.INPUT.IMAGE_SIZE min_scale = cfg.INPUT.MIN_SCALE max_scale = cfg.INPUT.MAX_SCALE augmentation = [] if cfg.INPUT.RANDOM_FLIP != "none": augmentation.append( T.RandomFlip( horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", vertical=cfg.INPUT.RANDOM_FLIP == "vertical", ) ) augmentation.extend([ T.ResizeScale( min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size ), T.FixedSizeCrop(crop_size=(image_size, image_size)), ]) return augmentation # This is specifically designed for the COCO dataset. class COCOUnifiedNewBaselineDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by OneFormer. This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, num_queries, tfm_gens, meta, image_format, max_seq_len, task_seq_len, semantic_prob, instance_prob, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply crop_gen: crop augmentation tfm_gens: data augmentation image_format: an image format supported by :func:`detection_utils.read_image`. """ self.tfm_gens = tfm_gens logging.getLogger(__name__).info( "[COCOUnifiedNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( str(self.tfm_gens) ) ) self.img_format = image_format self.is_train = is_train self.meta = meta self.ignore_label = self.meta.ignore_label self.num_queries = num_queries self.things = [] for k,v in self.meta.thing_dataset_id_to_contiguous_id.items(): self.things.append(v) self.class_names = self.meta.stuff_classes self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len) self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len) self.semantic_prob = semantic_prob self.instance_prob = instance_prob @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation tfm_gens = build_transform_gen(cfg, is_train) dataset_names = cfg.DATASETS.TRAIN meta = MetadataCatalog.get(dataset_names[0]) ret = { "is_train": is_train, "meta": meta, "tfm_gens": tfm_gens, "image_format": cfg.INPUT.FORMAT, "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX, "task_seq_len": cfg.INPUT.TASK_SEQ_LEN, "max_seq_len": cfg.INPUT.MAX_SEQ_LEN, "semantic_prob": cfg.INPUT.TASK_PROB.SEMANTIC, "instance_prob": cfg.INPUT.TASK_PROB.INSTANCE, } return ret def _get_semantic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): instances = Instances(image_shape) classes = [] texts = ["a semantic photo"] * self.num_queries masks = [] label = np.ones_like(pan_seg_gt) * self.ignore_label for segment_info in segments_info: class_id = segment_info["category_id"] if not segment_info["iscrowd"]: mask = pan_seg_gt == segment_info["id"] if not np.all(mask == False): if class_id not in classes: cls_name = self.class_names[class_id] classes.append(class_id) masks.append(mask) num_class_obj[cls_name] += 1 else: idx = classes.index(class_id) masks[idx] += mask masks[idx] = np.clip(masks[idx], 0, 1).astype(np.bool) label[mask] = class_id num = 0 for i, cls_name in enumerate(self.class_names): if num_class_obj[cls_name] > 0: for _ in range(num_class_obj[cls_name]): if num >= len(texts): break texts[num] = f"a photo with a {cls_name}" num += 1 classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) instances.gt_bboxes = torch.zeros((0, 4)) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor # Placeholder bounding boxes for stuff regions. Note that these are not used during training. instances.gt_bboxes = torch.stack([torch.tensor([0., 0., 1., 1.])] * instances.gt_masks.shape[0]) return instances, texts, label def _get_instance_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): instances = Instances(image_shape) classes = [] texts = ["an instance photo"] * self.num_queries masks = [] label = np.ones_like(pan_seg_gt) * self.ignore_label for segment_info in segments_info: class_id = segment_info["category_id"] if class_id in self.things: if not segment_info["iscrowd"]: mask = pan_seg_gt == segment_info["id"] if not np.all(mask == False): cls_name = self.class_names[class_id] classes.append(class_id) masks.append(mask) num_class_obj[cls_name] += 1 label[mask] = class_id num = 0 for i, cls_name in enumerate(self.class_names): if num_class_obj[cls_name] > 0: for _ in range(num_class_obj[cls_name]): if num >= len(texts): break texts[num] = f"a photo with a {cls_name}" num += 1 classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) instances.gt_bboxes = torch.zeros((0, 4)) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor instances.gt_bboxes = masks_to_boxes(instances.gt_masks) return instances, texts, label def _get_panoptic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): instances = Instances(image_shape) classes = [] texts = ["a panoptic photo"] * self.num_queries masks = [] label = np.ones_like(pan_seg_gt) * self.ignore_label for segment_info in segments_info: class_id = segment_info["category_id"] if not segment_info["iscrowd"]: mask = pan_seg_gt == segment_info["id"] if not np.all(mask == False): cls_name = self.class_names[class_id] classes.append(class_id) masks.append(mask) num_class_obj[cls_name] += 1 label[mask] = class_id num = 0 for i, cls_name in enumerate(self.class_names): if num_class_obj[cls_name] > 0: for _ in range(num_class_obj[cls_name]): if num >= len(texts): break texts[num] = f"a photo with a {cls_name}" num += 1 classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) instances.gt_bboxes = torch.zeros((0, 4)) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor instances.gt_bboxes = masks_to_boxes(instances.gt_masks) for i in range(instances.gt_classes.shape[0]): # Placeholder bounding boxes for stuff regions. Note that these are not used during training. if instances.gt_classes[i].item() not in self.things: instances.gt_bboxes[i] = torch.tensor([0., 0., 1., 1.]) return instances, texts, label def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) image, transforms = T.apply_transform_gens(self.tfm_gens, image) image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if not self.is_train: # USER: Modify this if you want to keep them for some reason. dataset_dict.pop("annotations", None) return dataset_dict # semantic segmentation if "sem_seg_file_name" in dataset_dict: # PyTorch transformation not implemented for uint16, so converting it to double first sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) else: sem_seg_gt = None if "pan_seg_file_name" in dataset_dict: pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") segments_info = dataset_dict["segments_info"] # apply the same transformation to panoptic segmentation pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) from panopticapi.utils import rgb2id pan_seg_gt = rgb2id(pan_seg_gt) prob_task = np.random.uniform(0,1.) num_class_obj = {} for name in self.class_names: num_class_obj[name] = 0 if prob_task < self.semantic_prob: task = "The task is semantic" instances, text, sem_seg = self._get_semantic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) elif prob_task < self.instance_prob: task = "The task is instance" instances, text, sem_seg = self._get_instance_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) else: task = "The task is panoptic" instances, text, sem_seg = self._get_panoptic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) dataset_dict["sem_seg"] = torch.from_numpy(sem_seg).long() dataset_dict["instances"] = instances dataset_dict["orig_shape"] = image_shape dataset_dict["task"] = task dataset_dict["text"] = text dataset_dict["thing_ids"] = self.things return dataset_dict ================================================ FILE: annotator/oneformer/oneformer/data/dataset_mappers/dataset_mapper.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/dataset_mapper.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import copy import logging import numpy as np from typing import List, Optional, Union import torch from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.data import detection_utils as utils from annotator.oneformer.detectron2.data import transforms as T from annotator.oneformer.oneformer.data.tokenizer import SimpleTokenizer, Tokenize __all__ = ["DatasetMapper"] class DatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by the model. This is the default callable to be used to map your dataset dict into training data. You may need to follow it to implement your own one for customized logic, such as a different way to read or transform images. See :doc:`/tutorials/data_loading` for details. The callable currently does the following: 1. Read the image from "file_name" 2. Applies cropping/geometric transforms to the image and annotations 3. Prepare data and annotations to Tensor and :class:`Instances` """ @configurable def __init__( self, is_train: bool, *, augmentations: List[Union[T.Augmentation, T.Transform]], image_format: str, task_seq_len: int, task: str = "panoptic", use_instance_mask: bool = False, use_keypoint: bool = False, instance_mask_format: str = "polygon", keypoint_hflip_indices: Optional[np.ndarray] = None, precomputed_proposal_topk: Optional[int] = None, recompute_boxes: bool = False, ): """ NOTE: this interface is experimental. Args: is_train: whether it's used in training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. use_instance_mask: whether to process instance segmentation annotations, if available use_keypoint: whether to process keypoint annotations if available instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation masks into this format. keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices` precomputed_proposal_topk: if given, will load pre-computed proposals from dataset_dict and keep the top k proposals for each image. recompute_boxes: whether to overwrite bounding box annotations by computing tight bounding boxes from instance mask annotations. """ if recompute_boxes: assert use_instance_mask, "recompute_boxes requires instance masks" # fmt: off self.is_train = is_train self.augmentations = T.AugmentationList(augmentations) self.image_format = image_format self.use_instance_mask = use_instance_mask self.instance_mask_format = instance_mask_format self.use_keypoint = use_keypoint self.keypoint_hflip_indices = keypoint_hflip_indices self.proposal_topk = precomputed_proposal_topk self.recompute_boxes = recompute_boxes self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len) self.task = task assert self.task in ["panoptic", "semantic", "instance"] # fmt: on logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") @classmethod def from_config(cls, cfg, is_train: bool = True): augs = utils.build_augmentation(cfg, is_train) if cfg.INPUT.CROP.ENABLED and is_train: augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) recompute_boxes = cfg.MODEL.MASK_ON else: recompute_boxes = False ret = { "is_train": is_train, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "use_instance_mask": cfg.MODEL.MASK_ON, "instance_mask_format": cfg.INPUT.MASK_FORMAT, "use_keypoint": cfg.MODEL.KEYPOINT_ON, "task_seq_len": cfg.INPUT.TASK_SEQ_LEN, "recompute_boxes": recompute_boxes, "task": cfg.MODEL.TEST.TASK, } if cfg.MODEL.KEYPOINT_ON: ret["keypoint_hflip_indices"] = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) if cfg.MODEL.LOAD_PROPOSALS: ret["precomputed_proposal_topk"] = ( cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN if is_train else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST ) return ret def _transform_annotations(self, dataset_dict, transforms, image_shape): # USER: Modify this if you want to keep them for some reason. for anno in dataset_dict["annotations"]: if not self.use_instance_mask: anno.pop("segmentation", None) if not self.use_keypoint: anno.pop("keypoints", None) # USER: Implement additional transformations if you have other types of data annos = [ utils.transform_instance_annotations( obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices ) for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 ] instances = utils.annotations_to_instances( annos, image_shape, mask_format=self.instance_mask_format ) # After transforms such as cropping are applied, the bounding box may no longer # tightly bound the object. As an example, imagine a triangle object # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to # the intersection of original bounding box and the cropping box. if self.recompute_boxes: instances.gt_boxes = instances.gt_masks.get_bounding_boxes() dataset_dict["instances"] = utils.filter_empty_instances(instances) def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below # USER: Write your own image loading if it's not from a file image = utils.read_image(dataset_dict["file_name"], format=self.image_format) utils.check_image_size(dataset_dict, image) task = f"The task is {self.task}" dataset_dict["task"] = task # USER: Remove if you don't do semantic/panoptic segmentation. if "sem_seg_file_name" in dataset_dict: sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2) else: sem_seg_gt = None aug_input = T.AugInput(image, sem_seg=sem_seg_gt) transforms = self.augmentations(aug_input) image, sem_seg_gt = aug_input.image, aug_input.sem_seg image_shape = image.shape[:2] # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if sem_seg_gt is not None: dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) # USER: Remove if you don't use pre-computed proposals. # Most users would not need this feature. if self.proposal_topk is not None: utils.transform_proposals( dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk ) if not self.is_train: # USER: Modify this if you want to keep them for some reason. dataset_dict.pop("annotations", None) dataset_dict.pop("sem_seg_file_name", None) return dataset_dict if "annotations" in dataset_dict: self._transform_annotations(dataset_dict, transforms, image_shape) return dataset_dict ================================================ FILE: annotator/oneformer/oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import copy import logging import os import numpy as np import torch from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.data import detection_utils as utils from annotator.oneformer.detectron2.data import transforms as T from annotator.oneformer.detectron2.structures import BitMasks, Instances from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.projects.point_rend import ColorAugSSDTransform from annotator.oneformer.oneformer.utils.box_ops import masks_to_boxes from annotator.oneformer.oneformer.data.tokenizer import SimpleTokenizer, Tokenize __all__ = ["OneFormerUnifiedDatasetMapper"] class OneFormerUnifiedDatasetMapper: """ A callable which takes a dataset dict in Detectron2 Dataset format, and map it into a format used by OneFormer for universal segmentation. The callable currently does the following: 1. Read the image from "file_name" 2. Applies geometric transforms to the image and annotation 3. Find and applies suitable cropping to the image and annotation 4. Prepare image and annotation to Tensors """ @configurable def __init__( self, is_train=True, *, name, num_queries, meta, augmentations, image_format, ignore_label, size_divisibility, task_seq_len, max_seq_len, semantic_prob, instance_prob, ): """ NOTE: this interface is experimental. Args: is_train: for training or inference augmentations: a list of augmentations or deterministic transforms to apply image_format: an image format supported by :func:`detection_utils.read_image`. ignore_label: the label that is ignored to evaluation size_divisibility: pad image size to be divisible by this value """ self.is_train = is_train self.meta = meta self.name = name self.tfm_gens = augmentations self.img_format = image_format self.ignore_label = ignore_label self.size_divisibility = size_divisibility self.num_queries = num_queries logger = logging.getLogger(__name__) mode = "training" if is_train else "inference" logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") self.things = [] for k,v in self.meta.thing_dataset_id_to_contiguous_id.items(): self.things.append(v) self.class_names = self.meta.stuff_classes self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len) self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len) self.semantic_prob = semantic_prob self.instance_prob = instance_prob @classmethod def from_config(cls, cfg, is_train=True): # Build augmentation augs = [ T.ResizeShortestEdge( cfg.INPUT.MIN_SIZE_TRAIN, cfg.INPUT.MAX_SIZE_TRAIN, cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, ) ] if cfg.INPUT.CROP.ENABLED: augs.append( T.RandomCrop_CategoryAreaConstraint( cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE, cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, ) ) if cfg.INPUT.COLOR_AUG_SSD: augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) augs.append(T.RandomFlip()) # Assume always applies to the training set. dataset_names = cfg.DATASETS.TRAIN meta = MetadataCatalog.get(dataset_names[0]) ignore_label = meta.ignore_label ret = { "is_train": is_train, "meta": meta, "name": dataset_names[0], "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX, "task_seq_len": cfg.INPUT.TASK_SEQ_LEN, "max_seq_len": cfg.INPUT.MAX_SEQ_LEN, "augmentations": augs, "image_format": cfg.INPUT.FORMAT, "ignore_label": ignore_label, "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, "semantic_prob": cfg.INPUT.TASK_PROB.SEMANTIC, "instance_prob": cfg.INPUT.TASK_PROB.INSTANCE, } return ret def _get_semantic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): pan_seg_gt = pan_seg_gt.numpy() instances = Instances(image_shape) classes = [] texts = ["a semantic photo"] * self.num_queries masks = [] label = np.ones_like(pan_seg_gt) * self.ignore_label for segment_info in segments_info: class_id = segment_info["category_id"] if not segment_info["iscrowd"]: mask = pan_seg_gt == segment_info["id"] if not np.all(mask == False): if class_id not in classes: cls_name = self.class_names[class_id] classes.append(class_id) masks.append(mask) num_class_obj[cls_name] += 1 else: idx = classes.index(class_id) masks[idx] += mask masks[idx] = np.clip(masks[idx], 0, 1).astype(np.bool) label[mask] = class_id num = 0 for i, cls_name in enumerate(self.class_names): if num_class_obj[cls_name] > 0: for _ in range(num_class_obj[cls_name]): if num >= len(texts): break texts[num] = f"a photo with a {cls_name}" num += 1 classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) instances.gt_bboxes = torch.zeros((0, 4)) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor # Placeholder bounding boxes for stuff regions. Note that these are not used during training. instances.gt_bboxes = torch.stack([torch.tensor([0., 0., 1., 1.])] * instances.gt_masks.shape[0]) return instances, texts, label def _get_instance_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): pan_seg_gt = pan_seg_gt.numpy() instances = Instances(image_shape) classes = [] texts = ["an instance photo"] * self.num_queries masks = [] label = np.ones_like(pan_seg_gt) * self.ignore_label for segment_info in segments_info: class_id = segment_info["category_id"] if class_id in self.things: if not segment_info["iscrowd"]: mask = pan_seg_gt == segment_info["id"] if not np.all(mask == False): cls_name = self.class_names[class_id] classes.append(class_id) masks.append(mask) num_class_obj[cls_name] += 1 label[mask] = class_id num = 0 for i, cls_name in enumerate(self.class_names): if num_class_obj[cls_name] > 0: for _ in range(num_class_obj[cls_name]): if num >= len(texts): break texts[num] = f"a photo with a {cls_name}" num += 1 classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) instances.gt_bboxes = torch.zeros((0, 4)) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor instances.gt_bboxes = masks_to_boxes(instances.gt_masks) return instances, texts, label def _get_panoptic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): pan_seg_gt = pan_seg_gt.numpy() instances = Instances(image_shape) classes = [] texts = ["a panoptic photo"] * self.num_queries masks = [] label = np.ones_like(pan_seg_gt) * self.ignore_label for segment_info in segments_info: class_id = segment_info["category_id"] if not segment_info["iscrowd"]: mask = pan_seg_gt == segment_info["id"] if not np.all(mask == False): cls_name = self.class_names[class_id] classes.append(class_id) masks.append(mask) num_class_obj[cls_name] += 1 label[mask] = class_id num = 0 for i, cls_name in enumerate(self.class_names): if num_class_obj[cls_name] > 0: for _ in range(num_class_obj[cls_name]): if num >= len(texts): break texts[num] = f"a photo with a {cls_name}" num += 1 classes = np.array(classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) instances.gt_bboxes = torch.zeros((0, 4)) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor instances.gt_bboxes = masks_to_boxes(instances.gt_masks) for i in range(instances.gt_classes.shape[0]): # Placeholder bounding boxes for stuff regions. Note that these are not used during training. if instances.gt_classes[i].item() not in self.things: instances.gt_bboxes[i] = torch.tensor([0., 0., 1., 1.]) return instances, texts, label def __call__(self, dataset_dict): """ Args: dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. Returns: dict: a format that builtin models in detectron2 accept """ assert self.is_train, "OneFormerUnifiedDatasetMapper should only be used for training!" dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below image = utils.read_image(dataset_dict["file_name"], format=self.img_format) utils.check_image_size(dataset_dict, image) # semantic segmentation if "sem_seg_file_name" in dataset_dict: # PyTorch transformation not implemented for uint16, so converting it to double first sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") else: sem_seg_gt = None # panoptic segmentation if "pan_seg_file_name" in dataset_dict: pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") segments_info = dataset_dict["segments_info"] else: pan_seg_gt = None segments_info = None if pan_seg_gt is None: raise ValueError( "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( dataset_dict["file_name"] ) ) aug_input = T.AugInput(image, sem_seg=sem_seg_gt) aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) image = aug_input.image if sem_seg_gt is not None: sem_seg_gt = aug_input.sem_seg # apply the same transformation to panoptic segmentation pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) from panopticapi.utils import rgb2id pan_seg_gt = rgb2id(pan_seg_gt) # Pad image and segmentation label here! image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if sem_seg_gt is not None: sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) if self.size_divisibility > 0: image_size = (image.shape[-2], image.shape[-1]) padding_size = [ 0, self.size_divisibility - image_size[1], 0, self.size_divisibility - image_size[0], ] image = F.pad(image, padding_size, value=128).contiguous() if sem_seg_gt is not None: sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() pan_seg_gt = F.pad( pan_seg_gt, padding_size, value=0 ).contiguous() # 0 is the VOID panoptic label image_shape = (image.shape[-2], image.shape[-1]) # h, w # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, # but not efficient on large generic data structures due to the use of pickle & mp.Queue. # Therefore it's important to use torch.Tensor. dataset_dict["image"] = image if "annotations" in dataset_dict: raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") prob_task = np.random.uniform(0,1.) num_class_obj = {} for name in self.class_names: num_class_obj[name] = 0 if prob_task < self.semantic_prob: task = "The task is semantic" instances, text, sem_seg = self._get_semantic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) elif prob_task < self.instance_prob: task = "The task is instance" instances, text, sem_seg = self._get_instance_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) else: task = "The task is panoptic" instances, text, sem_seg = self._get_panoptic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) dataset_dict["sem_seg"] = torch.from_numpy(sem_seg).long() dataset_dict["instances"] = instances dataset_dict["orig_shape"] = image_shape dataset_dict["task"] = task dataset_dict["text"] = text dataset_dict["thing_ids"] = self.things return dataset_dict ================================================ FILE: annotator/oneformer/oneformer/data/datasets/__init__.py ================================================ from . import ( register_ade20k_panoptic, register_cityscapes_panoptic, register_coco_panoptic_annos_semseg, register_ade20k_instance, register_coco_panoptic2instance, ) ================================================ FILE: annotator/oneformer/oneformer/data/datasets/register_ade20k_instance.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py # ------------------------------------------------------------------------------ import json import logging import numpy as np import os from PIL import Image from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.data.datasets.coco import load_coco_json, register_coco_instances from annotator.oneformer.detectron2.utils.file_io import PathManager ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] _PREDEFINED_SPLITS = { # point annotations without masks "ade20k_instance_train": ( "ADEChallengeData2016/images/training", "ADEChallengeData2016/ade20k_instance_train.json", ), "ade20k_instance_val": ( "ADEChallengeData2016/images/validation", "ADEChallengeData2016/ade20k_instance_val.json", ), } def _get_ade_instances_meta(): thing_ids = [k["id"] for k in ADE_CATEGORIES] assert len(thing_ids) == 100, len(thing_ids) # Mapping from the incontiguous ADE category id to an id in [0, 99] thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} thing_classes = [k["name"] for k in ADE_CATEGORIES] ret = { "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes, } return ret def register_all_ade20k_instance(root): for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): # Assume pre-defined datasets live in `./datasets`. register_coco_instances( key, _get_ade_instances_meta(), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ade20k_instance(_root) ================================================ FILE: annotator/oneformer/oneformer/data/datasets/register_ade20k_panoptic.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_panoptic.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import json import os from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.utils.file_io import PathManager ADE20K_150_CATEGORIES = [ {"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"}, {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"}, {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"}, {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"}, {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"}, {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"}, {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"}, {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"}, {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "}, {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"}, {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"}, {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"}, {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"}, {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"}, {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"}, {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"}, {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"}, {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"}, {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"}, {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"}, {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"}, {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"}, {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"}, {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"}, {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"}, {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"}, {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"}, {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"}, {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"}, {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"}, {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"}, {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"}, {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"}, {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"}, {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"}, {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"}, {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"}, {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"}, {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"}, {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"}, {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"}, {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"}, {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"}, {"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"}, { "color": [6, 51, 255], "id": 44, "isthing": 1, "name": "chest of drawers, chest, bureau, dresser", }, {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"}, {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"}, {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"}, {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"}, {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"}, {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"}, {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"}, {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"}, {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"}, {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"}, {"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"}, { "color": [255, 71, 0], "id": 56, "isthing": 1, "name": "pool table, billiard table, snooker table", }, {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"}, {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"}, {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"}, {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"}, {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"}, {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"}, {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"}, {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"}, { "color": [0, 255, 133], "id": 65, "isthing": 1, "name": "toilet, can, commode, crapper, pot, potty, stool, throne", }, {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"}, {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"}, {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"}, {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"}, {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"}, {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"}, {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"}, {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"}, {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"}, {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"}, {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"}, {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"}, {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"}, {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"}, {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"}, {"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"}, {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"}, {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"}, {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"}, {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"}, {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"}, {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"}, {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"}, {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"}, {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"}, {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"}, {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"}, {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"}, {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"}, { "color": [0, 122, 255], "id": 95, "isthing": 1, "name": "bannister, banister, balustrade, balusters, handrail", }, { "color": [0, 255, 163], "id": 96, "isthing": 0, "name": "escalator, moving staircase, moving stairway", }, { "color": [255, 153, 0], "id": 97, "isthing": 1, "name": "ottoman, pouf, pouffe, puff, hassock", }, {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"}, {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"}, { "color": [143, 255, 0], "id": 100, "isthing": 0, "name": "poster, posting, placard, notice, bill, card", }, {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"}, {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"}, {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"}, {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"}, { "color": [133, 0, 255], "id": 105, "isthing": 0, "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", }, {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"}, { "color": [184, 0, 255], "id": 107, "isthing": 1, "name": "washer, automatic washer, washing machine", }, {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"}, {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"}, {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"}, {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"}, {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"}, {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"}, {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"}, {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"}, {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"}, {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"}, {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"}, {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"}, {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"}, {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"}, {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"}, {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"}, {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"}, {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"}, {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"}, {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"}, {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"}, {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"}, {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"}, {"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"}, {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"}, {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"}, {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"}, {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"}, {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"}, {"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"}, {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"}, {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"}, {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"}, {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"}, {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"}, {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"}, {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"}, {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"}, {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"}, {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"}, {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"}, {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"}, ] ADE20k_COLORS = [k["color"] for k in ADE20K_150_CATEGORIES] MetadataCatalog.get("ade20k_sem_seg_train").set( stuff_colors=ADE20k_COLORS[:], ) MetadataCatalog.get("ade20k_sem_seg_val").set( stuff_colors=ADE20k_COLORS[:], ) def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) ret = [] for ann in json_info["annotations"]: image_id = ann["image_id"] # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) sem_label_file = os.path.join(semseg_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "sem_seg_file_name": sem_label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] return ret def register_ade20k_panoptic( name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None, ): """ Register a "standard" version of ADE20k panoptic segmentation dataset named `name`. The dictionaries in this registered dataset follows detectron2's standard format. Hence it's called "standard". Args: name (str): the name that identifies a dataset, e.g. "ade20k_panoptic_train" metadata (dict): extra metadata associated with this dataset. image_root (str): directory which contains all the images panoptic_root (str): directory which contains panoptic annotation images in COCO format panoptic_json (str): path to the json panoptic annotation file in COCO format sem_seg_root (none): not used, to be consistent with `register_coco_panoptic_separated`. instances_json (str): path to the json instance annotation file """ panoptic_name = name DatasetCatalog.register( panoptic_name, lambda: load_ade20k_panoptic_json( panoptic_json, image_root, panoptic_root, semantic_root, metadata ), ) MetadataCatalog.get(panoptic_name).set( panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="ade20k_panoptic_seg", ignore_label=255, label_divisor=1000, **metadata, ) _PREDEFINED_SPLITS_ADE20K_PANOPTIC = { "ade20k_panoptic_train": ( "ADEChallengeData2016/images/training", "ADEChallengeData2016/ade20k_panoptic_train", "ADEChallengeData2016/ade20k_panoptic_train.json", "ADEChallengeData2016/annotations_detectron2/training", "ADEChallengeData2016/ade20k_instance_train.json", ), "ade20k_panoptic_val": ( "ADEChallengeData2016/images/validation", "ADEChallengeData2016/ade20k_panoptic_val", "ADEChallengeData2016/ade20k_panoptic_val.json", "ADEChallengeData2016/annotations_detectron2/validation", "ADEChallengeData2016/ade20k_instance_val.json", ), } def get_metadata(): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] thing_colors = [k["color"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] stuff_classes = [k["name"] for k in ADE20K_150_CATEGORIES] stuff_colors = [k["color"] for k in ADE20K_150_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(ADE20K_150_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i # else: # stuff_dataset_id_to_contiguous_id[cat["id"]] = i # in order to use sem_seg evaluator stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta def register_all_ade20k_panoptic(root): metadata = get_metadata() for ( prefix, (image_root, panoptic_root, panoptic_json, semantic_root, instance_json), ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): # The "standard" version of COCO panoptic segmentation dataset, # e.g. used by Panoptic-DeepLab register_ade20k_panoptic( prefix, metadata, os.path.join(root, image_root), os.path.join(root, panoptic_root), os.path.join(root, semantic_root), os.path.join(root, panoptic_json), os.path.join(root, instance_json), ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_ade20k_panoptic(_root) ================================================ FILE: annotator/oneformer/oneformer/data/datasets/register_cityscapes_panoptic.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/cityscapes_panoptic.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import json import logging import os from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES from annotator.oneformer.detectron2.utils.file_io import PathManager """ This file contains functions to register the Cityscapes panoptic dataset to the DatasetCatalog. """ logger = logging.getLogger(__name__) def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info): files = [] # scan through the directory cities = PathManager.ls(image_dir) logger.info(f"{len(cities)} cities found in '{image_dir}'.") image_dict = {} for city in cities: city_img_dir = os.path.join(image_dir, city) for basename in PathManager.ls(city_img_dir): image_file = os.path.join(city_img_dir, basename) suffix = "_leftImg8bit.png" assert basename.endswith(suffix), basename basename = os.path.basename(basename)[: -len(suffix)] image_dict[basename] = image_file for ann in json_info["annotations"]: image_file = image_dict.get(ann["image_id"], None) assert image_file is not None, "No image {} found for annotation {}".format( ann["image_id"], ann["file_name"] ) label_file = os.path.join(gt_dir, ann["file_name"]) segments_info = ann["segments_info"] files.append((image_file, label_file, segments_info)) assert len(files), "No images found in {}".format(image_dir) assert PathManager.isfile(files[0][0]), files[0][0] assert PathManager.isfile(files[0][1]), files[0][1] return files def load_cityscapes_panoptic(image_dir, gt_dir, gt_json, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train". gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/cityscapes_panoptic_train". gt_json (str): path to the json file. e.g., "~/cityscapes/gtFine/cityscapes_panoptic_train.json". meta (dict): dictionary containing "thing_dataset_id_to_contiguous_id" and "stuff_dataset_id_to_contiguous_id" to map category ids to contiguous ids for training. Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] return segment_info assert os.path.exists( gt_json ), "Please run `python cityscapesscripts/preparation/createPanopticImgs.py` to generate label files." # noqa with open(gt_json) as f: json_info = json.load(f) files = get_cityscapes_panoptic_files(image_dir, gt_dir, json_info) ret = [] for image_file, label_file, segments_info in files: sem_label_file = ( image_file.replace("leftImg8bit", "gtFine").split(".")[0] + "_labelTrainIds.png" ) segments_info = [_convert_category_id(x, meta) for x in segments_info] ret.append( { "file_name": image_file, "image_id": "_".join( os.path.splitext(os.path.basename(image_file))[0].split("_")[:3] ), "sem_seg_file_name": sem_label_file, "pan_seg_file_name": label_file, "segments_info": segments_info, } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile( ret[0]["sem_seg_file_name"] ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa assert PathManager.isfile( ret[0]["pan_seg_file_name"] ), "Please generate panoptic annotation with python cityscapesscripts/preparation/createPanopticImgs.py" # noqa return ret _RAW_CITYSCAPES_PANOPTIC_SPLITS = { "cityscapes_fine_panoptic_train": ( "cityscapes/leftImg8bit/train", "cityscapes/gtFine/cityscapes_panoptic_train", "cityscapes/gtFine/cityscapes_panoptic_train.json", ), "cityscapes_fine_panoptic_val": ( "cityscapes/leftImg8bit/val", "cityscapes/gtFine/cityscapes_panoptic_val", "cityscapes/gtFine/cityscapes_panoptic_val.json", ), # "cityscapes_fine_panoptic_test": not supported yet } def register_all_cityscapes_panoptic(root): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES] thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES] stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES] stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # There are three types of ids in cityscapes panoptic segmentation: # (1) category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the classifier # (2) instance id: this id is used to differentiate different instances from # the same category. For "stuff" classes, the instance id is always 0; for # "thing" classes, the instance id starts from 1 and 0 is reserved for # ignored instances (e.g. crowd annotation). # (3) panoptic id: this is the compact id that encode both category and # instance id by: category_id * 1000 + instance_id. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for k in CITYSCAPES_CATEGORIES: if k["isthing"] == 1: thing_dataset_id_to_contiguous_id[k["id"]] = k["trainId"] else: stuff_dataset_id_to_contiguous_id[k["id"]] = k["trainId"] meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id for key, (image_dir, gt_dir, gt_json) in _RAW_CITYSCAPES_PANOPTIC_SPLITS.items(): image_dir = os.path.join(root, image_dir) gt_dir = os.path.join(root, gt_dir) gt_json = os.path.join(root, gt_json) if key in DatasetCatalog.list(): DatasetCatalog.remove(key) DatasetCatalog.register( key, lambda x=image_dir, y=gt_dir, z=gt_json: load_cityscapes_panoptic(x, y, z, meta) ) MetadataCatalog.get(key).set( panoptic_root=gt_dir, image_root=image_dir, panoptic_json=gt_json, gt_dir=gt_dir.replace("cityscapes_panoptic_", ""), evaluator_type="cityscapes_panoptic_seg", ignore_label=255, label_divisor=1000, **meta, ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_cityscapes_panoptic(_root) ================================================ FILE: annotator/oneformer/oneformer/data/datasets/register_coco_panoptic2instance.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/builtin.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ """ This file registers pre-defined datasets at hard-coded paths, and their metadata. We hard-code metadata for common datasets. This will enable: 1. Consistency check when loading the datasets 2. Use models on these standard datasets directly and run demos, without having to download the dataset annotations We hard-code some paths to the dataset that's assumed to exist in "./datasets/". Users SHOULD NOT use this file to create new dataset / metadata for new dataset. To add new dataset, refer to the tutorial "docs/DATASETS.md". """ import os from annotator.oneformer.detectron2.data.datasets.builtin_meta import _get_builtin_metadata from annotator.oneformer.detectron2.data.datasets.coco import register_coco_instances _PREDEFINED_SPLITS_COCO = { "coco_2017_val_panoptic2instance": ("coco/val2017", "coco/annotations/panoptic2instances_val2017.json"), } def register_panoptic2instances_coco(root): for key, (image_root, json_file) in _PREDEFINED_SPLITS_COCO.items(): # Assume pre-defined datasets live in `./datasets`. register_coco_instances( key, _get_builtin_metadata("coco"), os.path.join(root, json_file) if "://" not in json_file else json_file, os.path.join(root, image_root), ) _root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets")) register_panoptic2instances_coco(_root) ================================================ FILE: annotator/oneformer/oneformer/data/datasets/register_coco_panoptic_annos_semseg.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import json import os from annotator.oneformer.detectron2.data import DatasetCatalog, MetadataCatalog from annotator.oneformer.detectron2.data.datasets import load_sem_seg from annotator.oneformer.detectron2.data.datasets.builtin_meta import COCO_CATEGORIES from annotator.oneformer.detectron2.utils.file_io import PathManager import contextlib import logging import io from fvcore.common.timer import Timer import annotator.oneformer.pycocotools.mask as mask_util from annotator.oneformer.detectron2.structures import BoxMode logger = logging.getLogger(__name__) _PREDEFINED_SPLITS_COCO_PANOPTIC = { "coco_2017_train_panoptic": ( # This is the original panoptic annotation directory "coco/panoptic_train2017", "coco/annotations/panoptic_train2017.json", # This directory contains semantic annotations that are # converted from panoptic annotations. # It is used by PanopticFPN. # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py # to create these directories. "coco/panoptic_semseg_train2017", ), "coco_2017_val_panoptic": ( "coco/panoptic_val2017", "coco/annotations/panoptic_val2017.json", "coco/panoptic_semseg_val2017", ), } def load_coco_instance_json(json_file, image_root, dataset_name=None): from annotator.oneformer.pycocotools.coco import COCO timer = Timer() json_file = PathManager.get_local_path(json_file) with contextlib.redirect_stdout(io.StringIO()): coco_api = COCO(json_file) if timer.seconds() > 1: logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) id_map = None if dataset_name is not None: meta = MetadataCatalog.get(dataset_name) cat_ids = sorted(coco_api.getCatIds()) cats = coco_api.loadCats(cat_ids) # The categories in a custom json file may not be sorted. thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] meta.thing_classes = thing_classes # In COCO, certain category ids are artificially removed, # and by convention they are always ignored. # We deal with COCO's id issue and translate # the category ids to contiguous ids in [0, 80). # It works by looking at the "categories" field in the json, therefore # if users' own json also have incontiguous ids, we'll # apply this mapping as well but print a warning. if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): if "coco" not in dataset_name: logger.warning( """ Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. """ ) id_map = {v: i for i, v in enumerate(cat_ids)} meta.thing_dataset_id_to_contiguous_id = id_map # sort indices for reproducible results img_ids = sorted(coco_api.imgs.keys()) # imgs is a list of dicts, each looks something like: # {'license': 4, # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', # 'file_name': 'COCO_val2014_000000001268.jpg', # 'height': 427, # 'width': 640, # 'date_captured': '2013-11-17 05:57:24', # 'id': 1268} imgs = coco_api.loadImgs(img_ids) # anns is a list[list[dict]], where each dict is an annotation # record for an object. The inner list enumerates the objects in an image # and the outer list enumerates over images. Example of anns[0]: # [{'segmentation': [[192.81, # 247.09, # ... # 219.03, # 249.06]], # 'area': 1035.749, # 'iscrowd': 0, # 'image_id': 1268, # 'bbox': [192.81, 224.8, 74.73, 33.43], # 'category_id': 16, # 'id': 42986}, # ...] anns = [coco_api.imgToAnns[img_id] for img_id in img_ids] total_num_valid_anns = sum([len(x) for x in anns]) total_num_anns = len(coco_api.anns) if total_num_valid_anns < total_num_anns: logger.warning( f"{json_file} contains {total_num_anns} annotations, but only " f"{total_num_valid_anns} of them match to images in the file." ) if "minival" not in json_file: # The popular valminusminival & minival annotations for COCO2014 contain this bug. # However the ratio of buggy annotations there is tiny and does not affect accuracy. # Therefore we explicitly white-list them. ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format( json_file ) imgs_anns = list(zip(imgs, anns)) logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file)) dataset_dicts = {} ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] num_instances_without_valid_segmentation = 0 for (img_dict, anno_dict_list) in imgs_anns: record = {} record["file_name"] = os.path.join(image_root, img_dict["file_name"]) record["height"] = img_dict["height"] record["width"] = img_dict["width"] image_id = record["image_id"] = img_dict["id"] objs = [] for anno in anno_dict_list: # Check that the image_id in this annotation is the same as # the image_id we're looking at. # This fails only when the data parsing logic or the annotation file is buggy. # The original COCO valminusminival2014 & minival2014 annotation files # actually contains bugs that, together with certain ways of using COCO API, # can trigger this assertion. assert anno["image_id"] == image_id assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.' obj = {key: anno[key] for key in ann_keys if key in anno} if "bbox" in obj and len(obj["bbox"]) == 0: raise ValueError( f"One annotation of image {image_id} contains empty 'bbox' value! " "This json does not have valid COCO format." ) segm = anno.get("segmentation", None) if segm: # either list[list[float]] or dict(RLE) if isinstance(segm, dict): if isinstance(segm["counts"], list): # convert to compressed RLE segm = mask_util.frPyObjects(segm, *segm["size"]) else: # filter out invalid polygons (< 3 points) segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] if len(segm) == 0: num_instances_without_valid_segmentation += 1 continue # ignore this instance obj["segmentation"] = segm keypts = anno.get("keypoints", None) if keypts: # list[int] for idx, v in enumerate(keypts): if idx % 3 != 2: # COCO's segmentation coordinates are floating points in [0, H or W], # but keypoint coordinates are integers in [0, H-1 or W-1] # Therefore we assume the coordinates are "pixel indices" and # add 0.5 to convert to floating point coordinates. keypts[idx] = v + 0.5 obj["keypoints"] = keypts obj["bbox_mode"] = BoxMode.XYWH_ABS if id_map: annotation_category_id = obj["category_id"] try: obj["category_id"] = id_map[annotation_category_id] except KeyError as e: raise KeyError( f"Encountered category_id={annotation_category_id} " "but this id does not exist in 'categories' of the json file." ) from e objs.append(obj) record["annotations"] = objs dataset_dicts[image_id] = record if num_instances_without_valid_segmentation > 0: logger.warning( "Filtered out {} instances without valid segmentation. ".format( num_instances_without_valid_segmentation ) + "There might be issues in your dataset generation process. Please " "check https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html carefully" ) return dataset_dicts def get_metadata(): meta = {} # The following metadata maps contiguous id from [0, #thing categories + # #stuff categories) to their names and colors. We have to replica of the # same name and color under "thing_*" and "stuff_*" because the current # visualization function in D2 handles thing and class classes differently # due to some heuristic used in Panoptic FPN. We keep the same naming to # enable reusing existing visualization functions. thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] stuff_classes = [k["name"] for k in COCO_CATEGORIES] stuff_colors = [k["color"] for k in COCO_CATEGORIES] meta["thing_classes"] = thing_classes meta["thing_colors"] = thing_colors meta["stuff_classes"] = stuff_classes meta["stuff_colors"] = stuff_colors # Convert category id for training: # category id: like semantic segmentation, it is the class id for each # pixel. Since there are some classes not used in evaluation, the category # id is not always contiguous and thus we have two set of category ids: # - original category id: category id in the original dataset, mainly # used for evaluation. # - contiguous category id: [0, #classes), in order to train the linear # softmax classifier. thing_dataset_id_to_contiguous_id = {} stuff_dataset_id_to_contiguous_id = {} for i, cat in enumerate(COCO_CATEGORIES): if cat["isthing"]: thing_dataset_id_to_contiguous_id[cat["id"]] = i # else: # stuff_dataset_id_to_contiguous_id[cat["id"]] = i # in order to use sem_seg evaluator stuff_dataset_id_to_contiguous_id[cat["id"]] = i meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id return meta def load_coco_panoptic_json(json_file, instances_json, instances_name, image_dir, gt_dir, semseg_dir, meta): """ Args: image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". Returns: list[dict]: a list of dicts in Detectron2 standard format. (See `Using Custom Datasets `_ ) """ def _convert_category_id(segment_info, meta): if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = True else: segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ segment_info["category_id"] ] segment_info["isthing"] = False return segment_info with PathManager.open(json_file) as f: json_info = json.load(f) instance_data_dicts = load_coco_instance_json(instances_json, image_dir.replace("panoptic_", ""), instances_name) ret = [] for ann in json_info["annotations"]: image_id = int(ann["image_id"]) # TODO: currently we assume image and label has the same filename but # different extension, and images have extension ".jpg" for COCO. Need # to make image extension a user-provided argument if we extend this # function to support other COCO-like datasets. image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") label_file = os.path.join(gt_dir, ann["file_name"]) sem_label_file = os.path.join(semseg_dir, ann["file_name"]) segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] ret.append( { "file_name": image_file, "image_id": image_id, "pan_seg_file_name": label_file, "sem_seg_file_name": sem_label_file, "segments_info": segments_info, "annotations": instance_data_dicts[image_id]["annotations"], } ) assert len(ret), f"No images found in {image_dir}!" assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] return ret def register_coco_panoptic_annos_sem_seg( name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json, instances_name, ): panoptic_name = name delattr(MetadataCatalog.get(panoptic_name), "thing_classes") delattr(MetadataCatalog.get(panoptic_name), "thing_colors") MetadataCatalog.get(panoptic_name).set( thing_classes=metadata["thing_classes"], thing_colors=metadata["thing_colors"], # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], ) # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" semantic_name = name + "_with_sem_seg" DatasetCatalog.register( semantic_name, lambda: load_coco_panoptic_json(panoptic_json, instances_json, instances_name, image_root, panoptic_root, sem_seg_root, metadata), ) MetadataCatalog.get(semantic_name).set( sem_seg_root=sem_seg_root, panoptic_root=panoptic_root, image_root=image_root, panoptic_json=panoptic_json, json_file=instances_json, evaluator_type="coco_panoptic_seg", ignore_label=255, label_divisor=1000, **metadata, ) def register_all_coco_panoptic_annos_sem_seg(root): for ( prefix, (panoptic_root, panoptic_json, semantic_root), ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): prefix_instances = prefix[: -len("_panoptic")] instances_meta = MetadataCatalog.get(prefix_instances) image_root, instances_json = instances_meta.image_root, instances_meta.json_file if 'val' in instances_json: instances_json = instances_json.replace('instances_', 'panoptic2instances_') register_coco_panoptic_annos_sem_seg( prefix, get_metadata(), image_root, os.path.join(root, panoptic_root), os.path.join(root, panoptic_json), os.path.join(root, semantic_root), instances_json, prefix_instances, ) _root = os.getenv("DETECTRON2_DATASETS", "datasets") register_all_coco_panoptic_annos_sem_seg(_root) ================================================ FILE: annotator/oneformer/oneformer/data/tokenizer.py ================================================ # ------------------------------------------------------------------------- # MIT License # # Copyright (c) 2021 OpenAI # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # # Modified by Jiarui Xu # ------------------------------------------------------------------------- import gzip import html import os from functools import lru_cache import ftfy import regex as re import torch @lru_cache() def default_bpe(): return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bpe_simple_vocab_16e6.txt.gz') @lru_cache() def bytes_to_unicode(): """Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ bs = list(range(ord('!'), ord('~') + 1)) + list(range(ord('¡'), ord('¬') + 1)) + list(range(ord('®'), ord('ÿ') + 1)) cs = bs[:] n = 0 for b in range(2**8): if b not in bs: bs.append(b) cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) def get_pairs(word): """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings). """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs def basic_clean(text): text = ftfy.fix_text(text) text = html.unescape(html.unescape(text)) return text.strip() def whitespace_clean(text): text = re.sub(r'\s+', ' ', text) text = text.strip() return text class Tokenize: def __init__(self, tokenizer, max_seq_len=77, truncate=True): self.tokenizer = tokenizer self.max_seq_len = max_seq_len self.truncate = truncate def __call__(self, texts): expanded_dim = False if isinstance(texts, str): texts = [texts] expanded_dim = True sot_token = self.tokenizer.encoder['<|startoftext|>'] eot_token = self.tokenizer.encoder['<|endoftext|>'] all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] for text in texts] result = torch.zeros(len(all_tokens), self.max_seq_len, dtype=torch.long) for i, tokens in enumerate(all_tokens): if len(tokens) > self.max_seq_len: if self.truncate: tokens = tokens[:self.max_seq_len] tokens[-1] = eot_token else: raise RuntimeError(f'Input {texts[i]} is too long for context length {self.max_seq_len}') result[i, :len(tokens)] = torch.tensor(tokens) if expanded_dim: return result[0] return result class SimpleTokenizer(object): def __init__(self, bpe_path: str = default_bpe()): self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') merges = merges[1:49152 - 256 - 2 + 1] merges = [tuple(merge.split()) for merge in merges] vocab = list(bytes_to_unicode().values()) vocab = vocab + [v + '' for v in vocab] for merge in merges: vocab.append(''.join(merge)) vocab.extend(['<|startoftext|>', '<|endoftext|>']) self.encoder = dict(zip(vocab, range(len(vocab)))) self.decoder = {v: k for k, v in self.encoder.items()} self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} self.pat = re.compile( r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) def bpe(self, token): if token in self.cache: return self.cache[token] word = tuple(token[:-1]) + (token[-1] + '', ) pairs = get_pairs(word) if not pairs: return token + '' while True: bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) if bigram not in self.bpe_ranks: break first, second = bigram new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) new_word.extend(word[i:j]) i = j except: # noqa: E722 new_word.extend(word[i:]) break if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: new_word.append(word[i]) i += 1 new_word = tuple(new_word) word = new_word if len(word) == 1: break else: pairs = get_pairs(word) word = ' '.join(word) self.cache[token] = word return word def encode(self, text): bpe_tokens = [] text = whitespace_clean(basic_clean(text)).lower() for token in re.findall(self.pat, text): token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) return bpe_tokens def decode(self, tokens): text = ''.join([self.decoder[token] for token in tokens]) text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors='replace').replace('', ' ') return text ================================================ FILE: annotator/oneformer/oneformer/demo/colormap.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. """ An awesome colormap for really neat visualizations. Copied from Detectron, and removed gray colors. """ import numpy as np import random random.seed(0) __all__ = ["colormap", "random_color", "random_colors"] # fmt: off # RGB: # _COLORS = np.array( # [ # 0.000, 0.447, 0.741, # 0.850, 0.325, 0.098, # 0.929, 0.694, 0.125, # 0.494, 0.184, 0.556, # 0.466, 0.674, 0.188, # 0.301, 0.745, 0.933, # 0.635, 0.078, 0.184, # 0.300, 0.300, 0.300, # 0.600, 0.600, 0.600, # 1.000, 0.000, 0.000, # 1.000, 0.500, 0.000, # 0.749, 0.749, 0.000, # 0.000, 1.000, 0.000, # 0.000, 0.000, 1.000, # 0.667, 0.000, 1.000, # 0.333, 0.333, 0.000, # 0.333, 0.667, 0.000, # 0.333, 1.000, 0.000, # 0.667, 0.333, 0.000, # 0.667, 0.667, 0.000, # 0.667, 1.000, 0.000, # 1.000, 0.333, 0.000, # 1.000, 0.667, 0.000, # 1.000, 1.000, 0.000, # 0.000, 0.333, 0.500, # 0.000, 0.667, 0.500, # 0.000, 1.000, 0.500, # 0.333, 0.000, 0.500, # 0.333, 0.333, 0.500, # 0.333, 0.667, 0.500, # 0.333, 1.000, 0.500, # 0.667, 0.000, 0.500, # 0.667, 0.333, 0.500, # 0.667, 0.667, 0.500, # 0.667, 1.000, 0.500, # 1.000, 0.000, 0.500, # 1.000, 0.333, 0.500, # 1.000, 0.667, 0.500, # 1.000, 1.000, 0.500, # 0.000, 0.333, 1.000, # 0.000, 0.667, 1.000, # 0.000, 1.000, 1.000, # 0.333, 0.000, 1.000, # 0.333, 0.333, 1.000, # 0.333, 0.667, 1.000, # 0.333, 1.000, 1.000, # 0.667, 0.000, 1.000, # 0.667, 0.333, 1.000, # 0.667, 0.667, 1.000, # 0.667, 1.000, 1.000, # 1.000, 0.000, 1.000, # 1.000, 0.333, 1.000, # 1.000, 0.667, 1.000, # 0.333, 0.000, 0.000, # 0.500, 0.000, 0.000, # 0.667, 0.000, 0.000, # 0.833, 0.000, 0.000, # 1.000, 0.000, 0.000, # 0.000, 0.167, 0.000, # 0.000, 0.333, 0.000, # 0.000, 0.500, 0.000, # 0.000, 0.667, 0.000, # 0.000, 0.833, 0.000, # 0.000, 1.000, 0.000, # 0.000, 0.000, 0.167, # 0.000, 0.000, 0.333, # 0.000, 0.000, 0.500, # 0.000, 0.000, 0.667, # 0.000, 0.000, 0.833, # 0.000, 0.000, 1.000, # 0.000, 0.000, 0.000, # 0.143, 0.143, 0.143, # 0.857, 0.857, 0.857, # 1.000, 1.000, 1.000 # ] # ).astype(np.float32).reshape(-1, 3) # fmt: on _COLORS = [] def gen_color(): color = tuple(np.round(np.random.choice(range(256), size=3)/255, 3)) if color not in _COLORS and np.mean(color) != 0.0: _COLORS.append(color) else: gen_color() for _ in range(300): gen_color() def colormap(rgb=False, maximum=255): """ Args: rgb (bool): whether to return RGB colors or BGR colors. maximum (int): either 255 or 1 Returns: ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1] """ assert maximum in [255, 1], maximum c = _COLORS * maximum if not rgb: c = c[:, ::-1] return c def random_color(rgb=False, maximum=255): """ Args: rgb (bool): whether to return RGB colors or BGR colors. maximum (int): either 255 or 1 Returns: ndarray: a vector of 3 numbers """ idx = np.random.randint(0, len(_COLORS)) ret = _COLORS[idx] * maximum if not rgb: ret = ret[::-1] return ret def random_colors(N, rgb=False, maximum=255): """ Args: N (int): number of unique colors needed rgb (bool): whether to return RGB colors or BGR colors. maximum (int): either 255 or 1 Returns: ndarray: a list of random_color """ indices = random.sample(range(len(_COLORS)), N) ret = [_COLORS[i] * maximum for i in indices] if not rgb: ret = [x[::-1] for x in ret] return ret if __name__ == "__main__": import cv2 size = 100 H, W = 10, 10 canvas = np.random.rand(H * size, W * size, 3).astype("float32") for h in range(H): for w in range(W): idx = h * W + w if idx >= len(_COLORS): break canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx] cv2.imshow("a", canvas) cv2.waitKey(0) ================================================ FILE: annotator/oneformer/oneformer/demo/defaults.py ================================================ import torch import annotator.oneformer.detectron2.data.transforms as T from annotator.oneformer.detectron2.checkpoint import DetectionCheckpointer from annotator.oneformer.detectron2.data import ( MetadataCatalog, ) from annotator.oneformer.detectron2.modeling import build_model __all__ = [ "DefaultPredictor", ] class DefaultPredictor: """ Create a simple end-to-end predictor with the given config that runs on single device for a single input image. Compared to using the model directly, this class does the following additions: 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. 2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`. 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. 4. Take one input image and produce a single output, instead of a batch. This is meant for simple demo purposes, so it does the above steps automatically. This is not meant for benchmarks or running complicated inference logic. If you'd like to do anything more complicated, please refer to its source code as examples to build and use the model manually. Attributes: metadata (Metadata): the metadata of the underlying dataset, obtained from cfg.DATASETS.TEST. Examples: :: pred = DefaultPredictor(cfg) inputs = cv2.imread("input.jpg") outputs = pred(inputs) """ def __init__(self, cfg): self.cfg = cfg.clone() # cfg can be modified by model self.model = build_model(self.cfg) self.model.eval() if len(cfg.DATASETS.TEST): self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) checkpointer = DetectionCheckpointer(self.model) checkpointer.load(cfg.MODEL.WEIGHTS) self.aug = T.ResizeShortestEdge( [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST ) self.input_format = cfg.INPUT.FORMAT assert self.input_format in ["RGB", "BGR"], self.input_format def __call__(self, original_image, task): """ Args: original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). Returns: predictions (dict): the output of the model for one image only. See :doc:`/tutorials/models` for details about the format. """ with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 # Apply pre-processing to image. if self.input_format == "RGB": # whether the model expects BGR inputs or RGB original_image = original_image[:, :, ::-1] height, width = original_image.shape[:2] image = self.aug.get_transform(original_image).apply_image(original_image) image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) task = f"The task is {task}" inputs = {"image": image, "height": height, "width": width, "task": task} predictions = self.model([inputs])[0] return predictions ================================================ FILE: annotator/oneformer/oneformer/demo/predictor.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py import atexit import bisect import multiprocessing as mp from collections import deque import cv2 import torch from annotator.oneformer.detectron2.data import MetadataCatalog from defaults import DefaultPredictor from annotator.oneformer.detectron2.utils.video_visualizer import VideoVisualizer from visualizer import ColorMode, Visualizer class VisualizationDemo(object): def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): """ Args: cfg (CfgNode): instance_mode (ColorMode): parallel (bool): whether to run the model in different processes from visualization. Useful since the visualization logic can be slow. """ self.metadata = MetadataCatalog.get( cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" ) if 'cityscapes_fine_sem_seg_val' in cfg.DATASETS.TEST[0]: from cityscapesscripts.helpers.labels import labels stuff_colors = [k.color for k in labels if k.trainId != 255] self.metadata = self.metadata.set(stuff_colors=stuff_colors) self.cpu_device = torch.device("cpu") self.instance_mode = instance_mode self.parallel = parallel if parallel: num_gpu = torch.cuda.device_count() self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) else: self.predictor = DefaultPredictor(cfg) def run_on_image(self, image, task, sem_gt, pan_gt, ins_gt, box_gt): """ Args: image (np.ndarray): an image of shape (H, W, C) (in BGR order). This is the format used by OpenCV. Returns: predictions (dict): the output of the model. vis_output (VisImage): the visualized image output. """ vis_output = None # Convert image from OpenCV BGR format to Matplotlib RGB format. image = image[:, :, ::-1] vis_output = {} if task == 'panoptic': visualizer = Visualizer(image, metadata=self.metadata, instance_mode=0) predictions = self.predictor(image, "panoptic") panoptic_seg, segments_info = predictions["panoptic_seg"] vis_output['panoptic'] = visualizer.draw_panoptic_seg_predictions( panoptic_seg.to(self.cpu_device), segments_info, alpha=1 ) # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=0) # vis_output['pan_gt'] = visualizer.draw_panoptic_seg( # pan_gt[0].to(self.cpu_device), pan_gt[1], alpha=1 # ) if task == 'panoptic' or task == 'semantic': visualizer = Visualizer(image, metadata=self.metadata, instance_mode=1) predictions = self.predictor(image, "semantic") vis_output['semantic'] = visualizer.draw_sem_seg( predictions["sem_seg"].argmax(dim=0).to(self.cpu_device), alpha=1 ) # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=1) # vis_output['gt_sem'] = visualizer.draw_sem_seg( # sem_gt.to(self.cpu_device), alpha=1 # ) if task == 'panoptic' or task == 'instance': visualizer = Visualizer(image, metadata=self.metadata, instance_mode=2) predictions = self.predictor(image, "instance") instances = predictions["instances"].to(self.cpu_device) vis_output['instance'] = visualizer.draw_instance_predictions(predictions=instances, alpha=1) if 'boxes' in predictions: boxes, labels, scores = predictions["boxes"] visualizer = Visualizer(image, False, metadata=self.metadata, instance_mode=0) vis_output['boxes'] = visualizer.draw_box_predictions( boxes.to(self.cpu_device), labels.to(self.cpu_device), scores.to(self.cpu_device)) # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=2) # vis_output['ins_gt'] = visualizer.draw_instance_predictions(predictions=ins_gt.to(self.cpu_device), alpha=1) # vis_output['input'] = visualizer.get_image(image) return predictions, vis_output class AsyncPredictor: """ A predictor that runs the model asynchronously, possibly on >1 GPUs. Because rendering the visualization takes considerably amount of time, this helps improve throughput a little bit when rendering videos. """ class _StopToken: pass class _PredictWorker(mp.Process): def __init__(self, cfg, task_queue, result_queue): self.cfg = cfg self.task_queue = task_queue self.result_queue = result_queue super().__init__() def run(self): predictor = DefaultPredictor(self.cfg) while True: task = self.task_queue.get() if isinstance(task, AsyncPredictor._StopToken): break idx, data = task result = predictor(data) self.result_queue.put((idx, result)) def __init__(self, cfg, num_gpus: int = 1): """ Args: cfg (CfgNode): num_gpus (int): if 0, will run on CPU """ num_workers = max(num_gpus, 1) self.task_queue = mp.Queue(maxsize=num_workers * 3) self.result_queue = mp.Queue(maxsize=num_workers * 3) self.procs = [] for gpuid in range(max(num_gpus, 1)): cfg = cfg.clone() cfg.defrost() cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" self.procs.append( AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) ) self.put_idx = 0 self.get_idx = 0 self.result_rank = [] self.result_data = [] for p in self.procs: p.start() atexit.register(self.shutdown) def put(self, image): self.put_idx += 1 self.task_queue.put((self.put_idx, image)) def get(self): self.get_idx += 1 # the index needed for this request if len(self.result_rank) and self.result_rank[0] == self.get_idx: res = self.result_data[0] del self.result_data[0], self.result_rank[0] return res while True: # make sure the results are returned in the correct order idx, res = self.result_queue.get() if idx == self.get_idx: return res insert = bisect.bisect(self.result_rank, idx) self.result_rank.insert(insert, idx) self.result_data.insert(insert, res) def __len__(self): return self.put_idx - self.get_idx def __call__(self, image): self.put(image) return self.get() def shutdown(self): for _ in self.procs: self.task_queue.put(AsyncPredictor._StopToken()) @property def default_buffer_size(self): return len(self.procs) * 5 ================================================ FILE: annotator/oneformer/oneformer/demo/visualizer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import colorsys import logging import math import numpy as np from enum import Enum, unique import cv2 import matplotlib as mpl import matplotlib.colors as mplc import matplotlib.figure as mplfigure import annotator.oneformer.pycocotools.mask as mask_util import torch from matplotlib.backends.backend_agg import FigureCanvasAgg from PIL import Image from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes from annotator.oneformer.detectron2.utils.file_io import PathManager import random random.seed(0) from .colormap import random_color, _COLORS logger = logging.getLogger(__name__) __all__ = ["ColorMode", "VisImage", "Visualizer"] _SMALL_OBJECT_AREA_THRESH = 1000 _LARGE_MASK_AREA_THRESH = 120000 _OFF_WHITE = (1.0, 1.0, 1.0) _BLACK = (0, 0, 0) _RED = (1.0, 0, 0) _KEYPOINT_THRESHOLD = 0.05 def instance_color(rgb=False, idx=1, maximum=255): """ Args: rgb (bool): whether to return RGB colors or BGR colors. maximum (int): either 255 or 1 Returns: ndarray: a vector of 3 numbers """ ret = _COLORS[idx] * maximum if not rgb: ret = ret[::-1] return ret @unique class ColorMode(Enum): """ Enum of different color modes to use for instance visualizations. """ IMAGE = 0 """ Picks a random color for every instance and overlay segmentations with low opacity. """ SEGMENTATION = 1 """ Let instances of the same category have similar colors (from metadata.thing_colors), and overlay them with high opacity. This provides more attention on the quality of segmentation. """ IMAGE_BW = 2 """ Same as IMAGE, but convert all areas without masks to gray-scale. Only available for drawing per-instance mask predictions. """ class GenericMask: """ Attribute: polygons (list[ndarray]): list[ndarray]: polygons for this mask. Each ndarray has format [x, y, x, y, ...] mask (ndarray): a binary mask """ def __init__(self, mask_or_polygons, height, width): self._mask = self._polygons = self._has_holes = None self.height = height self.width = width m = mask_or_polygons if isinstance(m, dict): # RLEs assert "counts" in m and "size" in m if isinstance(m["counts"], list): # uncompressed RLEs h, w = m["size"] assert h == height and w == width m = mask_util.frPyObjects(m, h, w) self._mask = mask_util.decode(m)[:, :] return if isinstance(m, list): # list[ndarray] self._polygons = [np.asarray(x).reshape(-1) for x in m] return if isinstance(m, np.ndarray): # assumed to be a binary mask assert m.shape[1] != 2, m.shape assert m.shape == ( height, width, ), f"mask shape: {m.shape}, target dims: {height}, {width}" self._mask = m.astype("uint8") return raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m))) @property def mask(self): if self._mask is None: self._mask = self.polygons_to_mask(self._polygons) return self._mask @property def polygons(self): if self._polygons is None: self._polygons, self._has_holes = self.mask_to_polygons(self._mask) return self._polygons @property def has_holes(self): if self._has_holes is None: if self._mask is not None: self._polygons, self._has_holes = self.mask_to_polygons(self._mask) else: self._has_holes = False # if original format is polygon, does not have holes return self._has_holes def mask_to_polygons(self, mask): # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level # hierarchy. External contours (boundary) of the object are placed in hierarchy-1. # Internal contours (holes) are placed in hierarchy-2. # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours. mask = np.ascontiguousarray(mask) # some versions of cv2 does not support incontiguous arr res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) hierarchy = res[-1] if hierarchy is None: # empty mask return [], False has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0 res = res[-2] res = [x.flatten() for x in res] # These coordinates from OpenCV are integers in range [0, W-1 or H-1]. # We add 0.5 to turn them into real-value coordinate space. A better solution # would be to first +0.5 and then dilate the returned polygon by 0.5. res = [x + 0.5 for x in res if len(x) >= 6] return res, has_holes def polygons_to_mask(self, polygons): rle = mask_util.frPyObjects(polygons, self.height, self.width) rle = mask_util.merge(rle) return mask_util.decode(rle)[:, :] def area(self): return self.mask.sum() def bbox(self): p = mask_util.frPyObjects(self.polygons, self.height, self.width) p = mask_util.merge(p) bbox = mask_util.toBbox(p) bbox[2] += bbox[0] bbox[3] += bbox[1] return bbox class _PanopticPrediction: """ Unify different panoptic annotation/prediction formats """ def __init__(self, panoptic_seg, segments_info, metadata=None): if segments_info is None: assert metadata is not None # If "segments_info" is None, we assume "panoptic_img" is a # H*W int32 image storing the panoptic_id in the format of # category_id * label_divisor + instance_id. We reserve -1 for # VOID label. label_divisor = metadata.label_divisor segments_info = [] for panoptic_label in np.unique(panoptic_seg.numpy()): if panoptic_label == -1: # VOID region. continue pred_class = panoptic_label // label_divisor isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values() segments_info.append( { "id": int(panoptic_label), "category_id": int(pred_class), "isthing": bool(isthing), } ) del metadata self._seg = panoptic_seg self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True) areas = areas.numpy() sorted_idxs = np.argsort(-areas) self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs] self._seg_ids = self._seg_ids.tolist() for sid, area in zip(self._seg_ids, self._seg_areas): if sid in self._sinfo: self._sinfo[sid]["area"] = float(area) def non_empty_mask(self): """ Returns: (H, W) array, a mask for all pixels that have a prediction """ empty_ids = [] for id in self._seg_ids: if id not in self._sinfo: empty_ids.append(id) if len(empty_ids) == 0: return np.zeros(self._seg.shape, dtype=np.uint8) assert ( len(empty_ids) == 1 ), ">1 ids corresponds to no labels. This is currently not supported" return (self._seg != empty_ids[0]).numpy().astype(np.bool) def semantic_masks(self): for sid in self._seg_ids: sinfo = self._sinfo.get(sid) if sinfo is None or sinfo["isthing"]: # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions. continue yield (self._seg == sid).numpy().astype(np.bool), sinfo def instance_masks(self): for sid in self._seg_ids: sinfo = self._sinfo.get(sid) if sinfo is None or not sinfo["isthing"]: continue mask = (self._seg == sid).numpy().astype(np.bool) if mask.sum() > 0: yield mask, sinfo def _create_text_labels(classes, scores, class_names, is_crowd=None): """ Args: classes (list[int] or None): scores (list[float] or None): class_names (list[str] or None): is_crowd (list[bool] or None): Returns: list[str] or None """ labels = None if classes is not None: if class_names is not None and len(class_names) > 0: labels = [class_names[i] for i in classes] else: labels = [str(i) for i in classes] if scores is not None: if labels is None: labels = ["{:.0f}%".format(s * 100) for s in scores] else: labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] if labels is not None and is_crowd is not None: labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)] return labels class VisImage: def __init__(self, img, scale=1.0): """ Args: img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255]. scale (float): scale the input image """ self.img = img self.scale = scale self.width, self.height = img.shape[1], img.shape[0] self._setup_figure(img) def _setup_figure(self, img): """ Args: Same as in :meth:`__init__()`. Returns: fig (matplotlib.pyplot.figure): top level container for all the image plot elements. ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system. """ fig = mplfigure.Figure(frameon=False) self.dpi = fig.get_dpi() # add a small 1e-2 to avoid precision lost due to matplotlib's truncation # (https://github.com/matplotlib/matplotlib/issues/15363) fig.set_size_inches( (self.width * self.scale + 1e-2) / self.dpi, (self.height * self.scale + 1e-2) / self.dpi, ) self.canvas = FigureCanvasAgg(fig) # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig) ax = fig.add_axes([0.0, 0.0, 1.0, 1.0]) ax.axis("off") self.fig = fig self.ax = ax self.reset_image(img) def reset_image(self, img): """ Args: img: same as in __init__ """ img = img.astype("uint8") self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest") def save(self, filepath): """ Args: filepath (str): a string that contains the absolute path, including the file name, where the visualized image will be saved. """ self.fig.savefig(filepath) def get_image(self): """ Returns: ndarray: the visualized image of shape (H, W, 3) (RGB) in uint8 type. The shape is scaled w.r.t the input image using the given `scale` argument. """ canvas = self.canvas s, (width, height) = canvas.print_to_buffer() # buf = io.BytesIO() # works for cairo backend # canvas.print_rgba(buf) # width, height = self.width, self.height # s = buf.getvalue() buffer = np.frombuffer(s, dtype="uint8") img_rgba = buffer.reshape(height, width, 4) rgb, alpha = np.split(img_rgba, [3], axis=2) return rgb.astype("uint8") class Visualizer: """ Visualizer that draws data about detection/segmentation on images. It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}` that draw primitive objects to images, as well as high-level wrappers like `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}` that draw composite data in some pre-defined style. Note that the exact visualization style for the high-level wrappers are subject to change. Style such as color, opacity, label contents, visibility of labels, or even the visibility of objects themselves (e.g. when the object is too small) may change according to different heuristics, as long as the results still look visually reasonable. To obtain a consistent style, you can implement custom drawing functions with the abovementioned primitive methods instead. If you need more customized visualization styles, you can process the data yourself following their format documented in tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not intend to satisfy everyone's preference on drawing styles. This visualizer focuses on high rendering quality rather than performance. It is not designed to be used for real-time applications. """ # TODO implement a fast, rasterized version using OpenCV def __init__(self, img_rgb, is_img=True, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): """ Args: img_rgb: a numpy array of shape (H, W, C), where H and W correspond to the height and width of the image respectively. C is the number of color channels. The image is required to be in RGB format since that is a requirement of the Matplotlib library. The image is also expected to be in the range [0, 255]. metadata (Metadata): dataset metadata (e.g. class names and colors) instance_mode (ColorMode): defines one of the pre-defined style for drawing instances on an image. """ if is_img: self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) else: self.img = np.zeros_like(img_rgb).clip(0, 255).astype(np.uint8) + 255 if metadata is None: metadata = MetadataCatalog.get("__nonexist__") self.metadata = metadata self.output = VisImage(self.img, scale=scale) self.cpu_device = torch.device("cpu") # too small texts are useless, therefore clamp to 9 self._default_font_size = max( np.sqrt(self.output.height * self.output.width) // 90, 10 // scale ) self._instance_mode = instance_mode self.keypoint_threshold = _KEYPOINT_THRESHOLD def get_image(self, img): img = np.asarray(img).clip(0, 255).astype(np.uint8) return VisImage(img, scale=1.0) def draw_box_predictions( self, boxes=None, labels=None, scores=None, assigned_colors=None ): """ Args: boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`, or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, or a :class:`RotatedBoxes`, or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format for the N objects in a single image, labels (list[str]): the text to be displayed for each instance. assigned_colors (list[matplotlib.colors]): a list of colors, where each color corresponds to each mask or box in the image. Refer to 'matplotlib.colors' for full list of formats that the colors are accepted in. Returns: output (VisImage): image object with visualizations. """ num_instances = 0 boxes = self._convert_boxes(boxes) classes = labels.tolist() scores = scores.tolist() labels = _create_text_labels(classes, scores, self.metadata.get("stuff_classes", None)) num_instances = len(boxes) assert len(labels) == num_instances if assigned_colors is None: # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)] if num_instances == 0: return self.output # Display in largest to smallest order to reduce occlusion. areas = None areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) if areas is not None: sorted_idxs = np.argsort(-areas).tolist() # Re-order overlapped instances in descending order. boxes = boxes[sorted_idxs] if boxes is not None else None labels = [labels[k] for k in sorted_idxs] if labels is not None else None assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] for i in range(num_instances): color = assigned_colors[i] if boxes is not None: self.draw_box(boxes[i], edge_color=color) if labels is not None: # first get a box if boxes is not None: x0, y0, x1, y1 = boxes[i] text_pos = (x0, y0) # if drawing boxes, put text on the box corner. horiz_align = "left" else: continue # drawing the box confidence for keypoints isn't very useful. # for small objects, draw text at the side to avoid occlusion instance_area = (y1 - y0) * (x1 - x0) if ( instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale or y1 - y0 < 40 * self.output.scale ): if y1 >= self.output.height - 5: text_pos = (x1, y0) else: text_pos = (x0, y1) height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) lighter_color = self._change_color_brightness(color, brightness_factor=0.7) font_size = ( np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size ) self.draw_text( labels[i], text_pos, color=lighter_color, horizontal_alignment=horiz_align, font_size=font_size, ) return self.output def draw_instance_predictions(self, predictions, alpha=0.8, is_text=True): """ Draw instance-level prediction results on an image. Args: predictions (Instances): the output of an instance detection/segmentation model. Following fields will be used to draw: "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). Returns: output (VisImage): image object with visualizations. """ boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None scores = predictions.scores if predictions.has("scores") else None classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None labels = _create_text_labels(classes, scores, self.metadata.get("stuff_classes", None)) keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None if predictions.has("pred_masks"): masks = np.asarray(predictions.pred_masks) masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] else: masks = None if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("stuff_colors"): # colors = [ # self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes # ] colors = [ instance_color(rgb=True, idx=c, maximum=1) for c in classes ] else: colors = None if self._instance_mode == ColorMode.IMAGE_BW: self.output.reset_image( self._create_grayscale_image( (predictions.pred_masks.any(dim=0) > 0).numpy() if predictions.has("pred_masks") else None ) ) self.overlay_instances( masks=masks, boxes=boxes, labels=labels, keypoints=keypoints, assigned_colors=colors, alpha=alpha, is_text=is_text, ) return self.output def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8, is_text=True, edge_color=_OFF_WHITE): """ Draw semantic segmentation predictions/labels. Args: sem_seg (Tensor or ndarray): the segmentation of shape (H, W). Each value is the integer label of the pixel. area_threshold (int): segments with less than `area_threshold` are not drawn. alpha (float): the larger it is, the more opaque the segmentations are. Returns: output (VisImage): image object with visualizations. """ if isinstance(sem_seg, torch.Tensor): sem_seg = sem_seg.numpy() labels, areas = np.unique(sem_seg, return_counts=True) sorted_idxs = np.argsort(-areas).tolist() labels = labels[sorted_idxs] for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels): try: mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] except (AttributeError, IndexError): mask_color = None binary_mask = (sem_seg == label).astype(np.uint8) text = self.metadata.stuff_classes[label] self.draw_binary_mask( binary_mask, color=mask_color, edge_color=edge_color, text=text, alpha=alpha, area_threshold=area_threshold, is_text=is_text, ) return self.output def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7, is_text=True,): """ Draw panoptic prediction annotations or results. Args: panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. segments_info (list[dict] or None): Describe each segment in `panoptic_seg`. If it is a ``list[dict]``, each dict contains keys "id", "category_id". If None, category id of each pixel is computed by ``pixel // metadata.label_divisor``. area_threshold (int): stuff segments with less than `area_threshold` are not drawn. Returns: output (VisImage): image object with visualizations. """ pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata) if self._instance_mode == ColorMode.IMAGE_BW: self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask())) # draw mask for all semantic segments first i.e. "stuff" for mask, sinfo in pred.semantic_masks(): category_idx = sinfo["category_id"] try: mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] except AttributeError: mask_color = None text = self.metadata.stuff_classes[category_idx] self.draw_binary_mask( mask, color=mask_color, edge_color=_OFF_WHITE, text=text, alpha=alpha, area_threshold=area_threshold, is_text=is_text, ) # draw mask for all instances second all_instances = list(pred.instance_masks()) if len(all_instances) == 0: return self.output masks, sinfo = list(zip(*all_instances)) category_ids = [x["category_id"] for x in sinfo] try: scores = [x["score"] for x in sinfo] except KeyError: scores = None labels = _create_text_labels( category_ids, scores, self.metadata.stuff_classes, [x.get("iscrowd", 0) for x in sinfo] ) try: colors = [ self._jitter([x / 255 for x in self.metadata.stuff_colors[c]]) for c in category_ids ] except AttributeError: colors = None self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha, is_text=is_text) return self.output draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility def draw_dataset_dict(self, dic): """ Draw annotations/segmentaions in Detectron2 Dataset format. Args: dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. Returns: output (VisImage): image object with visualizations. """ annos = dic.get("annotations", None) if annos: if "segmentation" in annos[0]: masks = [x["segmentation"] for x in annos] else: masks = None if "keypoints" in annos[0]: keypts = [x["keypoints"] for x in annos] keypts = np.array(keypts).reshape(len(annos), -1, 3) else: keypts = None boxes = [ BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) if len(x["bbox"]) == 4 else x["bbox"] for x in annos ] colors = None category_ids = [x["category_id"] for x in annos] if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("stuff_colors"): colors = [ self._jitter([x / 255 for x in self.metadata.stuff_colors[c]]) for c in category_ids ] names = self.metadata.get("stuff_classes", None) labels = _create_text_labels( category_ids, scores=None, class_names=names, is_crowd=[x.get("iscrowd", 0) for x in annos], ) self.overlay_instances( labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors ) sem_seg = dic.get("sem_seg", None) if sem_seg is None and "sem_seg_file_name" in dic: with PathManager.open(dic["sem_seg_file_name"], "rb") as f: sem_seg = Image.open(f) sem_seg = np.asarray(sem_seg, dtype="uint8") if sem_seg is not None: self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5) pan_seg = dic.get("pan_seg", None) # if pan_seg is None and "pan_seg_file_name" in dic: # with PathManager.open(dic["pan_seg_file_name"], "rb") as f: # pan_seg = Image.open(f) # pan_seg = np.asarray(pan_seg) # from panopticapi.utils import rgb2id # # pan_seg = rgb2id(pan_seg) if pan_seg is not None: segments_info = dic["segments_info"] pan_seg = torch.tensor(pan_seg) self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5) return self.output def overlay_instances( self, *, boxes=None, labels=None, masks=None, keypoints=None, assigned_colors=None, alpha=0.5, is_text=True, ): """ Args: boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`, or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, or a :class:`RotatedBoxes`, or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format for the N objects in a single image, labels (list[str]): the text to be displayed for each instance. masks (masks-like object): Supported types are: * :class:`detectron2.structures.PolygonMasks`, :class:`detectron2.structures.BitMasks`. * list[list[ndarray]]: contains the segmentation masks for all objects in one image. The first level of the list corresponds to individual instances. The second level to all the polygon that compose the instance, and the third level to the polygon coordinates. The third level should have the format of [x0, y0, x1, y1, ..., xn, yn] (n >= 3). * list[ndarray]: each ndarray is a binary mask of shape (H, W). * list[dict]: each dict is a COCO-style RLE. keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), where the N is the number of instances and K is the number of keypoints. The last dimension corresponds to (x, y, visibility or score). assigned_colors (list[matplotlib.colors]): a list of colors, where each color corresponds to each mask or box in the image. Refer to 'matplotlib.colors' for full list of formats that the colors are accepted in. Returns: output (VisImage): image object with visualizations. """ num_instances = 0 if boxes is not None: boxes = self._convert_boxes(boxes) num_instances = len(boxes) if masks is not None: masks = self._convert_masks(masks) if num_instances: assert len(masks) == num_instances else: num_instances = len(masks) if keypoints is not None: if num_instances: assert len(keypoints) == num_instances else: num_instances = len(keypoints) keypoints = self._convert_keypoints(keypoints) if labels is not None: assert len(labels) == num_instances if assigned_colors is None: # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)] if num_instances == 0: return self.output if boxes is not None and boxes.shape[1] == 5: return self.overlay_rotated_instances( boxes=boxes, labels=labels, assigned_colors=assigned_colors ) # Display in largest to smallest order to reduce occlusion. areas = None if boxes is not None: areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) elif masks is not None: areas = np.asarray([x.area() for x in masks]) if areas is not None: sorted_idxs = np.argsort(-areas).tolist() # Re-order overlapped instances in descending order. boxes = boxes[sorted_idxs] if boxes is not None else None labels = [labels[k] for k in sorted_idxs] if labels is not None else None masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] keypoints = keypoints[sorted_idxs] if keypoints is not None else None for i in range(num_instances): color = assigned_colors[i] if boxes is not None: self.draw_box(boxes[i], edge_color=color) if masks is not None: for segment in masks[i].polygons: self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) if labels is not None: # first get a box if boxes is not None: x0, y0, x1, y1 = boxes[i] text_pos = (x0, y0) # if drawing boxes, put text on the box corner. horiz_align = "left" elif masks is not None: # skip small mask without polygon if len(masks[i].polygons) == 0: continue x0, y0, x1, y1 = masks[i].bbox() # draw text in the center (defined by median) when box is not drawn # median is less sensitive to outliers. text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] horiz_align = "center" else: continue # drawing the box confidence for keypoints isn't very useful. # for small objects, draw text at the side to avoid occlusion instance_area = (y1 - y0) * (x1 - x0) if ( instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale or y1 - y0 < 40 * self.output.scale ): if y1 >= self.output.height - 5: text_pos = (x1, y0) else: text_pos = (x0, y1) height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) lighter_color = self._change_color_brightness(color, brightness_factor=0.7) font_size = ( np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size ) if is_text: self.draw_text( labels[i], text_pos, color=lighter_color, horizontal_alignment=horiz_align, font_size=font_size, ) # draw keypoints if keypoints is not None: for keypoints_per_instance in keypoints: self.draw_and_connect_keypoints(keypoints_per_instance) return self.output def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None): """ Args: boxes (ndarray): an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format for the N objects in a single image. labels (list[str]): the text to be displayed for each instance. assigned_colors (list[matplotlib.colors]): a list of colors, where each color corresponds to each mask or box in the image. Refer to 'matplotlib.colors' for full list of formats that the colors are accepted in. Returns: output (VisImage): image object with visualizations. """ num_instances = len(boxes) if assigned_colors is None: # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)] if num_instances == 0: return self.output # Display in largest to smallest order to reduce occlusion. if boxes is not None: areas = boxes[:, 2] * boxes[:, 3] sorted_idxs = np.argsort(-areas).tolist() # Re-order overlapped instances in descending order. boxes = boxes[sorted_idxs] labels = [labels[k] for k in sorted_idxs] if labels is not None else None colors = [assigned_colors[idx] for idx in sorted_idxs] for i in range(num_instances): self.draw_rotated_box_with_label( boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None ) return self.output def draw_and_connect_keypoints(self, keypoints): """ Draws keypoints of an instance and follows the rules for keypoint connections to draw lines between appropriate keypoints. This follows color heuristics for line color. Args: keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints and the last dimension corresponds to (x, y, probability). Returns: output (VisImage): image object with visualizations. """ visible = {} keypoint_names = self.metadata.get("keypoint_names") for idx, keypoint in enumerate(keypoints): # draw keypoint x, y, prob = keypoint if prob > self.keypoint_threshold: self.draw_circle((x, y), color=_RED) if keypoint_names: keypoint_name = keypoint_names[idx] visible[keypoint_name] = (x, y) if self.metadata.get("keypoint_connection_rules"): for kp0, kp1, color in self.metadata.keypoint_connection_rules: if kp0 in visible and kp1 in visible: x0, y0 = visible[kp0] x1, y1 = visible[kp1] color = tuple(x / 255.0 for x in color) self.draw_line([x0, x1], [y0, y1], color=color) # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip # Note that this strategy is specific to person keypoints. # For other keypoints, it should just do nothing try: ls_x, ls_y = visible["left_shoulder"] rs_x, rs_y = visible["right_shoulder"] mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2 except KeyError: pass else: # draw line from nose to mid-shoulder nose_x, nose_y = visible.get("nose", (None, None)) if nose_x is not None: self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED) try: # draw line from mid-shoulder to mid-hip lh_x, lh_y = visible["left_hip"] rh_x, rh_y = visible["right_hip"] except KeyError: pass else: mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2 self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED) return self.output """ Primitive drawing functions: """ def draw_text( self, text, position, *, font_size=None, color="g", horizontal_alignment="center", rotation=0, ): """ Args: text (str): class label position (tuple): a tuple of the x and y coordinates to place text on image. font_size (int, optional): font of the text. If not provided, a font size proportional to the image width is calculated and used. color: color of the text. Refer to `matplotlib.colors` for full list of formats that are accepted. horizontal_alignment (str): see `matplotlib.text.Text` rotation: rotation angle in degrees CCW Returns: output (VisImage): image object with text drawn. """ if not font_size: font_size = self._default_font_size # since the text background is dark, we don't want the text to be dark color = np.maximum(list(mplc.to_rgb(color)), 0.2) color[np.argmax(color)] = max(0.8, np.max(color)) x, y = position self.output.ax.text( x, y, text, size=font_size * self.output.scale, family="sans-serif", bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}, verticalalignment="top", horizontalalignment=horizontal_alignment, color=color, zorder=10, rotation=rotation, ) return self.output def draw_box(self, box_coord, alpha=1.0, edge_color="g", line_style="-"): """ Args: box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0 are the coordinates of the image's top left corner. x1 and y1 are the coordinates of the image's bottom right corner. alpha (float): blending efficient. Smaller values lead to more transparent masks. edge_color: color of the outline of the box. Refer to `matplotlib.colors` for full list of formats that are accepted. line_style (string): the string to use to create the outline of the boxes. Returns: output (VisImage): image object with box drawn. """ x0, y0, x1, y1 = box_coord width = x1 - x0 height = y1 - y0 linewidth = 2 self.output.ax.add_patch( mpl.patches.Rectangle( (x0, y0), width, height, fill=False, edgecolor=edge_color, linewidth=linewidth * self.output.scale, alpha=alpha, linestyle=line_style, ) ) return self.output def draw_rotated_box_with_label( self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None ): """ Draw a rotated box with label on its top-left corner. Args: rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle), where cnt_x and cnt_y are the center coordinates of the box. w and h are the width and height of the box. angle represents how many degrees the box is rotated CCW with regard to the 0-degree box. alpha (float): blending efficient. Smaller values lead to more transparent masks. edge_color: color of the outline of the box. Refer to `matplotlib.colors` for full list of formats that are accepted. line_style (string): the string to use to create the outline of the boxes. label (string): label for rotated box. It will not be rendered when set to None. Returns: output (VisImage): image object with box drawn. """ cnt_x, cnt_y, w, h, angle = rotated_box area = w * h # use thinner lines when the box is small linewidth = self._default_font_size / ( 6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3 ) theta = angle * math.pi / 180.0 c = math.cos(theta) s = math.sin(theta) rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)] # x: left->right ; y: top->down rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect] for k in range(4): j = (k + 1) % 4 self.draw_line( [rotated_rect[k][0], rotated_rect[j][0]], [rotated_rect[k][1], rotated_rect[j][1]], color=edge_color, linestyle="--" if k == 1 else line_style, linewidth=linewidth, ) if label is not None: text_pos = rotated_rect[1] # topleft corner height_ratio = h / np.sqrt(self.output.height * self.output.width) label_color = self._change_color_brightness(edge_color, brightness_factor=0.7) font_size = ( np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size ) self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle) return self.output def draw_circle(self, circle_coord, color, radius=3): """ Args: circle_coord (list(int) or tuple(int)): contains the x and y coordinates of the center of the circle. color: color of the polygon. Refer to `matplotlib.colors` for a full list of formats that are accepted. radius (int): radius of the circle. Returns: output (VisImage): image object with box drawn. """ x, y = circle_coord self.output.ax.add_patch( mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color) ) return self.output def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None): """ Args: x_data (list[int]): a list containing x values of all the points being drawn. Length of list should match the length of y_data. y_data (list[int]): a list containing y values of all the points being drawn. Length of list should match the length of x_data. color: color of the line. Refer to `matplotlib.colors` for a full list of formats that are accepted. linestyle: style of the line. Refer to `matplotlib.lines.Line2D` for a full list of formats that are accepted. linewidth (float or None): width of the line. When it's None, a default value will be computed and used. Returns: output (VisImage): image object with line drawn. """ if linewidth is None: linewidth = self._default_font_size / 3 linewidth = max(linewidth, 1) self.output.ax.add_line( mpl.lines.Line2D( x_data, y_data, linewidth=linewidth * self.output.scale, color=color, linestyle=linestyle, ) ) return self.output def draw_binary_mask( self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=10, is_text=True, ): """ Args: binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and W is the image width. Each value in the array is either a 0 or 1 value of uint8 type. color: color of the mask. Refer to `matplotlib.colors` for a full list of formats that are accepted. If None, will pick a random color. edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a full list of formats that are accepted. text (str): if None, will be drawn on the object alpha (float): blending efficient. Smaller values lead to more transparent masks. area_threshold (float): a connected component smaller than this area will not be shown. Returns: output (VisImage): image object with mask drawn. """ if color is None: color = random_color(rgb=True, maximum=1) color = mplc.to_rgb(color) has_valid_segment = False binary_mask = binary_mask.astype("uint8") # opencv needs uint8 mask = GenericMask(binary_mask, self.output.height, self.output.width) shape2d = (binary_mask.shape[0], binary_mask.shape[1]) if not mask.has_holes: # draw polygons for regular masks for segment in mask.polygons: # area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) # if area < (area_threshold or 0): # continue has_valid_segment = True segment = segment.reshape(-1, 2) self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) else: # TODO: Use Path/PathPatch to draw vector graphics: # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon rgba = np.zeros(shape2d + (4,), dtype="float32") rgba[:, :, :3] = color rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha has_valid_segment = True self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) if is_text: if text is not None and has_valid_segment: lighter_color = self._change_color_brightness(color, brightness_factor=0.7) self._draw_text_in_mask(binary_mask, text, lighter_color) return self.output def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5): """ Args: soft_mask (ndarray): float array of shape (H, W), each value in [0, 1]. color: color of the mask. Refer to `matplotlib.colors` for a full list of formats that are accepted. If None, will pick a random color. text (str): if None, will be drawn on the object alpha (float): blending efficient. Smaller values lead to more transparent masks. Returns: output (VisImage): image object with mask drawn. """ if color is None: color = random_color(rgb=True, maximum=1) color = mplc.to_rgb(color) shape2d = (soft_mask.shape[0], soft_mask.shape[1]) rgba = np.zeros(shape2d + (4,), dtype="float32") rgba[:, :, :3] = color rgba[:, :, 3] = soft_mask * alpha self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) if text is not None: lighter_color = self._change_color_brightness(color, brightness_factor=0.7) binary_mask = (soft_mask > 0.5).astype("uint8") # self._draw_text_in_mask(binary_mask, text, lighter_color) return self.output def draw_polygon(self, segment, color, edge_color=None, alpha=0.5): """ Args: segment: numpy array of shape Nx2, containing all the points in the polygon. color: color of the polygon. Refer to `matplotlib.colors` for a full list of formats that are accepted. edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a full list of formats that are accepted. If not provided, a darker shade of the polygon color will be used instead. alpha (float): blending efficient. Smaller values lead to more transparent masks. Returns: output (VisImage): image object with polygon drawn. """ if edge_color is None: # make edge color darker than the polygon color if alpha > 0.8: edge_color = self._change_color_brightness(color, brightness_factor=-0.7) else: edge_color = color edge_color = mplc.to_rgb(edge_color) + (1,) polygon = mpl.patches.Polygon( segment, fill=True, facecolor=mplc.to_rgb(color) + (alpha,), edgecolor=edge_color, linewidth=max(self._default_font_size // 15 * self.output.scale, 1), ) self.output.ax.add_patch(polygon) return self.output """ Internal methods: """ def _jitter(self, color): """ Randomly modifies given color to produce a slightly different color than the color given. Args: color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color picked. The values in the list are in the [0.0, 1.0] range. Returns: jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color after being jittered. The values in the list are in the [0.0, 1.0] range. """ color = mplc.to_rgb(color) vec = np.random.rand(3) # better to do it in another color space vec = vec / np.linalg.norm(vec) * 0.5 res = np.clip(vec + color, 0, 1) return tuple(res) def _create_grayscale_image(self, mask=None): """ Create a grayscale version of the original image. The colors in masked area, if given, will be kept. """ img_bw = self.img.astype("f4").mean(axis=2) img_bw = np.stack([img_bw] * 3, axis=2) if mask is not None: img_bw[mask] = self.img[mask] return img_bw def _change_color_brightness(self, color, brightness_factor): """ Depending on the brightness_factor, gives a lighter or darker color i.e. a color with less or more saturation than the original color. Args: color: color of the polygon. Refer to `matplotlib.colors` for a full list of formats that are accepted. brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of 0 will correspond to no change, a factor in [-1.0, 0) range will result in a darker color and a factor in (0, 1.0] range will result in a lighter color. Returns: modified_color (tuple[double]): a tuple containing the RGB values of the modified color. Each value in the tuple is in the [0.0, 1.0] range. """ assert brightness_factor >= -1.0 and brightness_factor <= 1.0 color = mplc.to_rgb(color) polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) return modified_color def _convert_boxes(self, boxes): """ Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. """ if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): return boxes.tensor.detach().numpy() else: return np.asarray(boxes) def _convert_masks(self, masks_or_polygons): """ Convert different format of masks or polygons to a tuple of masks and polygons. Returns: list[GenericMask]: """ m = masks_or_polygons if isinstance(m, PolygonMasks): m = m.polygons if isinstance(m, BitMasks): m = m.tensor.numpy() if isinstance(m, torch.Tensor): m = m.numpy() ret = [] for x in m: if isinstance(x, GenericMask): ret.append(x) else: ret.append(GenericMask(x, self.output.height, self.output.width)) return ret def _draw_text_in_mask(self, binary_mask, text, color): """ Find proper places to draw text given a binary mask. """ # TODO sometimes drawn on wrong objects. the heuristics here can improve. _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) if stats[1:, -1].size == 0: return largest_component_id = np.argmax(stats[1:, -1]) + 1 # draw text on the largest component, as well as other very large components. for cid in range(1, _num_cc): if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: # median is more stable than centroid # center = centroids[largest_component_id] center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] self.draw_text(text, center, color=color) def _convert_keypoints(self, keypoints): if isinstance(keypoints, Keypoints): keypoints = keypoints.tensor keypoints = np.asarray(keypoints) return keypoints def get_output(self): """ Returns: output (VisImage): the image output containing the visualizations added to the image. """ return self.output ================================================ FILE: annotator/oneformer/oneformer/evaluation/__init__.py ================================================ from .detection_coco_evaluator import * from .coco_evaluator import * from .cityscapes_evaluation import CityscapesInstanceEvaluator ================================================ FILE: annotator/oneformer/oneformer/evaluation/cityscapes_evaluation.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/cityscapes_evaluation.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import glob import logging import numpy as np import os import tempfile from collections import OrderedDict import torch from PIL import Image from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.utils import comm from annotator.oneformer.detectron2.utils.file_io import PathManager from .evaluator import DatasetEvaluator class CityscapesEvaluator(DatasetEvaluator): """ Base class for evaluation using cityscapes API. """ def __init__(self, dataset_name): """ Args: dataset_name (str): the name of the dataset. It must have the following metadata associated with it: "thing_classes", "gt_dir". """ self._metadata = MetadataCatalog.get(dataset_name) self._cpu_device = torch.device("cpu") self._logger = logging.getLogger(__name__) def reset(self): self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_") self._temp_dir = self._working_dir.name # All workers will write to the same results directory # TODO this does not work in distributed training assert ( comm.get_local_size() == comm.get_world_size() ), "CityscapesEvaluator currently do not work with multiple machines." self._temp_dir = comm.all_gather(self._temp_dir)[0] if self._temp_dir != self._working_dir.name: self._working_dir.cleanup() self._logger.info( "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir) ) class CityscapesInstanceEvaluator(CityscapesEvaluator): """ Evaluate instance segmentation results on cityscapes dataset using cityscapes API. Note: * It does not work in multi-machine distributed training. * It contains a synchronization, therefore has to be used on all ranks. * Only the main process runs evaluation. """ def process(self, inputs, outputs): from cityscapesscripts.helpers.labels import name2label for input, output in zip(inputs, outputs): file_name = input["file_name"] basename = os.path.splitext(os.path.basename(file_name))[0] pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt") if "instances" in output: output = output["instances"].to(self._cpu_device) num_instances = len(output) with open(pred_txt, "w") as fout: for i in range(num_instances): pred_class = output.pred_classes[i] classes = self._metadata.stuff_classes[pred_class] class_id = name2label[classes].id score = output.scores[i] mask = output.pred_masks[i].numpy().astype("uint8") png_filename = os.path.join( self._temp_dir, basename + "_{}_{}.png".format(i, classes) ) Image.fromarray(mask * 255).save(png_filename) fout.write( "{} {} {}\n".format(os.path.basename(png_filename), class_id, score) ) else: # Cityscapes requires a prediction file for every ground truth image. with open(pred_txt, "w") as fout: pass def evaluate(self): """ Returns: dict: has a key "segm", whose value is a dict of "AP" and "AP50". """ comm.synchronize() if comm.get_rank() > 0: return import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) # set some global states in cityscapes evaluation API, before evaluating cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) cityscapes_eval.args.predictionWalk = None cityscapes_eval.args.JSONOutput = False cityscapes_eval.args.colorized = False cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json") # These lines are adopted from # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa gt_dir = PathManager.get_local_path(self._metadata.gt_dir) groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png")) assert len( groundTruthImgList ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( cityscapes_eval.args.groundTruthSearch ) predictionImgList = [] for gt in groundTruthImgList: predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args)) results = cityscapes_eval.evaluateImgLists( predictionImgList, groundTruthImgList, cityscapes_eval.args )["averages"] ret = OrderedDict() ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100} self._working_dir.cleanup() return ret class CityscapesSemSegEvaluator(CityscapesEvaluator): """ Evaluate semantic segmentation results on cityscapes dataset using cityscapes API. Note: * It does not work in multi-machine distributed training. * It contains a synchronization, therefore has to be used on all ranks. * Only the main process runs evaluation. """ def process(self, inputs, outputs): from cityscapesscripts.helpers.labels import trainId2label for input, output in zip(inputs, outputs): file_name = input["file_name"] basename = os.path.splitext(os.path.basename(file_name))[0] pred_filename = os.path.join(self._temp_dir, basename + "_pred.png") output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy() pred = 255 * np.ones(output.shape, dtype=np.uint8) for train_id, label in trainId2label.items(): if label.ignoreInEval: continue pred[output == train_id] = label.id Image.fromarray(pred).save(pred_filename) def evaluate(self): comm.synchronize() if comm.get_rank() > 0: return # Load the Cityscapes eval script *after* setting the required env var, # since the script reads CITYSCAPES_DATASET into global variables at load time. import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) # set some global states in cityscapes evaluation API, before evaluating cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) cityscapes_eval.args.predictionWalk = None cityscapes_eval.args.JSONOutput = False cityscapes_eval.args.colorized = False # These lines are adopted from # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa gt_dir = PathManager.get_local_path(self._metadata.gt_dir) groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png")) assert len( groundTruthImgList ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( cityscapes_eval.args.groundTruthSearch ) predictionImgList = [] for gt in groundTruthImgList: predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt)) results = cityscapes_eval.evaluateImgLists( predictionImgList, groundTruthImgList, cityscapes_eval.args ) ret = OrderedDict() ret["sem_seg"] = { "IoU": 100.0 * results["averageScoreClasses"], "iIoU": 100.0 * results["averageScoreInstClasses"], "IoU_sup": 100.0 * results["averageScoreCategories"], "iIoU_sup": 100.0 * results["averageScoreInstCategories"], } self._working_dir.cleanup() return ret ================================================ FILE: annotator/oneformer/oneformer/evaluation/coco_evaluator.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/coco_evaluation.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import contextlib import copy import io import itertools import json import logging import numpy as np import os import pickle from collections import OrderedDict import annotator.oneformer.pycocotools.mask as mask_util import torch from annotator.oneformer.pycocotools.coco import COCO from annotator.oneformer.pycocotools.cocoeval import COCOeval from tabulate import tabulate import annotator.oneformer.detectron2.utils.comm as comm from annotator.oneformer.detectron2.config import CfgNode from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.data.datasets.coco import convert_to_coco_json from annotator.oneformer.detectron2.structures import Boxes, BoxMode, pairwise_iou from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.logger import create_small_table from .evaluator import DatasetEvaluator try: from annotator.oneformer.detectron2.evaluation.fast_eval_api import COCOeval_opt except ImportError: COCOeval_opt = COCOeval class COCOEvaluator(DatasetEvaluator): """ Evaluate AP for instance detection/segmentation, AP for keypoint detection outputs using COCO's metrics. See http://cocodataset.org/#detection-eval and http://cocodataset.org/#keypoints-eval to understand its metrics. The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed (e.g. due to no predictions made). In addition to COCO, this evaluator is able to support any bounding box detection, instance segmentation, or keypoint detection dataset. """ def __init__( self, dataset_name, tasks=None, distributed=True, output_dir=None, *, max_dets_per_image=None, use_fast_impl=True, kpt_oks_sigmas=(), allow_cached_coco=True, ): """ Args: dataset_name (str): name of the dataset to be evaluated. It must have either the following corresponding metadata: "json_file": the path to the COCO format annotation Or it must be in detectron2's standard dataset format so it can be converted to COCO format automatically. tasks (tuple[str]): tasks that can be evaluated under the given configuration. A task is one of "bbox", "segm", "keypoints". By default, will infer this automatically from predictions. distributed (True): if True, will collect results from all ranks and run evaluation in the main process. Otherwise, will only evaluate the results in the current process. output_dir (str): optional, an output directory to dump all results predicted on the dataset. The dump contains two files: 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and contains all the results in the format they are produced by the model. 2. "coco_instances_results.json" a json file in COCO's result format. max_dets_per_image (int): limit on the maximum number of detections per image. By default in COCO, this limit is to 100, but this can be customized to be greater, as is needed in evaluation metrics AP fixed and AP pool (see https://arxiv.org/pdf/2102.01066.pdf) This doesn't affect keypoint evaluation. use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP. Although the results should be very close to the official implementation in COCO API, it is still recommended to compute results with the official API for use in papers. The faster implementation also uses more RAM. kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval When empty, it will use the defaults in COCO. Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. allow_cached_coco (bool): Whether to use cached coco json from previous validation runs. You should set this to False if you need to use different validation data. Defaults to True. """ self._logger = logging.getLogger(__name__) self._distributed = distributed self._output_dir = output_dir if use_fast_impl and (COCOeval_opt is COCOeval): self._logger.info("Fast COCO eval is not built. Falling back to official COCO eval.") use_fast_impl = False self._use_fast_impl = use_fast_impl # COCOeval requires the limit on the number of detections per image (maxDets) to be a list # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the # 3rd element (100) is used as the limit on the number of detections per image when # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval, # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults. if max_dets_per_image is None: max_dets_per_image = [1, 10, 100] else: max_dets_per_image = [1, 10, max_dets_per_image] self._max_dets_per_image = max_dets_per_image if tasks is not None and isinstance(tasks, CfgNode): kpt_oks_sigmas = ( tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas ) self._logger.warn( "COCO Evaluator instantiated using config, this is deprecated behavior." " Please pass in explicit arguments instead." ) self._tasks = None # Infering it from predictions should be better else: self._tasks = tasks self._cpu_device = torch.device("cpu") self._metadata = MetadataCatalog.get(dataset_name) if not hasattr(self._metadata, "json_file"): if output_dir is None: raise ValueError( "output_dir must be provided to COCOEvaluator " "for datasets not in COCO format." ) self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...") cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json") self._metadata.json_file = cache_path convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco) json_file = PathManager.get_local_path(self._metadata.json_file) with contextlib.redirect_stdout(io.StringIO()): self._coco_api = COCO(json_file) # Test set json files do not contain annotations (evaluation must be # performed using the COCO evaluation server). self._do_evaluation = "annotations" in self._coco_api.dataset if self._do_evaluation: self._kpt_oks_sigmas = kpt_oks_sigmas def reset(self): self._predictions = [] def process(self, inputs, outputs): """ Args: inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). It is a list of dict. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a COCO model. It is a list of dicts with key "instances" that contains :class:`Instances`. """ for input, output in zip(inputs, outputs): prediction = {"image_id": input["image_id"]} if "instances" in output: instances = output["instances"].to(self._cpu_device) prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) if len(prediction) > 1: self._predictions.append(prediction) def evaluate(self, img_ids=None): """ Args: img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset """ if self._distributed: comm.synchronize() predictions = comm.gather(self._predictions, dst=0) predictions = list(itertools.chain(*predictions)) if not comm.is_main_process(): return {} else: predictions = self._predictions if len(predictions) == 0: self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") return {} if self._output_dir: PathManager.mkdirs(self._output_dir) file_path = os.path.join(self._output_dir, "instances_predictions.pth") with PathManager.open(file_path, "wb") as f: torch.save(predictions, f) self._results = OrderedDict() if "instances" in predictions[0]: self._eval_predictions(predictions, img_ids=img_ids) # Copy so the caller can do whatever with results return copy.deepcopy(self._results) def _tasks_from_predictions(self, predictions): """ Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions. """ for pred in predictions: if "segmentation" in pred: tasks = {"segm"} if "keypoints" in pred: tasks.add("keypoints") return sorted(tasks) def _eval_predictions(self, predictions, img_ids=None): """ Evaluate predictions. Fill self._results with the metrics of the tasks. """ self._logger.info("Preparing results for COCO format ...") coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) tasks = self._tasks or self._tasks_from_predictions(coco_results) # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) num_classes = len(all_contiguous_ids) assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} for result in coco_results: category_id = result["category_id"] assert category_id < num_classes, ( f"A prediction has class={category_id}, " f"but the dataset only has {num_classes} classes and " f"predicted class id should be in [0, {num_classes - 1}]." ) result["category_id"] = reverse_id_mapping[category_id] if self._output_dir: file_path = os.path.join(self._output_dir, "coco_instances_results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(coco_results)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info( "Evaluating predictions with {} COCO API...".format( "unofficial" if self._use_fast_impl else "official" ) ) for task in sorted(tasks): assert task in {"segm", "keypoints"}, f"Got unknown task: {task}!" coco_eval = ( _evaluate_predictions_on_coco( self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas, use_fast_impl=self._use_fast_impl, img_ids=img_ids, max_dets_per_image=self._max_dets_per_image, ) if len(coco_results) > 0 else None # cocoapi does not handle empty results very well ) res = self._derive_coco_results( coco_eval, task, class_names=self._metadata.get("thing_classes") ) self._results[task] = res def _derive_coco_results(self, coco_eval, iou_type, class_names=None): """ Derive the desired score numbers from summarized COCOeval. Args: coco_eval (None or COCOEval): None represents no predictions from model. iou_type (str): class_names (None or list[str]): if provided, will use it to predict per-category AP. Returns: a dict of {metric name: score} """ metrics = { "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], }[iou_type] if coco_eval is None: self._logger.warn("No predictions from the model!") return {metric: float("nan") for metric in metrics} # the standard metrics results = { metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") for idx, metric in enumerate(metrics) } self._logger.info( "Evaluation results for {}: \n".format(iou_type) + create_small_table(results) ) if not np.isfinite(sum(results.values())): self._logger.info("Some metrics cannot be computed and is shown as NaN.") if class_names is None or len(class_names) <= 1: return results # Compute per-category AP # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa precisions = coco_eval.eval["precision"] # precision has dims (iou, recall, cls, area range, max dets) assert len(class_names) == precisions.shape[2] results_per_category = [] for idx, name in enumerate(class_names): # area range index 0: all area ranges # max dets index -1: typically 100 per image precision = precisions[:, :, idx, 0, -1] precision = precision[precision > -1] ap = np.mean(precision) if precision.size else float("nan") results_per_category.append(("{}".format(name), float(ap * 100))) # tabulate it N_COLS = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) table = tabulate( results_2d, tablefmt="pipe", floatfmt=".3f", headers=["category", "AP"] * (N_COLS // 2), numalign="left", ) self._logger.info("Per-category {} AP: \n".format(iou_type) + table) results.update({"AP-" + name: ap for name, ap in results_per_category}) return results def instances_to_coco_json(instances, img_id): """ Dump an "Instances" object to a COCO-format json that's used for evaluation. Args: instances (Instances): img_id (int): the image id Returns: list[dict]: list of json annotations in COCO format. """ num_instance = len(instances) if num_instance == 0: return [] scores = instances.scores.tolist() classes = instances.pred_classes.tolist() has_mask = instances.has("pred_masks") if has_mask: # use RLE to encode the masks, because they are too large and takes memory # since this evaluator stores outputs of the entire dataset rles = [ mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] for mask in instances.pred_masks ] for rle in rles: # "counts" is an array encoded by mask_util as a byte-stream. Python3's # json writer which always produces strings cannot serialize a bytestream # unless you decode it. Thankfully, utf-8 works out (which is also what # the annotator.oneformer.pycocotools/_mask.pyx does). rle["counts"] = rle["counts"].decode("utf-8") has_keypoints = instances.has("pred_keypoints") if has_keypoints: keypoints = instances.pred_keypoints results = [] for k in range(num_instance): result = { "image_id": img_id, "category_id": classes[k], "score": scores[k], } if has_mask: result["segmentation"] = rles[k] if has_keypoints: # In COCO annotations, # keypoints coordinates are pixel indices. # However our predictions are floating point coordinates. # Therefore we subtract 0.5 to be consistent with the annotation format. # This is the inverse of data loading logic in `datasets/coco.py`. keypoints[k][:, :2] -= 0.5 result["keypoints"] = keypoints[k].flatten().tolist() results.append(result) return results def _evaluate_predictions_on_coco( coco_gt, coco_results, iou_type, kpt_oks_sigmas=None, use_fast_impl=True, img_ids=None, max_dets_per_image=None, ): """ Evaluate the coco results using COCOEval API. """ assert len(coco_results) > 0 if iou_type == "segm": coco_results = copy.deepcopy(coco_results) # When evaluating mask AP, if the results contain bbox, cocoapi will # use the box area as the area of the instance, instead of the mask area. # This leads to a different definition of small/medium/large. # We remove the bbox field to let mask AP use mask area. for c in coco_results: c.pop("bbox", None) coco_dt = coco_gt.loadRes(coco_results) coco_eval = (COCOeval_opt if use_fast_impl else COCOeval)(coco_gt, coco_dt, iou_type) # For COCO, the default max_dets_per_image is [1, 10, 100]. if max_dets_per_image is None: max_dets_per_image = [1, 10, 100] # Default from COCOEval else: assert ( len(max_dets_per_image) >= 3 ), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3" # In the case that user supplies a custom input for max_dets_per_image, # apply COCOevalMaxDets to evaluate AP with the custom input. if max_dets_per_image[2] != 100: coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type) if iou_type != "keypoints": coco_eval.params.maxDets = max_dets_per_image if img_ids is not None: coco_eval.params.imgIds = img_ids if iou_type == "keypoints": # Use the COCO default keypoint OKS sigmas unless overrides are specified if kpt_oks_sigmas: assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "annotator.oneformer.pycocotools is too old!" coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas) # COCOAPI requires every detection and every gt to have keypoints, so # we just take the first entry from both num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3 num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3 num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas) assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, ( f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. " f"Ground truth contains {num_keypoints_gt} keypoints. " f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. " "They have to agree with each other. For meaning of OKS, please refer to " "http://cocodataset.org/#keypoints-eval." ) coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return coco_eval class COCOevalMaxDets(COCOeval): """ Modified version of COCOeval for evaluating AP with a custom maxDets (by default for COCO, maxDets is 100) """ def summarize(self): """ Compute and display summary metrics for evaluation results given a custom value for max_dets_per_image """ def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): p = self.params iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}" titleStr = "Average Precision" if ap == 1 else "Average Recall" typeStr = "(AP)" if ap == 1 else "(AR)" iouStr = ( "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) if iouThr is None else "{:0.2f}".format(iouThr) ) aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] if ap == 1: # dimension of precision: [TxRxKxAxM] s = self.eval["precision"] # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:, :, :, aind, mind] else: # dimension of recall: [TxKxAxM] s = self.eval["recall"] if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:, :, aind, mind] if len(s[s > -1]) == 0: mean_s = -1 else: mean_s = np.mean(s[s > -1]) print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) return mean_s def _summarizeDets(): stats = np.zeros((12,)) # Evaluate AP using the custom limit on maximum detections per image stats[0] = _summarize(1, maxDets=self.params.maxDets[2]) stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2]) stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2]) stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2]) stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2]) stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2]) stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) return stats def _summarizeKps(): stats = np.zeros((10,)) stats[0] = _summarize(1, maxDets=20) stats[1] = _summarize(1, maxDets=20, iouThr=0.5) stats[2] = _summarize(1, maxDets=20, iouThr=0.75) stats[3] = _summarize(1, maxDets=20, areaRng="medium") stats[4] = _summarize(1, maxDets=20, areaRng="large") stats[5] = _summarize(0, maxDets=20) stats[6] = _summarize(0, maxDets=20, iouThr=0.5) stats[7] = _summarize(0, maxDets=20, iouThr=0.75) stats[8] = _summarize(0, maxDets=20, areaRng="medium") stats[9] = _summarize(0, maxDets=20, areaRng="large") return stats if not self.eval: raise Exception("Please run accumulate() first") iouType = self.params.iouType if iouType == "segm": summarize = _summarizeDets elif iouType == "keypoints": summarize = _summarizeKps self.stats = summarize() def __str__(self): self.summarize() ================================================ FILE: annotator/oneformer/oneformer/evaluation/detection_coco_evaluator.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/coco_evaluation.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import contextlib import copy import io import itertools import json import logging import numpy as np import os import pickle from collections import OrderedDict import annotator.oneformer.pycocotools.mask as mask_util import torch from annotator.oneformer.pycocotools.coco import COCO from annotator.oneformer.pycocotools.cocoeval import COCOeval from tabulate import tabulate import annotator.oneformer.detectron2.utils.comm as comm from annotator.oneformer.detectron2.config import CfgNode from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.data.datasets.coco import convert_to_coco_json from annotator.oneformer.detectron2.structures import Boxes, BoxMode, pairwise_iou from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.logger import create_small_table from .evaluator import DatasetEvaluator try: from annotator.oneformer.detectron2.evaluation.fast_eval_api import COCOeval_opt except ImportError: COCOeval_opt = COCOeval class DetectionCOCOEvaluator(DatasetEvaluator): """ Evaluate AR for object proposals, AP for instance detection/segmentation, AP for keypoint detection outputs using COCO's metrics. See http://cocodataset.org/#detection-eval and http://cocodataset.org/#keypoints-eval to understand its metrics. The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed (e.g. due to no predictions made). In addition to COCO, this evaluator is able to support any bounding box detection, instance segmentation, or keypoint detection dataset. """ def __init__( self, dataset_name, tasks=None, distributed=True, output_dir=None, *, max_dets_per_image=None, use_fast_impl=True, kpt_oks_sigmas=(), allow_cached_coco=True, ): """ Args: dataset_name (str): name of the dataset to be evaluated. It must have either the following corresponding metadata: "json_file": the path to the COCO format annotation Or it must be in detectron2's standard dataset format so it can be converted to COCO format automatically. tasks (tuple[str]): tasks that can be evaluated under the given configuration. A task is one of "bbox", "segm", "keypoints". By default, will infer this automatically from predictions. distributed (True): if True, will collect results from all ranks and run evaluation in the main process. Otherwise, will only evaluate the results in the current process. output_dir (str): optional, an output directory to dump all results predicted on the dataset. The dump contains two files: 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and contains all the results in the format they are produced by the model. 2. "coco_instances_results.json" a json file in COCO's result format. max_dets_per_image (int): limit on the maximum number of detections per image. By default in COCO, this limit is to 100, but this can be customized to be greater, as is needed in evaluation metrics AP fixed and AP pool (see https://arxiv.org/pdf/2102.01066.pdf) This doesn't affect keypoint evaluation. use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP. Although the results should be very close to the official implementation in COCO API, it is still recommended to compute results with the official API for use in papers. The faster implementation also uses more RAM. kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval When empty, it will use the defaults in COCO. Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. allow_cached_coco (bool): Whether to use cached coco json from previous validation runs. You should set this to False if you need to use different validation data. Defaults to True. """ self._logger = logging.getLogger(__name__) self._distributed = distributed self._output_dir = output_dir if use_fast_impl and (COCOeval_opt is COCOeval): self._logger.info("Fast COCO eval is not built. Falling back to official COCO eval.") use_fast_impl = False self._use_fast_impl = use_fast_impl # COCOeval requires the limit on the number of detections per image (maxDets) to be a list # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the # 3rd element (100) is used as the limit on the number of detections per image when # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval, # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults. if max_dets_per_image is None: max_dets_per_image = [1, 10, 100] else: max_dets_per_image = [1, 10, max_dets_per_image] self._max_dets_per_image = max_dets_per_image if tasks is not None and isinstance(tasks, CfgNode): kpt_oks_sigmas = ( tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas ) self._logger.warn( "COCO Evaluator instantiated using config, this is deprecated behavior." " Please pass in explicit arguments instead." ) self._tasks = None # Infering it from predictions should be better else: self._tasks = tasks self._cpu_device = torch.device("cpu") self._metadata = MetadataCatalog.get(dataset_name) if not hasattr(self._metadata, "json_file"): if output_dir is None: raise ValueError( "output_dir must be provided to COCOEvaluator " "for datasets not in COCO format." ) self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...") cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json") self._metadata.json_file = cache_path convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco) json_file = PathManager.get_local_path(self._metadata.json_file) with contextlib.redirect_stdout(io.StringIO()): self._coco_api = COCO(json_file) # Test set json files do not contain annotations (evaluation must be # performed using the COCO evaluation server). self._do_evaluation = "annotations" in self._coco_api.dataset if self._do_evaluation: self._kpt_oks_sigmas = kpt_oks_sigmas def reset(self): self._predictions = [] def process(self, inputs, outputs): """ Args: inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). It is a list of dict. Each dict corresponds to an image and contains keys like "height", "width", "file_name", "image_id". outputs: the outputs of a COCO model. It is a list of dicts with key "box_instances" that contains :class:`Instances`. """ for input, output in zip(inputs, outputs): prediction = {"image_id": input["image_id"]} if "box_instances" in output: instances = output["box_instances"].to(self._cpu_device) prediction["box_instances"] = instances_to_coco_json(instances, input["image_id"]) if "proposals" in output: prediction["proposals"] = output["proposals"].to(self._cpu_device) if len(prediction) > 1: self._predictions.append(prediction) def evaluate(self, img_ids=None): """ Args: img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset """ if self._distributed: comm.synchronize() predictions = comm.gather(self._predictions, dst=0) predictions = list(itertools.chain(*predictions)) if not comm.is_main_process(): return {} else: predictions = self._predictions if len(predictions) == 0: self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") return {} if self._output_dir: PathManager.mkdirs(self._output_dir) file_path = os.path.join(self._output_dir, "instances_predictions.pth") with PathManager.open(file_path, "wb") as f: torch.save(predictions, f) self._results = OrderedDict() if "proposals" in predictions[0]: self._eval_box_proposals(predictions) if "box_instances" in predictions[0]: self._eval_predictions(predictions, img_ids=img_ids) # Copy so the caller can do whatever with results return copy.deepcopy(self._results) def _tasks_from_predictions(self, predictions): """ Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions. """ tasks = {"bbox"} for pred in predictions: if "keypoints" in pred: tasks.add("keypoints") return sorted(tasks) def _eval_predictions(self, predictions, img_ids=None): """ Evaluate predictions. Fill self._results with the metrics of the tasks. """ self._logger.info("Preparing results for COCO format ...") coco_results = list(itertools.chain(*[x["box_instances"] for x in predictions])) tasks = self._tasks or self._tasks_from_predictions(coco_results) # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) num_classes = len(all_contiguous_ids) assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} for result in coco_results: category_id = result["category_id"] assert category_id < num_classes, ( f"A prediction has class={category_id}, " f"but the dataset only has {num_classes} classes and " f"predicted class id should be in [0, {num_classes - 1}]." ) result["category_id"] = reverse_id_mapping[category_id] if self._output_dir: file_path = os.path.join(self._output_dir, "coco_instances_results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(coco_results)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info( "Evaluating predictions with {} COCO API...".format( "unofficial" if self._use_fast_impl else "official" ) ) for task in sorted(tasks): assert task in {"bbox", "keypoints"}, f"Got unknown task: {task}!" coco_eval = ( _evaluate_predictions_on_coco( self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas, use_fast_impl=self._use_fast_impl, img_ids=img_ids, max_dets_per_image=self._max_dets_per_image, ) if len(coco_results) > 0 else None # cocoapi does not handle empty results very well ) res = self._derive_coco_results( coco_eval, task, class_names=self._metadata.get("thing_classes") ) self._results[task] = res def _eval_box_proposals(self, predictions): """ Evaluate the box proposals in predictions. Fill self._results with the metrics for "box_proposals" task. """ if self._output_dir: # Saving generated box proposals to file. # Predicted box_proposals are in XYXY_ABS mode. bbox_mode = BoxMode.XYXY_ABS.value ids, boxes, objectness_logits = [], [], [] for prediction in predictions: ids.append(prediction["image_id"]) boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy()) objectness_logits.append(prediction["proposals"].objectness_logits.numpy()) proposal_data = { "boxes": boxes, "objectness_logits": objectness_logits, "ids": ids, "bbox_mode": bbox_mode, } with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f: pickle.dump(proposal_data, f) if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info("Evaluating bbox proposals ...") res = {} areas = {"all": "", "small": "s", "medium": "m", "large": "l"} for limit in [100, 1000]: for area, suffix in areas.items(): stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit) key = "AR{}@{:d}".format(suffix, limit) res[key] = float(stats["ar"].item() * 100) self._logger.info("Proposal metrics: \n" + create_small_table(res)) self._results["box_proposals"] = res def _derive_coco_results(self, coco_eval, iou_type, class_names=None): """ Derive the desired score numbers from summarized COCOeval. Args: coco_eval (None or COCOEval): None represents no predictions from model. iou_type (str): class_names (None or list[str]): if provided, will use it to predict per-category AP. Returns: a dict of {metric name: score} """ metrics = { "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], }[iou_type] if coco_eval is None: self._logger.warn("No predictions from the model!") return {metric: float("nan") for metric in metrics} # the standard metrics results = { metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") for idx, metric in enumerate(metrics) } self._logger.info( "Evaluation results for {}: \n".format(iou_type) + create_small_table(results) ) if not np.isfinite(sum(results.values())): self._logger.info("Some metrics cannot be computed and is shown as NaN.") if class_names is None or len(class_names) <= 1: return results # Compute per-category AP # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa precisions = coco_eval.eval["precision"] # precision has dims (iou, recall, cls, area range, max dets) assert len(class_names) == precisions.shape[2] results_per_category = [] for idx, name in enumerate(class_names): # area range index 0: all area ranges # max dets index -1: typically 100 per image precision = precisions[:, :, idx, 0, -1] precision = precision[precision > -1] ap = np.mean(precision) if precision.size else float("nan") results_per_category.append(("{}".format(name), float(ap * 100))) # tabulate it N_COLS = min(6, len(results_per_category) * 2) results_flatten = list(itertools.chain(*results_per_category)) results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) table = tabulate( results_2d, tablefmt="pipe", floatfmt=".3f", headers=["category", "AP"] * (N_COLS // 2), numalign="left", ) self._logger.info("Per-category {} AP: \n".format(iou_type) + table) results.update({"AP-" + name: ap for name, ap in results_per_category}) return results def instances_to_coco_json(instances, img_id): """ Dump an "Instances" object to a COCO-format json that's used for evaluation. Args: instances (Instances): img_id (int): the image id Returns: list[dict]: list of json annotations in COCO format. """ num_instance = len(instances) if num_instance == 0: return [] boxes = instances.pred_boxes.tensor.numpy() boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) boxes = boxes.tolist() scores = instances.scores.tolist() classes = instances.pred_classes.tolist() has_mask = instances.has("pred_masks") if has_mask: # use RLE to encode the masks, because they are too large and takes memory # since this evaluator stores outputs of the entire dataset rles = [ mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] for mask in instances.pred_masks ] for rle in rles: # "counts" is an array encoded by mask_util as a byte-stream. Python3's # json writer which always produces strings cannot serialize a bytestream # unless you decode it. Thankfully, utf-8 works out (which is also what # the annotator.oneformer.pycocotools/_mask.pyx does). rle["counts"] = rle["counts"].decode("utf-8") has_keypoints = instances.has("pred_keypoints") if has_keypoints: keypoints = instances.pred_keypoints results = [] for k in range(num_instance): result = { "image_id": img_id, "category_id": classes[k], "bbox": boxes[k], "score": scores[k], } if has_mask: result["segmentation"] = rles[k] if has_keypoints: # In COCO annotations, # keypoints coordinates are pixel indices. # However our predictions are floating point coordinates. # Therefore we subtract 0.5 to be consistent with the annotation format. # This is the inverse of data loading logic in `datasets/coco.py`. keypoints[k][:, :2] -= 0.5 result["keypoints"] = keypoints[k].flatten().tolist() results.append(result) return results # inspired from Detectron: # https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None): """ Evaluate detection proposal recall metrics. This function is a much faster alternative to the official COCO API recall evaluation code. However, it produces slightly different results. """ # Record max overlap value for each gt box # Return vector of overlap values areas = { "all": 0, "small": 1, "medium": 2, "large": 3, "96-128": 4, "128-256": 5, "256-512": 6, "512-inf": 7, } area_ranges = [ [0**2, 1e5**2], # all [0**2, 32**2], # small [32**2, 96**2], # medium [96**2, 1e5**2], # large [96**2, 128**2], # 96-128 [128**2, 256**2], # 128-256 [256**2, 512**2], # 256-512 [512**2, 1e5**2], ] # 512-inf assert area in areas, "Unknown area range: {}".format(area) area_range = area_ranges[areas[area]] gt_overlaps = [] num_pos = 0 for prediction_dict in dataset_predictions: predictions = prediction_dict["proposals"] # sort predictions in descending order # TODO maybe remove this and make it explicit in the documentation inds = predictions.objectness_logits.sort(descending=True)[1] predictions = predictions[inds] ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"]) anno = coco_api.loadAnns(ann_ids) gt_boxes = [ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno if obj["iscrowd"] == 0 ] gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes gt_boxes = Boxes(gt_boxes) gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) if len(gt_boxes) == 0 or len(predictions) == 0: continue valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) gt_boxes = gt_boxes[valid_gt_inds] num_pos += len(gt_boxes) if len(gt_boxes) == 0: continue if limit is not None and len(predictions) > limit: predictions = predictions[:limit] overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes) _gt_overlaps = torch.zeros(len(gt_boxes)) for j in range(min(len(predictions), len(gt_boxes))): # find which proposal box maximally covers each gt box # and get the iou amount of coverage for each gt box max_overlaps, argmax_overlaps = overlaps.max(dim=0) # find which gt box is 'best' covered (i.e. 'best' = most iou) gt_ovr, gt_ind = max_overlaps.max(dim=0) assert gt_ovr >= 0 # find the proposal box that covers the best covered gt box box_ind = argmax_overlaps[gt_ind] # record the iou coverage of this gt box _gt_overlaps[j] = overlaps[box_ind, gt_ind] assert _gt_overlaps[j] == gt_ovr # mark the proposal box and the gt box as used overlaps[box_ind, :] = -1 overlaps[:, gt_ind] = -1 # append recorded iou coverage level gt_overlaps.append(_gt_overlaps) gt_overlaps = ( torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32) ) gt_overlaps, _ = torch.sort(gt_overlaps) if thresholds is None: step = 0.05 thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) recalls = torch.zeros_like(thresholds) # compute recall for each iou threshold for i, t in enumerate(thresholds): recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) # ar = 2 * np.trapz(recalls, thresholds) ar = recalls.mean() return { "ar": ar, "recalls": recalls, "thresholds": thresholds, "gt_overlaps": gt_overlaps, "num_pos": num_pos, } def _evaluate_predictions_on_coco( coco_gt, coco_results, iou_type, kpt_oks_sigmas=None, use_fast_impl=True, img_ids=None, max_dets_per_image=None, ): """ Evaluate the coco results using COCOEval API. """ assert len(coco_results) > 0 if iou_type == "segm": coco_results = copy.deepcopy(coco_results) # When evaluating mask AP, if the results contain bbox, cocoapi will # use the box area as the area of the instance, instead of the mask area. # This leads to a different definition of small/medium/large. # We remove the bbox field to let mask AP use mask area. for c in coco_results: c.pop("bbox", None) coco_dt = coco_gt.loadRes(coco_results) coco_eval = (COCOeval_opt if use_fast_impl else COCOeval)(coco_gt, coco_dt, iou_type) # For COCO, the default max_dets_per_image is [1, 10, 100]. if max_dets_per_image is None: max_dets_per_image = [1, 10, 100] # Default from COCOEval else: assert ( len(max_dets_per_image) >= 3 ), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3" # In the case that user supplies a custom input for max_dets_per_image, # apply COCOevalMaxDets to evaluate AP with the custom input. if max_dets_per_image[2] != 100: coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type) if iou_type != "keypoints": coco_eval.params.maxDets = max_dets_per_image if img_ids is not None: coco_eval.params.imgIds = img_ids if iou_type == "keypoints": # Use the COCO default keypoint OKS sigmas unless overrides are specified if kpt_oks_sigmas: assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "annotator.oneformer.pycocotools is too old!" coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas) # COCOAPI requires every detection and every gt to have keypoints, so # we just take the first entry from both num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3 num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3 num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas) assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, ( f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. " f"Ground truth contains {num_keypoints_gt} keypoints. " f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. " "They have to agree with each other. For meaning of OKS, please refer to " "http://cocodataset.org/#keypoints-eval." ) coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() return coco_eval class COCOevalMaxDets(COCOeval): """ Modified version of COCOeval for evaluating AP with a custom maxDets (by default for COCO, maxDets is 100) """ def summarize(self): """ Compute and display summary metrics for evaluation results given a custom value for max_dets_per_image """ def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): p = self.params iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}" titleStr = "Average Precision" if ap == 1 else "Average Recall" typeStr = "(AP)" if ap == 1 else "(AR)" iouStr = ( "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) if iouThr is None else "{:0.2f}".format(iouThr) ) aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] if ap == 1: # dimension of precision: [TxRxKxAxM] s = self.eval["precision"] # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:, :, :, aind, mind] else: # dimension of recall: [TxKxAxM] s = self.eval["recall"] if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:, :, aind, mind] if len(s[s > -1]) == 0: mean_s = -1 else: mean_s = np.mean(s[s > -1]) print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) return mean_s def _summarizeDets(): stats = np.zeros((12,)) # Evaluate AP using the custom limit on maximum detections per image stats[0] = _summarize(1, maxDets=self.params.maxDets[2]) stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2]) stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2]) stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2]) stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2]) stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2]) stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) return stats def _summarizeKps(): stats = np.zeros((10,)) stats[0] = _summarize(1, maxDets=20) stats[1] = _summarize(1, maxDets=20, iouThr=0.5) stats[2] = _summarize(1, maxDets=20, iouThr=0.75) stats[3] = _summarize(1, maxDets=20, areaRng="medium") stats[4] = _summarize(1, maxDets=20, areaRng="large") stats[5] = _summarize(0, maxDets=20) stats[6] = _summarize(0, maxDets=20, iouThr=0.5) stats[7] = _summarize(0, maxDets=20, iouThr=0.75) stats[8] = _summarize(0, maxDets=20, areaRng="medium") stats[9] = _summarize(0, maxDets=20, areaRng="large") return stats if not self.eval: raise Exception("Please run accumulate() first") iouType = self.params.iouType if iouType == "segm" or iouType == "bbox": summarize = _summarizeDets elif iouType == "keypoints": summarize = _summarizeKps self.stats = summarize() def __str__(self): self.summarize() ================================================ FILE: annotator/oneformer/oneformer/evaluation/evaluator.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/evaluator.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import datetime import logging import time from collections import OrderedDict, abc from contextlib import ExitStack, contextmanager from typing import List, Union import torch from torch import nn from annotator.oneformer.detectron2.utils.comm import get_world_size, is_main_process from annotator.oneformer.detectron2.utils.logger import log_every_n_seconds class DatasetEvaluator: """ Base class for a dataset evaluator. The function :func:`inference_on_dataset` runs the model over all samples in the dataset, and have a DatasetEvaluator to process the inputs/outputs. This class will accumulate information of the inputs/outputs (by :meth:`process`), and produce evaluation results in the end (by :meth:`evaluate`). """ def reset(self): """ Preparation for a new round of evaluation. Should be called before starting a round of evaluation. """ pass def process(self, inputs, outputs): """ Process the pair of inputs and outputs. If they contain batches, the pairs can be consumed one-by-one using `zip`: .. code-block:: python for input_, output in zip(inputs, outputs): # do evaluation on single input/output pair ... Args: inputs (list): the inputs that's used to call the model. outputs (list): the return value of `model(inputs)` """ pass def evaluate(self): """ Evaluate/summarize the performance, after processing all input/output pairs. Returns: dict: A new evaluator class can return a dict of arbitrary format as long as the user can process the results. In our train_net.py, we expect the following format: * key: the name of the task (e.g., bbox) * value: a dict of {metric name: score}, e.g.: {"AP50": 80} """ pass class DatasetEvaluators(DatasetEvaluator): """ Wrapper class to combine multiple :class:`DatasetEvaluator` instances. This class dispatches every evaluation call to all of its :class:`DatasetEvaluator`. """ def __init__(self, evaluators): """ Args: evaluators (list): the evaluators to combine. """ super().__init__() self._evaluators = evaluators def reset(self): for evaluator in self._evaluators: evaluator.reset() def process(self, inputs, outputs): for evaluator in self._evaluators: evaluator.process(inputs, outputs) def evaluate(self): results = OrderedDict() for evaluator in self._evaluators: result = evaluator.evaluate() if is_main_process() and result is not None: for k, v in result.items(): assert ( k not in results ), "Different evaluators produce results with the same key {}".format(k) results[k] = v return results def inference_on_dataset( model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None] ): """ Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.__call__` accurately. The model will be used in eval mode. Args: model (callable): a callable which takes an object from `data_loader` and returns some outputs. If it's an nn.Module, it will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start inference on {} batches".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) if isinstance(evaluator, abc.MutableSequence): evaluator = DatasetEvaluators(evaluator) evaluator.reset() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_data_time = 0 total_compute_time = 0 total_eval_time = 0 with ExitStack() as stack: if isinstance(model, nn.Module): stack.enter_context(inference_context(model)) stack.enter_context(torch.no_grad()) start_data_time = time.perf_counter() for idx, inputs in enumerate(data_loader): total_data_time += time.perf_counter() - start_data_time if idx == num_warmup: start_time = time.perf_counter() total_data_time = 0 total_compute_time = 0 total_eval_time = 0 start_compute_time = time.perf_counter() outputs = model(inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time start_eval_time = time.perf_counter() evaluator.process(inputs, outputs) total_eval_time += time.perf_counter() - start_eval_time iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) data_seconds_per_iter = total_data_time / iters_after_start compute_seconds_per_iter = total_compute_time / iters_after_start eval_seconds_per_iter = total_eval_time / iters_after_start total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start if idx >= num_warmup * 2 or compute_seconds_per_iter > 5: eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1))) log_every_n_seconds( logging.INFO, ( f"Inference done {idx + 1}/{total}. " f"Dataloading: {data_seconds_per_iter:.4f} s/iter. " f"Inference: {compute_seconds_per_iter:.4f} s/iter. " f"Eval: {eval_seconds_per_iter:.4f} s/iter. " f"Total: {total_seconds_per_iter:.4f} s/iter. " f"ETA={eta}" ), n=5, ) start_data_time = time.perf_counter() # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format( total_time_str, total_time / (total - num_warmup), num_devices ) ) total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format( total_compute_time_str, total_compute_time / (total - num_warmup), num_devices ) ) results = evaluator.evaluate() # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results @contextmanager def inference_context(model): """ A context where the model is temporarily changed to eval mode, and restored to previous mode afterwards. Args: model: a torch Module """ training_mode = model.training model.eval() yield model.train(training_mode) ================================================ FILE: annotator/oneformer/oneformer/evaluation/instance_evaluation.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/evaluation/instance_evaluation.py # ------------------------------------------------------------------------------ import contextlib import copy import io import itertools import json import logging import numpy as np import os import pickle from collections import OrderedDict import annotator.oneformer.pycocotools.mask as mask_util import torch from annotator.oneformer.pycocotools.coco import COCO from annotator.oneformer.pycocotools.cocoeval import COCOeval from tabulate import tabulate import annotator.oneformer.detectron2.utils.comm as comm from annotator.oneformer.detectron2.config import CfgNode from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.data.datasets.coco import convert_to_coco_json from annotator.oneformer.detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco from annotator.oneformer.detectron2.evaluation.fast_eval_api import COCOeval_opt from annotator.oneformer.detectron2.structures import Boxes, BoxMode, pairwise_iou from annotator.oneformer.detectron2.utils.file_io import PathManager from annotator.oneformer.detectron2.utils.logger import create_small_table # modified from COCOEvaluator for instance segmetnat class InstanceSegEvaluator(COCOEvaluator): """ Evaluate AR for object proposals, AP for instance detection/segmentation, AP for keypoint detection outputs using COCO's metrics. See http://cocodataset.org/#detection-eval and http://cocodataset.org/#keypoints-eval to understand its metrics. The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed (e.g. due to no predictions made). In addition to COCO, this evaluator is able to support any bounding box detection, instance segmentation, or keypoint detection dataset. """ def _eval_predictions(self, predictions, img_ids=None): """ Evaluate predictions. Fill self._results with the metrics of the tasks. """ self._logger.info("Preparing results for COCO format ...") coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) tasks = self._tasks or self._tasks_from_predictions(coco_results) # unmap the category ids for COCO if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) # num_classes = len(all_contiguous_ids) # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} for result in coco_results: category_id = result["category_id"] # assert category_id < num_classes, ( # f"A prediction has class={category_id}, " # f"but the dataset only has {num_classes} classes and " # f"predicted class id should be in [0, {num_classes - 1}]." # ) assert category_id in reverse_id_mapping, ( f"A prediction has class={category_id}, " f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." ) result["category_id"] = reverse_id_mapping[category_id] if self._output_dir: file_path = os.path.join(self._output_dir, "coco_instances_results.json") self._logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "w") as f: f.write(json.dumps(coco_results)) f.flush() if not self._do_evaluation: self._logger.info("Annotations are not available for evaluation.") return self._logger.info( "Evaluating predictions with {} COCO API...".format( "unofficial" if self._use_fast_impl else "official" ) ) for task in sorted(tasks): assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" coco_eval = ( _evaluate_predictions_on_coco( self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas, use_fast_impl=self._use_fast_impl, img_ids=img_ids, max_dets_per_image=self._max_dets_per_image, ) if len(coco_results) > 0 else None # cocoapi does not handle empty results very well ) res = self._derive_coco_results( coco_eval, task, class_names=self._metadata.get("thing_classes") ) self._results[task] = res ================================================ FILE: annotator/oneformer/oneformer/modeling/__init__.py ================================================ from .backbone.swin import D2SwinTransformer from .backbone.dinat import D2DiNAT from .pixel_decoder.fpn import BasePixelDecoder from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder from .meta_arch.oneformer_head import OneFormerHead ================================================ FILE: annotator/oneformer/oneformer/modeling/backbone/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: annotator/oneformer/oneformer/modeling/backbone/dinat.py ================================================ # -------------------------------------------------------- # Neighborhood Attention Transformer # Licensed under The MIT License # Written by Ali Hassani # -------------------------------------------------------- # Modified by Jitesh Jain import torch import torch.nn as nn from timm.models.layers import DropPath from annotator.oneformer.detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec class NeighborhoodAttention(nn.Module): """ Neighborhood Attention 2D Module """ def __init__( self, dim, num_heads, kernel_size, dilation=1, bias=True, qkv_bias=True, qk_scale=None, attn_drop=0.0, proj_drop=0.0, ): super().__init__() def forward(self, x): return x def extra_repr(self) -> str: return ( f"head_dim={self.head_dim}, num_heads={self.num_heads}, " + f"kernel_size={self.kernel_size}, dilation={self.dilation}, " + f"rel_pos_bias={self.rpb is not None}" ) class ConvTokenizer(nn.Module): def __init__(self, in_chans=3, embed_dim=96, norm_layer=None): super().__init__() self.proj = nn.Sequential( nn.Conv2d(in_chans, embed_dim // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), nn.Conv2d(embed_dim // 2, embed_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), ) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): x = self.proj(x).permute(0, 2, 3, 1) if self.norm is not None: x = self.norm(x) return x class ConvDownsampler(nn.Module): def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) self.norm = norm_layer(2 * dim) def forward(self, x): x = self.reduction(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) x = self.norm(x) return x class Mlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x class NATLayer(nn.Module): def __init__(self, dim, num_heads, kernel_size=7, dilation=None, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, layer_scale=None): super().__init__() self.dim = dim self.num_heads = num_heads self.mlp_ratio = mlp_ratio self.norm1 = norm_layer(dim) self.attn = NeighborhoodAttention( dim, kernel_size=kernel_size, dilation=dilation, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() self.norm2 = norm_layer(dim) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) self.layer_scale = False if layer_scale is not None and type(layer_scale) in [int, float]: self.layer_scale = True self.gamma1 = nn.Parameter(layer_scale * torch.ones(dim), requires_grad=True) self.gamma2 = nn.Parameter(layer_scale * torch.ones(dim), requires_grad=True) def forward(self, x): if not self.layer_scale: shortcut = x x = self.norm1(x) x = self.attn(x) x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x shortcut = x x = self.norm1(x) x = self.attn(x) x = shortcut + self.drop_path(self.gamma1 * x) x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x))) return x class NATBlock(nn.Module): def __init__(self, dim, depth, num_heads, kernel_size, dilations=None, downsample=True, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, layer_scale=None): super().__init__() self.dim = dim self.depth = depth self.blocks = nn.ModuleList([ NATLayer(dim=dim, num_heads=num_heads, kernel_size=kernel_size, dilation=None if dilations is None else dilations[i], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer, layer_scale=layer_scale) for i in range(depth)]) self.downsample = None if not downsample else ConvDownsampler(dim=dim, norm_layer=norm_layer) def forward(self, x): for blk in self.blocks: x = blk(x) if self.downsample is None: return x, x return self.downsample(x), x class DiNAT(nn.Module): def __init__(self, embed_dim, mlp_ratio, depths, num_heads, drop_path_rate=0.2, in_chans=3, kernel_size=7, dilations=None, out_indices=(0, 1, 2, 3), qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., norm_layer=nn.LayerNorm, frozen_stages=-1, layer_scale=None, **kwargs): super().__init__() self.num_levels = len(depths) self.embed_dim = embed_dim self.num_features = [int(embed_dim * 2 ** i) for i in range(self.num_levels)] self.mlp_ratio = mlp_ratio self.patch_embed = ConvTokenizer(in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer) self.pos_drop = nn.Dropout(p=drop_rate) dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] self.levels = nn.ModuleList() for i in range(self.num_levels): level = NATBlock(dim=int(embed_dim * 2 ** i), depth=depths[i], num_heads=num_heads[i], kernel_size=kernel_size, dilations=None if dilations is None else dilations[i], mlp_ratio=self.mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])], norm_layer=norm_layer, downsample=(i < self.num_levels - 1), layer_scale=layer_scale) self.levels.append(level) # add a norm layer for each output self.out_indices = out_indices for i_layer in self.out_indices: layer = norm_layer(self.num_features[i_layer]) layer_name = f'norm{i_layer}' self.add_module(layer_name, layer) self.frozen_stages = frozen_stages def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.frozen_stages >= 2: for i in range(0, self.frozen_stages - 1): m = self.network[i] m.eval() for param in m.parameters(): param.requires_grad = False def train(self, mode=True): super(DiNAT, self).train(mode) self._freeze_stages() def forward_embeddings(self, x): x = self.patch_embed(x) return x def forward_tokens(self, x): outs = {} for idx, level in enumerate(self.levels): x, xo = level(x) if idx in self.out_indices: norm_layer = getattr(self, f'norm{idx}') x_out = norm_layer(xo) outs["res{}".format(idx + 2)] = x_out.permute(0, 3, 1, 2).contiguous() return outs def forward(self, x): x = self.forward_embeddings(x) return self.forward_tokens(x) @BACKBONE_REGISTRY.register() class D2DiNAT(DiNAT, Backbone): def __init__(self, cfg, input_shape): embed_dim = cfg.MODEL.DiNAT.EMBED_DIM mlp_ratio = cfg.MODEL.DiNAT.MLP_RATIO depths = cfg.MODEL.DiNAT.DEPTHS num_heads = cfg.MODEL.DiNAT.NUM_HEADS drop_path_rate = cfg.MODEL.DiNAT.DROP_PATH_RATE kernel_size = cfg.MODEL.DiNAT.KERNEL_SIZE out_indices = cfg.MODEL.DiNAT.OUT_INDICES dilations = cfg.MODEL.DiNAT.DILATIONS super().__init__( embed_dim=embed_dim, mlp_ratio=mlp_ratio, depths=depths, num_heads=num_heads, drop_path_rate=drop_path_rate, kernel_size=kernel_size, out_indices=out_indices, dilations=dilations, ) self._out_features = cfg.MODEL.DiNAT.OUT_FEATURES self._out_feature_strides = { "res2": 4, "res3": 8, "res4": 16, "res5": 32, } self._out_feature_channels = { "res2": self.num_features[0], "res3": self.num_features[1], "res4": self.num_features[2], "res5": self.num_features[3], } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert ( x.dim() == 4 ), f"DiNAT takes an input of shape (N, C, H, W). Got {x.shape} instead!" outputs = {} y = super().forward(x) for k in y.keys(): if k in self._out_features: outputs[k] = y[k] return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } @property def size_divisibility(self): return 32 ================================================ FILE: annotator/oneformer/oneformer/modeling/backbone/swin.py ================================================ # -------------------------------------------------------- # Swin Transformer # Copyright (c) 2021 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ze Liu, Yutong Lin, Yixuan Wei # -------------------------------------------------------- # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former # ------------------------------------------------------------------------------ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint from timm.models.layers import DropPath, to_2tuple, trunc_normal_ from annotator.oneformer.detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec class Mlp(nn.Module): """Multilayer perceptron.""" def __init__( self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x def window_partition(x, window_size): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows def window_reverse(windows, window_size, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x class WindowAttention(nn.Module): """Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. Args: dim (int): Number of input channels. window_size (tuple[int]): The height and width of the window. num_heads (int): Number of attention heads. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop (float, optional): Dropout ratio of output. Default: 0.0 """ def __init__( self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0.0, proj_drop=0.0, ): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer("relative_position_index", relative_position_index) self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) trunc_normal_(self.relative_position_bias_table, std=0.02) self.softmax = nn.Softmax(dim=-1) def forward(self, x, mask=None): """Forward function. Args: x: input features with shape of (num_windows*B, N, C) mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape qkv = ( self.qkv(x) .reshape(B_, N, 3, self.num_heads, C // self.num_heads) .permute(2, 0, 3, 1, 4) ) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = q @ k.transpose(-2, -1) relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1) ].view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 ) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute( 2, 0, 1 ).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) attn = attn.view(-1, self.num_heads, N, N) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B_, N, C) x = self.proj(x) x = self.proj_drop(x) return x class SwinTransformerBlock(nn.Module): """Swin Transformer Block. Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): Window size. shift_size (int): Shift size for SW-MSA. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Module, optional): Activation layer. Default: nn.GELU norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__( self, dim, num_heads, window_size=7, shift_size=0, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, act_layer=nn.GELU, norm_layer=nn.LayerNorm, ): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop ) self.H = None self.W = None def forward(self, x, mask_matrix): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. mask_matrix: Attention mask for cyclic shift. """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.view(B, H, W, C) # pad feature maps to multiples of window size pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) _, Hp, Wp, _ = x.shape # cyclic shift if self.shift_size > 0: shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows x_windows = window_partition( shifted_x, self.window_size ) # nW*B, window_size, window_size, C x_windows = x_windows.view( -1, self.window_size * self.window_size, C ) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :].contiguous() x = x.view(B, H * W, C) # FFN x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class PatchMerging(nn.Module): """Patch Merging Layer Args: dim (int): Number of input channels. norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) self.norm = norm_layer(4 * dim) def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. """ B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.view(B, H, W, C) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Module): """A basic Swin Transformer layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. num_heads (int): Number of attention head. window_size (int): Local window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, dim, depth, num_heads, window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, ): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.ModuleList( [ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer, ) for i in range(depth) ] ) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. """ # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 h_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) w_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition( img_mask, self.window_size ) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( attn_mask == 0, float(0.0) ) for blk in self.blocks: blk.H, blk.W = H, W if self.use_checkpoint: x = checkpoint.checkpoint(blk, x, attn_mask) else: x = blk(x, attn_mask) if self.downsample is not None: x_down = self.downsample(x, H, W) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Module): """Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Module, optional): Normalization layer. Default: None """ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): super().__init__() patch_size = to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): """Forward function.""" # padding _, _, H, W = x.size() if W % self.patch_size[1] != 0: x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) if H % self.patch_size[0] != 0: x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) x = self.proj(x) # B C Wh Ww if self.norm is not None: Wh, Ww = x.size(2), x.size(3) x = x.flatten(2).transpose(1, 2) x = self.norm(x) x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) return x class SwinTransformer(nn.Module): """Swin Transformer backbone. A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - https://arxiv.org/pdf/2103.14030 Args: pretrain_img_size (int): Input image size for training the pretrained model, used in absolute postion embedding. Default 224. patch_size (int | tuple(int)): Patch size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. depths (tuple[int]): Depths of each Swin Transformer stage. num_heads (tuple[int]): Number of attention head of each stage. window_size (int): Window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. drop_rate (float): Dropout rate. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Default: 0.2. norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. patch_norm (bool): If True, add normalization after patch embedding. Default: True. out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, pretrain_img_size=224, patch_size=4, in_chans=3, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, out_indices=(0, 1, 2, 3), frozen_stages=-1, use_checkpoint=False, ): super().__init__() self.pretrain_img_size = pretrain_img_size self.num_layers = len(depths) self.embed_dim = embed_dim self.ape = ape self.patch_norm = patch_norm self.out_indices = out_indices self.frozen_stages = frozen_stages # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None, ) # absolute position embedding if self.ape: pretrain_img_size = to_2tuple(pretrain_img_size) patch_size = to_2tuple(patch_size) patches_resolution = [ pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1], ] self.absolute_pos_embed = nn.Parameter( torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) ) trunc_normal_(self.absolute_pos_embed, std=0.02) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) ] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2 ** i_layer), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint, ) self.layers.append(layer) num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f"norm{i_layer}" self.add_module(layer_name, layer) self._freeze_stages() def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.frozen_stages >= 1 and self.ape: self.absolute_pos_embed.requires_grad = False if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.requires_grad = False def init_weights(self, pretrained=None): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ def _init_weights(m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, x): """Forward function.""" x = self.patch_embed(x) Wh, Ww = x.size(2), x.size(3) if self.ape: # interpolate the position embedding to the corresponding size absolute_pos_embed = F.interpolate( self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" ) x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C else: x = x.flatten(2).transpose(1, 2) x = self.pos_drop(x) outs = {} for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs["res{}".format(i + 2)] = out return outs def train(self, mode=True): """Convert the model into training mode while keep layers freezed.""" super(SwinTransformer, self).train(mode) self._freeze_stages() @BACKBONE_REGISTRY.register() class D2SwinTransformer(SwinTransformer, Backbone): def __init__(self, cfg, input_shape): pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE patch_size = cfg.MODEL.SWIN.PATCH_SIZE in_chans = 3 embed_dim = cfg.MODEL.SWIN.EMBED_DIM depths = cfg.MODEL.SWIN.DEPTHS num_heads = cfg.MODEL.SWIN.NUM_HEADS window_size = cfg.MODEL.SWIN.WINDOW_SIZE mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO qkv_bias = cfg.MODEL.SWIN.QKV_BIAS qk_scale = cfg.MODEL.SWIN.QK_SCALE drop_rate = cfg.MODEL.SWIN.DROP_RATE attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE norm_layer = nn.LayerNorm ape = cfg.MODEL.SWIN.APE patch_norm = cfg.MODEL.SWIN.PATCH_NORM use_checkpoint = cfg.MODEL.SWIN.USE_CHECKPOINT super().__init__( pretrain_img_size, patch_size, in_chans, embed_dim, depths, num_heads, window_size, mlp_ratio, qkv_bias, qk_scale, drop_rate, attn_drop_rate, drop_path_rate, norm_layer, ape, patch_norm, use_checkpoint=use_checkpoint, ) self._out_features = cfg.MODEL.SWIN.OUT_FEATURES self._out_feature_strides = { "res2": 4, "res3": 8, "res4": 16, "res5": 32, } self._out_feature_channels = { "res2": self.num_features[0], "res3": self.num_features[1], "res4": self.num_features[2], "res5": self.num_features[3], } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert ( x.dim() == 4 ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" outputs = {} y = super().forward(x) for k in y.keys(): if k in self._out_features: outputs[k] = y[k] return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } @property def size_divisibility(self): return 32 ================================================ FILE: annotator/oneformer/oneformer/modeling/matcher.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/matcher.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ """ Modules to compute the matching cost and solve the corresponding LSAP. """ import torch import torch.nn.functional as F from scipy.optimize import linear_sum_assignment from torch import nn from torch.cuda.amp import autocast import numpy as np # from annotator.oneformer.detectron2.projects.point_rend.point_features import point_sample def linear_sum_assignment_with_nan(cost_matrix): cost_matrix = np.asarray(cost_matrix) nan = np.isnan(cost_matrix).any() nan_all = np.isnan(cost_matrix).all() empty = cost_matrix.size == 0 if not empty: if nan_all: print('Matrix contains all NaN values!') elif nan: print('Matrix contains NaN values!') if nan_all: cost_matrix = np.empty(shape=(0, 0)) elif nan: cost_matrix[np.isnan(cost_matrix)] = 100 return linear_sum_assignment(cost_matrix) def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Compute the DICE loss, similar to generalized IOU for masks Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). """ inputs = inputs.sigmoid() inputs = inputs.flatten(1) numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] loss = 1 - (numerator + 1) / (denominator + 1) return loss batch_dice_loss_jit = torch.jit.script( batch_dice_loss ) # type: torch.jit.ScriptModule def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): """ Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). Returns: Loss tensor """ hw = inputs.shape[1] pos = F.binary_cross_entropy_with_logits( inputs, torch.ones_like(inputs), reduction="none" ) neg = F.binary_cross_entropy_with_logits( inputs, torch.zeros_like(inputs), reduction="none" ) loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( "nc,mc->nm", neg, (1 - targets) ) return loss / hw batch_sigmoid_ce_loss_jit = torch.jit.script( batch_sigmoid_ce_loss ) # type: torch.jit.ScriptModule class HungarianMatcher(nn.Module): """This class computes an assignment between the targets and the predictions of the network For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are un-matched (and thus treated as non-objects). """ def __init__(self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1, num_points: int = 0): """Creates the matcher Params: cost_class: This is the relative weight of the classification error in the matching cost cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost """ super().__init__() self.cost_class = cost_class self.cost_mask = cost_mask self.cost_dice = cost_dice assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" self.num_points = num_points @torch.no_grad() def memory_efficient_forward(self, outputs, targets): """More memory-friendly matching""" bs, num_queries = outputs["pred_logits"].shape[:2] indices = [] # Iterate through batch size for b in range(bs): out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] tgt_ids = targets[b]["labels"] # Compute the classification cost. Contrary to the loss, we don't use the NLL, # but approximate it in 1 - proba[target class]. # The 1 is a constant that doesn't change the matching, it can be ommitted. cost_class = -out_prob[:, tgt_ids] out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] # gt masks are already padded when preparing target tgt_mask = targets[b]["masks"].to(out_mask) out_mask = out_mask[:, None] tgt_mask = tgt_mask[:, None] # all masks share the same set of points for efficient matching! point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device) # get gt labels tgt_mask = point_sample( tgt_mask, point_coords.repeat(tgt_mask.shape[0], 1, 1), align_corners=False, ).squeeze(1) out_mask = point_sample( out_mask, point_coords.repeat(out_mask.shape[0], 1, 1), align_corners=False, ).squeeze(1) with autocast(enabled=False): out_mask = out_mask.float() tgt_mask = tgt_mask.float() # Compute the focal loss between masks cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) # Compute the dice loss betwen masks cost_dice = batch_dice_loss(out_mask, tgt_mask) # Final cost matrix C = ( self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice ) C = C.reshape(num_queries, -1).cpu() indices.append(linear_sum_assignment_with_nan(C)) return [ (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices ] @torch.no_grad() def forward(self, outputs, targets): """Performs the matching Params: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) """ return self.memory_efficient_forward(outputs, targets) def __repr__(self, _repr_indent=4): head = "Matcher " + self.__class__.__name__ body = [ "cost_class: {}".format(self.cost_class), "cost_mask: {}".format(self.cost_mask), "cost_dice: {}".format(self.cost_dice), ] lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: annotator/oneformer/oneformer/modeling/meta_arch/__init__.py ================================================ ================================================ FILE: annotator/oneformer/oneformer/modeling/meta_arch/oneformer_head.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/meta_arch/mask_former_head.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import logging from copy import deepcopy from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm from annotator.oneformer.detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..pixel_decoder.fpn import build_pixel_decoder from ..transformer_decoder.oneformer_transformer_decoder import build_transformer_decoder @SEM_SEG_HEADS_REGISTRY.register() class OneFormerHead(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): newk = k.replace(prefix, prefix + "pixel_decoder.") # logger.debug(f"{k} ==> {newk}") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, num_classes: int, pixel_decoder: nn.Module, loss_weight: float = 1.0, ignore_value: int = -1, # extra parameters transformer_predictor: nn.Module, transformer_in_feature: str, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features num_classes: number of classes to predict pixel_decoder: the pixel decoder module loss_weight: loss weight ignore_value: category id to be ignored during training. transformer_predictor: the transformer decoder that makes prediction transformer_in_feature: input feature name to the transformer_predictor """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] self.ignore_value = ignore_value self.common_stride = 4 self.loss_weight = loss_weight self.pixel_decoder = pixel_decoder self.predictor = transformer_predictor self.transformer_in_feature = transformer_in_feature self.num_classes = num_classes @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): # figure out in_channels to transformer predictor if cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM elif cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM elif cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM else: transformer_predictor_in_channels = input_shape[cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE].channels return { "input_shape": { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES }, "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, "pixel_decoder": build_pixel_decoder(cfg, input_shape), "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, "transformer_in_feature": cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE, "transformer_predictor": build_transformer_decoder( cfg, transformer_predictor_in_channels, mask_classification=True, ), } def forward(self, features, tasks, mask=None): return self.layers(features, tasks, mask) def layers(self, features, tasks, mask=None): mask_features, transformer_encoder_features, multi_scale_features, _, _ = self.pixel_decoder.forward_features(features) if self.transformer_in_feature == "multi_scale_pixel_decoder": predictions = self.predictor(multi_scale_features, mask_features, tasks, mask) else: if self.transformer_in_feature == "transformer_encoder": assert ( transformer_encoder_features is not None ), "Please use the TransformerEncoderPixelDecoder." predictions = self.predictor(transformer_encoder_features, mask_features, mask) elif self.transformer_in_feature == "pixel_embedding": predictions = self.predictor(mask_features, mask_features, mask) else: predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) return predictions ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/fpn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ from torch.cuda.amp import autocast from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d, DeformConv, ShapeSpec, get_norm from annotator.oneformer.detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.position_encoding import PositionEmbeddingSine from ..transformer_decoder.transformer import TransformerEncoder, TransformerEncoderLayer, _get_clones, _get_activation_fn def build_pixel_decoder(cfg, input_shape): """ Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`. """ name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) forward_features = getattr(model, "forward_features", None) if not callable(forward_features): raise ValueError( "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. " f"Please implement forward_features for {name} to only return mask features." ) return model # This is a modified FPN decoder. @SEM_SEG_HEADS_REGISTRY.register() class BasePixelDecoder(nn.Module): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. norm (str or callable): normalization for all conv layers """ super().__init__() input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" feature_channels = [v.channels for k, v in input_shape] lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(feature_channels): if idx == len(self.in_features) - 1: output_norm = get_norm(norm, conv_dim) output_conv = Conv2d( in_channels, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(output_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(None) output_convs.append(output_conv) else: lateral_norm = get_norm(norm, conv_dim) output_norm = get_norm(norm, conv_dim) lateral_conv = Conv2d( in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) self.add_module("adapter_{}".format(idx + 1), lateral_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] self.mask_dim = mask_dim self.mask_features = Conv2d( conv_dim, mask_dim, kernel_size=3, stride=1, padding=1, ) weight_init.c2_xavier_fill(self.mask_features) self.oneformer_num_feature_levels = 3 # always use 3 scales @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = {} ret["input_shape"] = { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES } ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM return ret def forward_features(self, features): multi_scale_features = [] num_cur_levels = 0 # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[::-1]): x = features[f] lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] if lateral_conv is None: y = output_conv(x) else: cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") y = output_conv(y) if num_cur_levels < self.oneformer_num_feature_levels: multi_scale_features.append(y) num_cur_levels += 1 return self.mask_features(y), None, multi_scale_features def forward(self, features, targets=None): logger = logging.getLogger(__name__) logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.") return self.forward_features(features) class TransformerEncoderOnly(nn.Module): def __init__( self, d_model=512, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) encoder_norm = nn.LayerNorm(d_model) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) self._reset_parameters() self.d_model = d_model self.nhead = nhead def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, src, mask, pos_embed): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) if mask is not None: mask = mask.flatten(1) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) return memory.permute(1, 2, 0).view(bs, c, h, w) # This is a modified FPN decoder with extra Transformer encoder that processes the lowest-resolution feature map. @SEM_SEG_HEADS_REGISTRY.register() class TransformerEncoderPixelDecoder(BasePixelDecoder): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, transformer_dropout: float, transformer_nheads: int, transformer_dim_feedforward: int, transformer_enc_layers: int, transformer_pre_norm: bool, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_dropout: dropout probability in transformer transformer_nheads: number of heads in transformer transformer_dim_feedforward: dimension of feedforward network transformer_enc_layers: number of transformer encoder layers transformer_pre_norm: whether to use pre-layernorm or not conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. norm (str or callable): normalization for all conv layers """ super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm) input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" feature_strides = [v.stride for k, v in input_shape] feature_channels = [v.channels for k, v in input_shape] in_channels = feature_channels[len(self.in_features) - 1] self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1) weight_init.c2_xavier_fill(self.input_proj) self.transformer = TransformerEncoderOnly( d_model=conv_dim, dropout=transformer_dropout, nhead=transformer_nheads, dim_feedforward=transformer_dim_feedforward, num_encoder_layers=transformer_enc_layers, normalize_before=transformer_pre_norm, ) N_steps = conv_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) # update layer use_bias = norm == "" output_norm = get_norm(norm, conv_dim) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(output_conv) delattr(self, "layer_{}".format(len(self.in_features))) self.add_module("layer_{}".format(len(self.in_features)), output_conv) self.output_convs[0] = output_conv @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = super().from_config(cfg, input_shape) ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD ret[ "transformer_enc_layers" ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM return ret def forward_features(self, features): multi_scale_features = [] num_cur_levels = 0 # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[::-1]): x = features[f] lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] if lateral_conv is None: transformer = self.input_proj(x) pos = self.pe_layer(x) transformer = self.transformer(transformer, None, pos) y = output_conv(transformer) # save intermediate feature as input to Transformer decoder transformer_encoder_features = transformer else: cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") y = output_conv(y) if num_cur_levels < self.oneformer_num_feature_levels: multi_scale_features.append(y) num_cur_levels += 1 return self.mask_features(y), transformer_encoder_features, multi_scale_features def forward(self, features, targets=None): logger = logging.getLogger(__name__) logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.") return self.forward_features(features) ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/msdeformattn.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ from torch.cuda.amp import autocast from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm from annotator.oneformer.detectron2.modeling import SEM_SEG_HEADS_REGISTRY from ..transformer_decoder.position_encoding import PositionEmbeddingSine from ..transformer_decoder.transformer import _get_clones, _get_activation_fn from .ops.modules import MSDeformAttn # MSDeformAttn Transformer encoder in deformable detr class MSDeformAttnTransformerEncoderOnly(nn.Module): def __init__(self, d_model=256, nhead=8, num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", num_feature_levels=4, enc_n_points=4, ): super().__init__() self.d_model = d_model self.nhead = nhead encoder_layer = MSDeformAttnTransformerEncoderLayer(d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points) self.encoder = MSDeformAttnTransformerEncoder(encoder_layer, num_encoder_layers) self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MSDeformAttn): m._reset_parameters() normal_(self.level_embed) def get_valid_ratio(self, mask): _, H, W = mask.shape valid_H = torch.sum(~mask[:, :, 0], 1) valid_W = torch.sum(~mask[:, 0, :], 1) valid_ratio_h = valid_H.float() / H valid_ratio_w = valid_W.float() / W valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def forward(self, srcs, pos_embeds): masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs] # prepare input for encoder src_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): bs, c, h, w = src.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) src = src.flatten(2).transpose(1, 2) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose(1, 2) lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) lvl_pos_embed_flatten.append(lvl_pos_embed) src_flatten.append(src) mask_flatten.append(mask) src_flatten = torch.cat(src_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) # encoder memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten) return memory, spatial_shapes, level_start_index, valid_ratios class MSDeformAttnTransformerEncoderLayer(nn.Module): def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4): super().__init__() # self attention self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) # ffn self.linear1 = nn.Linear(d_model, d_ffn) self.activation = _get_activation_fn(activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(d_ffn, d_model) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm(d_model) @staticmethod def with_pos_embed(tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): # self attention src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class MSDeformAttnTransformerEncoder(nn.Module): def __init__(self, encoder_layer, num_layers): super().__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers @staticmethod def get_reference_points(spatial_shapes, valid_ratios, device): reference_points_list = [] for lvl, (H_, W_) in enumerate(spatial_shapes): ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) ref = torch.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = torch.cat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None): output = src reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) for _, layer in enumerate(self.layers): output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) return output @SEM_SEG_HEADS_REGISTRY.register() class MSDeformAttnPixelDecoder(nn.Module): @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, transformer_dropout: float, transformer_nheads: int, transformer_dim_feedforward: int, transformer_enc_layers: int, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, # deformable transformer encoder args transformer_in_features: List[str], common_stride: int, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_dropout: dropout probability in transformer transformer_nheads: number of heads in transformer transformer_dim_feedforward: dimension of feedforward network transformer_enc_layers: number of transformer encoder layers conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. norm (str or callable): normalization for all conv layers """ super().__init__() transformer_input_shape = { k: v for k, v in input_shape.items() if k in transformer_in_features } # this is the input shape of pixel decoder input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" self.feature_strides = [v.stride for k, v in input_shape] self.feature_channels = [v.channels for k, v in input_shape] # this is the input shape of transformer encoder (could use less features than pixel decoder transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride) self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5" transformer_in_channels = [v.channels for k, v in transformer_input_shape] self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape] # to decide extra FPN layers self.transformer_num_feature_levels = len(self.transformer_in_features) if self.transformer_num_feature_levels > 1: input_proj_list = [] # from low resolution to high resolution (res5 -> res2) for in_channels in transformer_in_channels[::-1]: input_proj_list.append(nn.Sequential( nn.Conv2d(in_channels, conv_dim, kernel_size=1), nn.GroupNorm(32, conv_dim), )) self.input_proj = nn.ModuleList(input_proj_list) else: self.input_proj = nn.ModuleList([ nn.Sequential( nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1), nn.GroupNorm(32, conv_dim), )]) for proj in self.input_proj: nn.init.xavier_uniform_(proj[0].weight, gain=1) nn.init.constant_(proj[0].bias, 0) self.transformer = MSDeformAttnTransformerEncoderOnly( d_model=conv_dim, dropout=transformer_dropout, nhead=transformer_nheads, dim_feedforward=transformer_dim_feedforward, num_encoder_layers=transformer_enc_layers, num_feature_levels=self.transformer_num_feature_levels, ) N_steps = conv_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) self.mask_dim = mask_dim # use 1x1 conv instead self.mask_features = Conv2d( conv_dim, mask_dim, kernel_size=1, stride=1, padding=0, ) weight_init.c2_xavier_fill(self.mask_features) self.oneformer_num_feature_levels = 3 # always use 3 scales self.common_stride = common_stride # extra fpn levels stride = min(self.transformer_feature_strides) self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride)) lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]): lateral_norm = get_norm(norm, conv_dim) output_norm = get_norm(norm, conv_dim) lateral_conv = Conv2d( in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) self.add_module("adapter_{}".format(idx + 1), lateral_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = {} ret["input_shape"] = { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES } ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM ret["transformer_dropout"] = cfg.MODEL.ONE_FORMER.DROPOUT ret["transformer_nheads"] = cfg.MODEL.ONE_FORMER.NHEADS # ret["transformer_dim_feedforward"] = cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD ret["transformer_dim_feedforward"] = 1024 # use 1024 for deformable transformer encoder ret[ "transformer_enc_layers" ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config ret["transformer_in_features"] = cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES ret["common_stride"] = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE return ret @autocast(enabled=False) def forward_features(self, features): srcs = [] pos = [] # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.transformer_in_features[::-1]): x = features[f].float() # deformable detr does not support half precision srcs.append(self.input_proj[idx](x)) pos.append(self.pe_layer(x)) y, spatial_shapes, level_start_index, valid_ratios = self.transformer(srcs, pos) bs = y.shape[0] split_size_or_sections = [None] * self.transformer_num_feature_levels for i in range(self.transformer_num_feature_levels): if i < self.transformer_num_feature_levels - 1: split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i] else: split_size_or_sections[i] = y.shape[1] - level_start_index[i] y = torch.split(y, split_size_or_sections, dim=1) out = [] multi_scale_features = [] num_cur_levels = 0 for i, z in enumerate(y): out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1])) # append `out` with extra FPN levels # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): x = features[f].float() lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(out[-1], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False) y = output_conv(y) out.append(y) for o in out: if num_cur_levels < self.oneformer_num_feature_levels: multi_scale_features.append(o) num_cur_levels += 1 return self.mask_features(out[-1]), out[0], multi_scale_features, spatial_shapes, level_start_index ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/functions/__init__.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn_func import MSDeformAttnFunction ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import torch import torch.nn.functional as F from torch.autograd import Function from torch.autograd.function import once_differentiable # if torch.cuda.is_available(): # try: # import MultiScaleDeformableAttention as MSDA # except ModuleNotFoundError as e: # info_string = ( # "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" # "\t`cd oneformer/modeling/pixel_decoder/ops`\n" # "\t`sh make.sh`\n" # ) # raise ModuleNotFoundError(info_string) # else: # MultiScaleDeformableAttention = None class MSDeformAttnFunction(Function): @staticmethod def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): # ctx.im2col_step = im2col_step output = ms_deform_attn_core_pytorch( value, value_spatial_shapes, sampling_locations, attention_weights) # ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output # @staticmethod # @once_differentiable # def backward(ctx, grad_output): # value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors # grad_value, grad_sampling_loc, grad_attn_weight = \ # MSDA.ms_deform_attn_backward( # value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) # # return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): # for debug and test only, # need to use cuda version instead N_, S_, M_, D_ = value.shape _, Lq_, M_, L_, P_, _ = sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for lid_, (H_, W_) in enumerate(value_spatial_shapes): # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) # N_*M_, D_, Lq_, P_ sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) return output.transpose(1, 2).contiguous() ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/make.sh ================================================ #!/usr/bin/env bash # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR FORCE_CUDA=1 python setup.py build install ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/modules/__init__.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn import MSDeformAttn ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import warnings import math import torch from torch import nn import torch.nn.functional as F from torch.nn.init import xavier_uniform_, constant_ MSDeformAttnFunction = None from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n-1) == 0) and n != 0 class MSDeformAttn(nn.Module): def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): """ Multi-Scale Deformable Attention Module :param d_model hidden dimension :param n_levels number of feature levels :param n_heads number of attention heads :param n_points number of sampling points per attention head per feature level """ super().__init__() if d_model % n_heads != 0: raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) _d_per_head = d_model // n_heads # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_head): warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " "which is more efficient in our CUDA implementation.") self.im2col_step = 128 self.d_model = d_model self.n_levels = n_levels self.n_heads = n_heads self.n_points = n_points self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) self.value_proj = nn.Linear(d_model, d_model) self.output_proj = nn.Linear(d_model, d_model) self._reset_parameters() def _reset_parameters(self): constant_(self.sampling_offsets.weight.data, 0.) thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) for i in range(self.n_points): grid_init[:, :, i, :] *= i + 1 with torch.no_grad(): self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) constant_(self.attention_weights.weight.data, 0.) constant_(self.attention_weights.bias.data, 0.) xavier_uniform_(self.value_proj.weight.data) constant_(self.value_proj.bias.data, 0.) xavier_uniform_(self.output_proj.weight.data) constant_(self.output_proj.bias.data, 0.) def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): """ :param query (N, Length_{query}, C) :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements :return output (N, Length_{query}, C) """ N, Len_q, _ = query.shape N, Len_in, _ = input_flatten.shape assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in value = self.value_proj(input_flatten) if input_padding_mask is not None: value = value.masked_fill(input_padding_mask[..., None], float(0)) value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) # N, Len_q, n_heads, n_levels, n_points, 2 if reference_points.shape[-1] == 2: offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets / offset_normalizer[None, None, None, :, None, :] elif reference_points.shape[-1] == 4: sampling_locations = reference_points[:, :, None, :, None, :2] \ + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 else: raise ValueError( 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) # try: output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) # # For FLOPs calculation only # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) return output ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/setup.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR import os import glob import torch from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) sources = main_file + source_cpu extension = CppExtension extra_compile_args = {"cxx": []} define_macros = [] # Force cuda since torch ask for a device, not if cuda is in fact available. if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] else: if CUDA_HOME is None: raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') else: raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ extension( "MultiScaleDeformableAttention", sources, include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args, ) ] return ext_modules setup( name="MultiScaleDeformableAttention", version="1.0", author="Weijie Su", url="https://github.com/fundamentalvision/Deformable-DETR", description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", packages=find_packages(exclude=("configs", "tests",)), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include #include #include at::Tensor ms_deform_attn_cpu_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { AT_ERROR("Not implement on cpu"); } std::vector ms_deform_attn_cpu_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { AT_ERROR("Not implement on cpu"); } ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include at::Tensor ms_deform_attn_cpu_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step); std::vector ms_deform_attn_cpu_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include #include "cuda/ms_deform_im2col_cuda.cuh" #include #include #include #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); const int batch = value.size(0); const int spatial_size = value.size(1); const int num_heads = value.size(2); const int channels = value.size(3); const int num_levels = spatial_shapes.size(0); const int num_query = sampling_loc.size(1); const int num_point = sampling_loc.size(4); const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); const int batch_n = im2col_step_; auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); auto per_value_size = spatial_size * num_heads * channels; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; for (int n = 0; n < batch/im2col_step_; ++n) { auto columns = output_n.select(0, n); AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), value.data() + n * im2col_step_ * per_value_size, spatial_shapes.data(), level_start_index.data(), sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, attn_weight.data() + n * im2col_step_ * per_attn_weight_size, batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, columns.data()); })); } output = output.view({batch, num_query, num_heads*channels}); return output; } std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); const int batch = value.size(0); const int spatial_size = value.size(1); const int num_heads = value.size(2); const int channels = value.size(3); const int num_levels = spatial_shapes.size(0); const int num_query = sampling_loc.size(1); const int num_point = sampling_loc.size(4); const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); auto grad_value = at::zeros_like(value); auto grad_sampling_loc = at::zeros_like(sampling_loc); auto grad_attn_weight = at::zeros_like(attn_weight); const int batch_n = im2col_step_; auto per_value_size = spatial_size * num_heads * channels; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); for (int n = 0; n < batch/im2col_step_; ++n) { auto grad_output_g = grad_output_n.select(0, n); AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), grad_output_g.data(), value.data() + n * im2col_step_ * per_value_size, spatial_shapes.data(), level_start_index.data(), sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, attn_weight.data() + n * im2col_step_ * per_attn_weight_size, batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value.data() + n * im2col_step_ * per_value_size, grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); })); } return { grad_value, grad_sampling_loc, grad_attn_weight }; } ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step); std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh ================================================ /*! ************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************** * Modified from DCN (https://github.com/msracver/Deformable-ConvNets) * Copyright (c) 2018 Microsoft ************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include #include #include #include #include #include #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } template __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t* &grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value+ptr1, w1*top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value+ptr2, w2*top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value+ptr4, w4*top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_attn_weight = top_grad * val; *grad_sampling_loc = width * grad_w_weight * top_grad_value; *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; } template __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t* &grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value+ptr1, w1*top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value+ptr2, w2*top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value+ptr4, w4*top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } template __global__ void ms_deformable_im2col_gpu_kernel(const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *data_col) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; scalar_t *data_col_ptr = data_col + index; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; scalar_t col = 0; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_col_ptr = col; } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockSize/2; s>0; s>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, grad_sampling_loc, grad_attn_weight); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t* data_value, const int64_t* data_spatial_shapes, const int64_t* data_level_start_index, const scalar_t* data_sampling_loc, const scalar_t* data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t* data_col) { const int num_kernels = batch_size * num_query * num_heads * channels; const int num_actual_kernels = batch_size * num_query * num_heads * channels; const int num_threads = CUDA_NUM_THREADS; ms_deformable_im2col_gpu_kernel <<>>( num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); } } template void ms_deformable_col2im_cuda(cudaStream_t stream, const scalar_t* grad_col, const scalar_t* data_value, const int64_t * data_spatial_shapes, const int64_t * data_level_start_index, const scalar_t * data_sampling_loc, const scalar_t * data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t* grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; const int num_kernels = batch_size * num_query * num_heads * channels; const int num_actual_kernels = batch_size * num_query * num_heads * channels; if (channels > 1024) { if ((channels & 1023) == 0) { ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } else { ms_deformable_col2im_gpu_kernel_gm <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } } else{ switch(channels) { case 1: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 2: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 4: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 8: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 16: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 32: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 64: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 128: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 256: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 512: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; case 1024: ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); break; default: if (channels < 64) { ms_deformable_col2im_gpu_kernel_shm_reduce_v1 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } else { ms_deformable_col2im_gpu_kernel_shm_reduce_v2 <<>>( num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, grad_value, grad_sampling_loc, grad_attn_weight); } } } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include "cpu/ms_deform_attn_cpu.h" #ifdef WITH_CUDA #include "cuda/ms_deform_attn_cuda.h" #endif at::Tensor ms_deform_attn_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step) { if (value.type().is_cuda()) { #ifdef WITH_CUDA return ms_deform_attn_cuda_forward( value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } std::vector ms_deform_attn_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step) { if (value.type().is_cuda()) { #ifdef WITH_CUDA return ms_deform_attn_cuda_backward( value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); #else AT_ERROR("Not compiled with GPU support"); #endif } AT_ERROR("Not implemented on the CPU"); } ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/src/vision.cpp ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include "ms_deform_attn.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); } ================================================ FILE: annotator/oneformer/oneformer/modeling/pixel_decoder/ops/test.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import time import torch import torch.nn as nn from torch.autograd import gradcheck from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch N, M, D = 1, 2, 2 Lq, L, P = 2, 2, 2 shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) S = sum([(H*W).item() for H, W in shapes]) torch.manual_seed(3) @torch.no_grad() def check_forward_equal_with_pytorch_double(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_forward_equal_with_pytorch_float(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): value = torch.rand(N, S, M, channels).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 func = MSDeformAttnFunction.apply value.requires_grad = grad_value sampling_locations.requires_grad = grad_sampling_loc attention_weights.requires_grad = grad_attn_weight gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) print(f'* {gradok} check_gradient_numerical(D={channels})') if __name__ == '__main__': check_forward_equal_with_pytorch_double() check_forward_equal_with_pytorch_float() for channels in [30, 32, 64, 71, 1025, 2048, 3096]: check_gradient_numerical(channels, True, True, True) ================================================ FILE: annotator/oneformer/oneformer/modeling/transformer_decoder/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .oneformer_transformer_decoder import ContrastiveMultiScaleMaskedTransformerDecoder ================================================ FILE: annotator/oneformer/oneformer/modeling/transformer_decoder/oneformer_transformer_decoder.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ import logging import fvcore.nn.weight_init as weight_init from typing import Optional import torch from torch import nn, Tensor from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.layers import Conv2d from .position_encoding import PositionEmbeddingSine from .transformer import Transformer from annotator.oneformer.detectron2.utils.registry import Registry TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") TRANSFORMER_DECODER_REGISTRY.__doc__ = """ Registry for transformer module in OneFormer. """ def build_transformer_decoder(cfg, in_channels, mask_classification=True): """ Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. """ name = cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) class SelfAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) q = k = self.with_pos_embed(tgt2, query_pos) tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, tgt_mask, tgt_key_padding_mask, query_pos) return self.forward_post(tgt, tgt_mask, tgt_key_padding_mask, query_pos) class CrossAttentionLayer(nn.Module): def __init__(self, d_model, nhead, dropout=0.0, activation="relu", normalize_before=False): super().__init__() self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.norm = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): tgt2 = self.norm(tgt) tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask)[0] tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt, memory, memory_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None): if self.normalize_before: return self.forward_pre(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) return self.forward_post(tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos) class FFNLayer(nn.Module): def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False): super().__init__() # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm = nn.LayerNorm(d_model) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post(self, tgt): tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout(tgt2) tgt = self.norm(tgt) return tgt def forward_pre(self, tgt): tgt2 = self.norm(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout(tgt2) return tgt def forward(self, tgt): if self.normalize_before: return self.forward_pre(tgt) return self.forward_post(tgt) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(F"activation should be relu/gelu, not {activation}.") class MLP(nn.Module): """ Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x @TRANSFORMER_DECODER_REGISTRY.register() class ContrastiveMultiScaleMaskedTransformerDecoder(nn.Module): _version = 2 def _load_from_state_dict( self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ): version = local_metadata.get("version", None) if version is None or version < 2: # Do not warn if train from scratch scratch = True logger = logging.getLogger(__name__) for k in list(state_dict.keys()): newk = k if "static_query" in k: newk = k.replace("static_query", "query_feat") if newk != k: state_dict[newk] = state_dict[k] del state_dict[k] scratch = False if not scratch: logger.warning( f"Weight format of {self.__class__.__name__} have changed! " "Please upgrade your models. Applying automatic conversion now ..." ) @configurable def __init__( self, in_channels, mask_classification=True, *, num_classes: int, hidden_dim: int, num_queries: int, nheads: int, dropout: float, dim_feedforward: int, enc_layers: int, is_train: bool, dec_layers: int, class_dec_layers: int, pre_norm: bool, mask_dim: int, enforce_input_project: bool, use_task_norm: bool, ): """ NOTE: this interface is experimental. Args: in_channels: channels of the input features mask_classification: whether to add mask classifier or not num_classes: number of classes hidden_dim: Transformer feature dimension num_queries: number of queries nheads: number of heads dim_feedforward: feature dimension in feedforward network enc_layers: number of Transformer encoder layers dec_layers: number of Transformer decoder layers pre_norm: whether to use pre-LayerNorm or not mask_dim: mask feature dimension enforce_input_project: add input project 1x1 conv even if input channels and hidden dim is identical """ super().__init__() assert mask_classification, "Only support mask classification model" self.mask_classification = mask_classification self.is_train = is_train self.use_task_norm = use_task_norm # positional encoding N_steps = hidden_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) self.class_transformer = Transformer( d_model=hidden_dim, dropout=dropout, nhead=nheads, dim_feedforward=dim_feedforward, num_encoder_layers=enc_layers, num_decoder_layers=class_dec_layers, normalize_before=pre_norm, return_intermediate_dec=False, ) # define Transformer decoder here self.num_heads = nheads self.num_layers = dec_layers self.transformer_self_attention_layers = nn.ModuleList() self.transformer_cross_attention_layers = nn.ModuleList() self.transformer_ffn_layers = nn.ModuleList() for _ in range(self.num_layers): self.transformer_self_attention_layers.append( SelfAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_cross_attention_layers.append( CrossAttentionLayer( d_model=hidden_dim, nhead=nheads, dropout=0.0, normalize_before=pre_norm, ) ) self.transformer_ffn_layers.append( FFNLayer( d_model=hidden_dim, dim_feedforward=dim_feedforward, dropout=0.0, normalize_before=pre_norm, ) ) self.decoder_norm = nn.LayerNorm(hidden_dim) self.num_queries = num_queries # learnable query p.e. self.query_embed = nn.Embedding(num_queries, hidden_dim) # level embedding (we always use 3 scales) self.num_feature_levels = 3 self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) self.input_proj = nn.ModuleList() for _ in range(self.num_feature_levels): if in_channels != hidden_dim or enforce_input_project: self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) weight_init.c2_xavier_fill(self.input_proj[-1]) else: self.input_proj.append(nn.Sequential()) self.class_input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) weight_init.c2_xavier_fill(self.class_input_proj) # output FFNs if self.mask_classification: self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) @classmethod def from_config(cls, cfg, in_channels, mask_classification): ret = {} ret["in_channels"] = in_channels ret["mask_classification"] = mask_classification ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.ONE_FORMER.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES # Transformer parameters: ret["nheads"] = cfg.MODEL.ONE_FORMER.NHEADS ret["dim_feedforward"] = cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD # NOTE: because we add learnable query features which requires supervision, # we add minus 1 to decoder layers to be consistent with our loss # implementation: that is, number of auxiliary losses is always # equal to number of decoder layers. With learnable query features, the number of # auxiliary losses equals number of decoders plus 1. assert cfg.MODEL.ONE_FORMER.DEC_LAYERS >= 1 ret["dec_layers"] = cfg.MODEL.ONE_FORMER.DEC_LAYERS - 1 ret["class_dec_layers"] = cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS ret["enc_layers"] = cfg.MODEL.ONE_FORMER.ENC_LAYERS ret["dropout"] = cfg.MODEL.ONE_FORMER.DROPOUT ret["pre_norm"] = cfg.MODEL.ONE_FORMER.PRE_NORM ret["enforce_input_project"] = cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ ret["is_train"] = cfg.MODEL.IS_TRAIN ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["use_task_norm"] = cfg.MODEL.ONE_FORMER.USE_TASK_NORM return ret def forward(self, x, mask_features, tasks, mask = None): # x is a list of multi-scale feature assert len(x) == self.num_feature_levels src = [] pos = [] size_list = [] # disable mask, it does not affect performance del mask for i in range(self.num_feature_levels): size_list.append(x[i].shape[-2:]) pos.append(self.pe_layer(x[i], None).flatten(2)) src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) # flatten NxCxHxW to HWxNxC pos[-1] = pos[-1].permute(2, 0, 1) src[-1] = src[-1].permute(2, 0, 1) _, bs, _ = src[0].shape # QxNxC query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) tasks = tasks.unsqueeze(0) if self.use_task_norm: tasks = self.decoder_norm(tasks) feats = self.pe_layer(mask_features, None) out_t, _ = self.class_transformer(feats, None, self.query_embed.weight[:-1], self.class_input_proj(mask_features), tasks if self.use_task_norm else None) out_t = out_t[0].permute(1, 0, 2) out = torch.cat([out_t, tasks], dim=0) output = out.clone() predictions_class = [] predictions_mask = [] # prediction heads on learnable query features outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0], i=0) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) for i in range(self.num_layers): level_index = i % self.num_feature_levels attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False # attention: cross-attention first output = self.transformer_cross_attention_layers[i]( output, src[level_index], memory_mask=attn_mask, memory_key_padding_mask=None, # here we do not apply masking on padded region pos=pos[level_index], query_pos=query_embed ) output = self.transformer_self_attention_layers[i]( output, tgt_mask=None, tgt_key_padding_mask=None, query_pos=query_embed ) # FFN output = self.transformer_ffn_layers[i]( output ) outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels], i=i+1) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) assert len(predictions_class) == self.num_layers + 1 if self.is_train: query_class = out.permute(1, 0, 2) else: query_class = None out = { 'contrastive_logits': query_class, 'pred_logits': predictions_class[-1], 'pred_masks': predictions_mask[-1], 'aux_outputs': self._set_aux_loss( predictions_class if self.mask_classification else None, predictions_mask, ) } return out def forward_prediction_heads(self, output, mask_features, attn_mask_target_size, i): decoder_output = self.decoder_norm(output) decoder_output = decoder_output.transpose(0, 1) outputs_class = self.class_embed(decoder_output) mask_embed = self.mask_embed(decoder_output) outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) # NOTE: prediction is of higher-resolution # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW] attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False) # save_attn_masks(attn_mask.sigmoid() < 0.5, fname=f'demo/maps/{i}_pre_bool') # must use bool type # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() attn_mask = attn_mask.detach() return outputs_class, outputs_mask, attn_mask @torch.jit.unused def _set_aux_loss(self, outputs_class, outputs_seg_masks): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. if self.mask_classification: aux_list = [ {"pred_logits": a, "pred_masks": b} for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) ] else: aux_list = [{"pred_masks": b} for b, in outputs_seg_masks[:-1]] return aux_list ================================================ FILE: annotator/oneformer/oneformer/modeling/transformer_decoder/position_encoding.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/position_encoding.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ """ Various positional encodings for the transformer. """ import math import torch from torch import nn class PositionEmbeddingSine(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. """ def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, x, mask=None): if mask is None: mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) not_mask = ~mask y_embed = not_mask.cumsum(1, dtype=torch.float32) x_embed = not_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) pos_x = x_embed[:, :, :, None] / dim_t pos_y = y_embed[:, :, :, None] / dim_t pos_x = torch.stack( (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos_y = torch.stack( (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) return pos def __repr__(self, _repr_indent=4): head = "Positional encoding " + self.__class__.__name__ body = [ "num_pos_feats: {}".format(self.num_pos_feats), "temperature: {}".format(self.temperature), "normalize: {}".format(self.normalize), "scale: {}".format(self.scale), ] # _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: annotator/oneformer/oneformer/modeling/transformer_decoder/text_transformer.py ================================================ # ------------------------------------------------------------------------- # MIT License # # Copyright (c) 2021 OpenAI # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # # ------------------------------------------------------------------------- import torch import torch.utils.checkpoint as checkpoint from torch import nn from collections import OrderedDict from timm.models.layers import trunc_normal_ class Attention(nn.Module): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights self.scale = qk_scale or head_dim ** -0.5 self.q_proj = nn.Linear(dim, dim, bias=qkv_bias) self.k_proj = nn.Linear(dim, dim, bias=qkv_bias) self.v_proj = nn.Linear(dim, dim, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, q, k, v): B, N, C = q.shape assert k.shape == v.shape B, M, C = k.shape q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads) k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads) v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads) attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale attn = attn.softmax(dim=-1) x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C) x = self.proj(x) x = self.proj_drop(x) return x class TransformerDecoderLayer(nn.Module): def __init__( self, d_model, nhead, dropout=0.1, ): super().__init__() self.self_attn = Attention(d_model, nhead, proj_drop=dropout) self.cross_attn = Attention(d_model, nhead, proj_drop=dropout) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) self.mlp = nn.Sequential( nn.Linear(d_model, d_model * 4), nn.GELU(), nn.Dropout(dropout), nn.Linear(d_model * 4, d_model) ) def forward(self, x, mem): q = k = v = self.norm1(x) x = x + self.self_attn(q, k, v) q = self.norm2(x) x = x + self.cross_attn(q, mem, mem) x = x + self.dropout(self.mlp(self.norm3(x))) return x class ContextDecoder(nn.Module): def __init__(self, transformer_width=256, transformer_heads=4, transformer_layers=6, visual_dim=1024, dropout=0.1, **kwargs): super().__init__() self.memory_proj = nn.Sequential( nn.LayerNorm(visual_dim), nn.Linear(visual_dim, transformer_width), nn.LayerNorm(transformer_width), ) self.text_proj = nn.Sequential( nn.LayerNorm(visual_dim), nn.Linear(visual_dim, transformer_width), ) self.decoder = nn.ModuleList([ TransformerDecoderLayer(transformer_width, transformer_heads, dropout) for _ in range(transformer_layers) ]) self.out_proj = nn.Sequential( nn.LayerNorm(transformer_width), nn.Linear(transformer_width, visual_dim) ) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, text, visual): B, N, C = visual.shape visual = self.memory_proj(visual) x = self.text_proj(text) for layer in self.decoder: x = layer(x, visual) return self.out_proj(x) class QuickGELU(nn.Module): def forward(self, x: torch.Tensor): return x * torch.sigmoid(1.702 * x) class ResidualAttentionBlock(nn.Module): def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): super().__init__() self.attn = nn.MultiheadAttention(d_model, n_head) self.ln_1 = nn.LayerNorm(d_model) self.mlp = nn.Sequential( OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), ('gelu', QuickGELU()), ('c_proj', nn.Linear(d_model * 4, d_model))])) self.ln_2 = nn.LayerNorm(d_model) self.attn_mask = attn_mask def attention(self, x: torch.Tensor, key_padding_mask: torch.Tensor): self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask, key_padding_mask=key_padding_mask)[0] def forward(self, x: torch.Tensor, key_padding_mask=None): x = x + self.attention(self.ln_1(x), key_padding_mask=key_padding_mask) x = x + self.mlp(self.ln_2(x)) return x class Transformer(nn.Module): def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False): super().__init__() self.width = width self.layers = layers self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) proj_std = (self.width**-0.5) * ((2 * self.layers)**-0.5) attn_std = self.width**-0.5 fc_std = (2 * self.width)**-0.5 for block in self.resblocks: nn.init.normal_(block.attn.in_proj_weight, std=attn_std) nn.init.normal_(block.attn.out_proj.weight, std=proj_std) nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) self.use_checkpoint = use_checkpoint def forward(self, x: torch.Tensor): for resblock in self.resblocks: if self.use_checkpoint: x = checkpoint.checkpoint(resblock, x) else: x = resblock(x) return x class TextTransformer(nn.Module): def __init__( self, context_length: int, width: int, layers: int, vocab_size, use_checkpoint=False, ): super().__init__() heads = width // 64 self.context_length = context_length self.width = width self.transformer = Transformer( width=width, layers=layers, heads=heads, attn_mask=self.build_attention_mask(), use_checkpoint=use_checkpoint) self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width)) self.ln_final = nn.LayerNorm(width) self.token_embedding = nn.Embedding(vocab_size, width) nn.init.normal_(self.token_embedding.weight, std=0.02) # initialization nn.init.normal_(self.positional_embedding, std=0.01) def build_attention_mask(self): # lazily create causal attention mask, with full attention between the vision tokens # pytorch uses additive attention mask; fill with -inf mask = torch.empty(self.context_length, self.context_length) mask.fill_(float('-inf')) mask.triu_(1) # zero out the lower diagonal return mask def forward(self, text): x = self.token_embedding(text) x = x + self.positional_embedding x = x.permute(1, 0, 2) # NLD -> LND x = self.transformer(x) x = x.permute(1, 0, 2) # LND -> NLD x = self.ln_final(x) # x.shape = [batch_size, n_ctx, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] return x ================================================ FILE: annotator/oneformer/oneformer/modeling/transformer_decoder/transformer.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/transformer.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ """ Transformer class. Copy-paste from torch.nn.Transformer with modifications: * positional encodings are passed in MHattention * extra LN at the end of encoder is removed * decoder returns a stack of activations from all decoding layers """ import copy from typing import List, Optional import torch import torch.nn.functional as F from torch import Tensor, nn class Transformer(nn.Module): def __init__( self, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, return_intermediate_dec=False, ): super().__init__() encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) encoder_norm = nn.LayerNorm(d_model) if normalize_before else None self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) decoder_layer = TransformerDecoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) decoder_norm = nn.LayerNorm(d_model) self.decoder = TransformerDecoder( decoder_layer, num_decoder_layers, decoder_norm, return_intermediate=return_intermediate_dec, ) self._reset_parameters() self.d_model = d_model self.nhead = nhead def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, src, mask, query_embed, pos_embed, task_token=None): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) if mask is not None: mask = mask.flatten(1) if task_token is None: tgt = torch.zeros_like(query_embed) else: tgt = task_token.repeat(query_embed.shape[0], 1, 1) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) hs = self.decoder( tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed ) return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) class TransformerEncoder(nn.Module): def __init__(self, encoder_layer, num_layers, norm=None): super().__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.norm = norm def forward( self, src, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): output = src for layer in self.layers: output = layer( output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos ) if self.norm is not None: output = self.norm(output) return output class TransformerDecoder(nn.Module): def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): super().__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm self.return_intermediate = return_intermediate def forward( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): output = tgt intermediate = [] for layer in self.layers: output = layer( output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask, pos=pos, query_pos=query_pos, ) if self.return_intermediate: intermediate.append(self.norm(output)) if self.norm is not None: output = self.norm(output) if self.return_intermediate: intermediate.pop() intermediate.append(output) if self.return_intermediate: return torch.stack(intermediate) return output.unsqueeze(0) class TransformerEncoderLayer(nn.Module): def __init__( self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(src, pos) src2 = self.self_attn( q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask )[0] src = src + self.dropout1(src2) src = self.norm1(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) src = src + self.dropout2(src2) src = self.norm2(src) return src def forward_pre( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): src2 = self.norm1(src) q = k = self.with_pos_embed(src2, pos) src2 = self.self_attn( q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask )[0] src = src + self.dropout1(src2) src2 = self.norm2(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) src = src + self.dropout2(src2) return src def forward( self, src, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): if self.normalize_before: return self.forward_pre(src, src_mask, src_key_padding_mask, pos) return self.forward_post(src, src_mask, src_key_padding_mask, pos) class TransformerDecoderLayer(nn.Module): def __init__( self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False, ): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.dropout3 = nn.Dropout(dropout) self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward_post( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn( q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask )[0] tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout3(tgt2) tgt = self.norm3(tgt) return tgt def forward_pre( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): tgt2 = self.norm1(tgt) q = k = self.with_pos_embed(tgt2, query_pos) tgt2 = self.self_attn( q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask )[0] tgt = tgt + self.dropout1(tgt2) tgt2 = self.norm2(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt2 = self.norm3(tgt) tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) tgt = tgt + self.dropout3(tgt2) return tgt def forward( self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): if self.normalize_before: return self.forward_pre( tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, ) return self.forward_post( tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, ) def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu raise RuntimeError(f"activation should be relu/gelu, not {activation}.") ================================================ FILE: annotator/oneformer/oneformer/oneformer_model.py ================================================ # ------------------------------------------------------------------------------ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/maskformer_model.py # Modified by Jitesh Jain (https://github.com/praeclarumjj3) # ------------------------------------------------------------------------------ from typing import Tuple import torch from torch import nn from torch.nn import functional as F from annotator.oneformer.detectron2.config import configurable from annotator.oneformer.detectron2.data import MetadataCatalog from annotator.oneformer.detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head from annotator.oneformer.detectron2.modeling.backbone import Backbone from annotator.oneformer.detectron2.modeling.postprocessing import sem_seg_postprocess from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, BitMasks from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom from .modeling.matcher import HungarianMatcher from einops import rearrange from .modeling.transformer_decoder.text_transformer import TextTransformer from .modeling.transformer_decoder.oneformer_transformer_decoder import MLP from annotator.oneformer.oneformer.data.tokenizer import SimpleTokenizer, Tokenize @META_ARCH_REGISTRY.register() class OneFormer(nn.Module): """ Main class for mask classification semantic segmentation architectures. """ @configurable def __init__( self, *, backbone: Backbone, sem_seg_head: nn.Module, task_mlp: nn.Module, text_encoder: nn.Module, text_projector: nn.Module, prompt_ctx: nn.Embedding, num_queries: int, object_mask_threshold: float, overlap_threshold: float, metadata, size_divisibility: int, sem_seg_postprocess_before_inference: bool, pixel_mean: Tuple[float], pixel_std: Tuple[float], # inference semantic_on: bool, panoptic_on: bool, instance_on: bool, detection_on: bool, test_topk_per_image: int, task_seq_len: int, max_seq_len: int, is_demo: bool, ): """ Args: backbone: a backbone module, must follow detectron2's backbone interface sem_seg_head: a module that predicts semantic segmentation from backbone features criterion: a module that defines the loss num_queries: int, number of queries object_mask_threshold: float, threshold to filter query based on classification score for panoptic segmentation inference overlap_threshold: overlap threshold used in general inference for panoptic segmentation metadata: dataset meta, get `thing` and `stuff` category names for panoptic segmentation inference size_divisibility: Some backbones require the input height and width to be divisible by a specific integer. We can use this to override such requirement. sem_seg_postprocess_before_inference: whether to resize the prediction back to original input size before semantic segmentation inference or after. For high-resolution dataset like Mapillary, resizing predictions before inference will cause OOM error. pixel_mean, pixel_std: list or tuple with #channels element, representing the per-channel mean and std to be used to normalize the input image semantic_on: bool, whether to output semantic segmentation prediction instance_on: bool, whether to output instance segmentation prediction panoptic_on: bool, whether to output panoptic segmentation prediction test_topk_per_image: int, instance segmentation parameter, keep topk instances per image """ super().__init__() self.backbone = backbone self.sem_seg_head = sem_seg_head self.task_mlp = task_mlp self.text_encoder = text_encoder self.text_projector = text_projector self.prompt_ctx = prompt_ctx self.num_queries = num_queries self.overlap_threshold = overlap_threshold self.object_mask_threshold = object_mask_threshold self.metadata = metadata if size_divisibility < 0: # use backbone size_divisibility if not set size_divisibility = self.backbone.size_divisibility self.size_divisibility = size_divisibility self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) # additional args self.semantic_on = semantic_on self.instance_on = instance_on self.panoptic_on = panoptic_on self.detection_on = detection_on self.test_topk_per_image = test_topk_per_image self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len) self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len) self.is_demo = is_demo self.thing_indices = [k for k in self.metadata.thing_dataset_id_to_contiguous_id.keys()] if not self.semantic_on: assert self.sem_seg_postprocess_before_inference @classmethod def from_config(cls, cfg): backbone = build_backbone(cfg) sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape()) if cfg.MODEL.IS_TRAIN: text_encoder = TextTransformer(context_length=cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH, width=cfg.MODEL.TEXT_ENCODER.WIDTH, layers=cfg.MODEL.TEXT_ENCODER.NUM_LAYERS, vocab_size=cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE) text_projector = MLP(text_encoder.width, cfg.MODEL.ONE_FORMER.HIDDEN_DIM, cfg.MODEL.ONE_FORMER.HIDDEN_DIM, cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS) if cfg.MODEL.TEXT_ENCODER.N_CTX > 0: prompt_ctx = nn.Embedding(cfg.MODEL.TEXT_ENCODER.N_CTX, cfg.MODEL.TEXT_ENCODER.WIDTH) else: prompt_ctx = None else: text_encoder = None text_projector = None prompt_ctx = None task_mlp = MLP(cfg.INPUT.TASK_SEQ_LEN, cfg.MODEL.ONE_FORMER.HIDDEN_DIM, cfg.MODEL.ONE_FORMER.HIDDEN_DIM, 2) # Loss parameters: deep_supervision = cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION no_object_weight = cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT # loss weights class_weight = cfg.MODEL.ONE_FORMER.CLASS_WEIGHT dice_weight = cfg.MODEL.ONE_FORMER.DICE_WEIGHT mask_weight = cfg.MODEL.ONE_FORMER.MASK_WEIGHT contrastive_weight = cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT # building criterion matcher = HungarianMatcher( cost_class=class_weight, cost_mask=mask_weight, cost_dice=dice_weight, num_points=cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS, ) weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, "loss_dice": dice_weight, "loss_contrastive": contrastive_weight} if deep_supervision: dec_layers = cfg.MODEL.ONE_FORMER.DEC_LAYERS aux_weight_dict = {} for i in range(dec_layers - 1): aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) losses = ["labels", "masks", "contrastive"] return { "backbone": backbone, "sem_seg_head": sem_seg_head, "task_mlp": task_mlp, "prompt_ctx": prompt_ctx, "text_encoder": text_encoder, "text_projector": text_projector, "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES, "object_mask_threshold": cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD, "overlap_threshold": cfg.MODEL.TEST.OVERLAP_THRESHOLD, "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), "size_divisibility": cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY, "sem_seg_postprocess_before_inference": ( cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE or cfg.MODEL.TEST.PANOPTIC_ON or cfg.MODEL.TEST.INSTANCE_ON ), "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, # inference "semantic_on": cfg.MODEL.TEST.SEMANTIC_ON, "instance_on": cfg.MODEL.TEST.INSTANCE_ON, "panoptic_on": cfg.MODEL.TEST.PANOPTIC_ON, "detection_on": cfg.MODEL.TEST.DETECTION_ON, "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE, "task_seq_len": cfg.INPUT.TASK_SEQ_LEN, "max_seq_len": cfg.INPUT.MAX_SEQ_LEN, "is_demo": cfg.MODEL.IS_DEMO, } @property def device(self): return self.pixel_mean.device def encode_text(self, text): assert text.ndim in [2, 3], text.ndim b = text.shape[0] squeeze_dim = False num_text = 1 if text.ndim == 3: num_text = text.shape[1] text = rearrange(text, 'b n l -> (b n) l', n=num_text) squeeze_dim = True # [B, C] x = self.text_encoder(text) text_x = self.text_projector(x) if squeeze_dim: text_x = rearrange(text_x, '(b n) c -> b n c', n=num_text) if self.prompt_ctx is not None: text_ctx = self.prompt_ctx.weight.unsqueeze(0).repeat(text_x.shape[0], 1, 1) text_x = torch.cat([text_x, text_ctx], dim=1) return {"texts": text_x} def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "instances": per-region ground truth * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model (may be different from input resolution), used in inference. Returns: list[dict]: each dict has the results for one image. The dict contains the following keys: * "sem_seg": A Tensor that represents the per-pixel segmentation prediced by the head. The prediction has shape KxHxW that represents the logits of each class for each pixel. * "panoptic_seg": A tuple that represent panoptic output panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. segments_info (list[dict]): Describe each segment in `panoptic_seg`. Each dict contains keys "id", "category_id", "isthing". """ images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.size_divisibility) tasks = torch.cat([self.task_tokenizer(x["task"]).to(self.device).unsqueeze(0) for x in batched_inputs], dim=0) tasks = self.task_mlp(tasks.float()) features = self.backbone(images.tensor) outputs = self.sem_seg_head(features, tasks) if self.training: texts = torch.cat([self.text_tokenizer(x["text"]).to(self.device).unsqueeze(0) for x in batched_inputs], dim=0) texts_x = self.encode_text(texts) outputs = {**outputs, **texts_x} # mask classification target if "instances" in batched_inputs[0]: gt_instances = [x["instances"].to(self.device) for x in batched_inputs] targets = self.prepare_targets(gt_instances, images) else: targets = None # bipartite matching-based loss losses = self.criterion(outputs, targets) for k in list(losses.keys()): if k in self.criterion.weight_dict: losses[k] *= self.criterion.weight_dict[k] else: # remove this loss if not specified in `weight_dict` losses.pop(k) return losses else: mask_cls_results = outputs["pred_logits"] mask_pred_results = outputs["pred_masks"] # upsample masks mask_pred_results = F.interpolate( mask_pred_results, size=(images.tensor.shape[-2], images.tensor.shape[-1]), mode="bilinear", align_corners=False, ) del outputs processed_results = [] for i, data in enumerate(zip( mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes )): mask_cls_result, mask_pred_result, input_per_image, image_size = data height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) processed_results.append({}) if self.sem_seg_postprocess_before_inference: mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)( mask_pred_result, image_size, height, width ) mask_cls_result = mask_cls_result.to(mask_pred_result) # semantic segmentation inference if self.semantic_on: r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result) if not self.sem_seg_postprocess_before_inference: r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width) processed_results[-1]["sem_seg"] = r # panoptic segmentation inference if self.panoptic_on: panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result) processed_results[-1]["panoptic_seg"] = panoptic_r # instance segmentation inference if self.instance_on: instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result) processed_results[-1]["instances"] = instance_r if self.detection_on: bbox_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result) processed_results[-1]["box_instances"] = bbox_r return processed_results def prepare_targets(self, targets, images): h_pad, w_pad = images.tensor.shape[-2:] new_targets = [] for targets_per_image in targets: # pad gt gt_masks = targets_per_image.gt_masks padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device) padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks new_targets.append( { "labels": targets_per_image.gt_classes, "masks": padded_masks, } ) return new_targets def semantic_inference(self, mask_cls, mask_pred): mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] mask_pred = mask_pred.sigmoid() semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) return semseg def panoptic_inference(self, mask_cls, mask_pred): scores, labels = F.softmax(mask_cls, dim=-1).max(-1) mask_pred = mask_pred.sigmoid() keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold) cur_scores = scores[keep] cur_classes = labels[keep] cur_masks = mask_pred[keep] cur_mask_cls = mask_cls[keep] cur_mask_cls = cur_mask_cls[:, :-1] cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks h, w = cur_masks.shape[-2:] panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device) segments_info = [] current_segment_id = 0 if cur_masks.shape[0] == 0: # We didn't detect any mask :( return panoptic_seg, segments_info else: # take argmax cur_mask_ids = cur_prob_masks.argmax(0) stuff_memory_list = {} for k in range(cur_classes.shape[0]): pred_class = cur_classes[k].item() isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values() mask_area = (cur_mask_ids == k).sum().item() original_area = (cur_masks[k] >= 0.5).sum().item() mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5) if mask_area > 0 and original_area > 0 and mask.sum().item() > 0: if mask_area / original_area < self.overlap_threshold: continue # merge stuff regions if not isthing: if int(pred_class) in stuff_memory_list.keys(): panoptic_seg[mask] = stuff_memory_list[int(pred_class)] continue else: stuff_memory_list[int(pred_class)] = current_segment_id + 1 current_segment_id += 1 panoptic_seg[mask] = current_segment_id segments_info.append( { "id": current_segment_id, "isthing": bool(isthing), "category_id": int(pred_class), } ) return panoptic_seg, segments_info def instance_inference(self, mask_cls, mask_pred): # mask_pred is already processed to have the same shape as original input image_size = mask_pred.shape[-2:] # [Q, K] scores = F.softmax(mask_cls, dim=-1)[:, :-1] labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1) # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False) scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False) labels_per_image = labels[topk_indices] topk_indices = topk_indices // self.sem_seg_head.num_classes # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1) mask_pred = mask_pred[topk_indices] # Only consider scores with confidence over [self.object_mask_threshold] for demo if self.is_demo: keep = scores_per_image > self.object_mask_threshold scores_per_image = scores_per_image[keep] labels_per_image = labels_per_image[keep] mask_pred = mask_pred[keep] # if this is panoptic segmentation, we only keep the "thing" classes if self.panoptic_on: keep = torch.zeros_like(scores_per_image).bool() for i, lab in enumerate(labels_per_image): keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values() scores_per_image = scores_per_image[keep] labels_per_image = labels_per_image[keep] mask_pred = mask_pred[keep] if 'ade20k' in self.metadata.name: for i in range(labels_per_image.shape[0]): labels_per_image[i] = self.thing_indices.index(labels_per_image[i].item()) result = Instances(image_size) # mask (before sigmoid) result.pred_masks = (mask_pred > 0).float() if self.detection_on: # Uncomment the following to get boxes from masks (this is slow) result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes() else: result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4)) # calculate average mask prob mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6) result.scores = scores_per_image * mask_scores_per_image result.pred_classes = labels_per_image return result ================================================ FILE: annotator/oneformer/oneformer/utils/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. from .events import setup_wandb, WandbWriter ================================================ FILE: annotator/oneformer/oneformer/utils/box_ops.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved """ Utilities for bounding box manipulation and GIoU. """ import torch, os from torchvision.ops.boxes import box_area def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return torch.stack(b, dim=-1) def box_xyxy_to_cxcywh(x): x0, y0, x1, y1 = x.unbind(-1) b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] return torch.stack(b, dim=-1) # modified from torchvision to also return the union def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) # import ipdb; ipdb.set_trace() lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] wh = (rb - lt).clamp(min=0) # [N,M,2] inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] union = area1[:, None] + area2 - inter iou = inter / (union + 1e-6) return iou, union def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/ The boxes should be in [x0, y0, x1, y1] format Returns a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) """ # degenerate boxes gives inf / nan results # so do an early check assert (boxes1[:, 2:] >= boxes1[:, :2]).all() assert (boxes2[:, 2:] >= boxes2[:, :2]).all() # except: # import ipdb; ipdb.set_trace() iou, union = box_iou(boxes1, boxes2) lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) wh = (rb - lt).clamp(min=0) # [N,M,2] area = wh[:, :, 0] * wh[:, :, 1] return iou - (area - union) / (area + 1e-6) # modified from torchvision to also return the union def box_iou_pairwise(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] wh = (rb - lt).clamp(min=0) # [N,2] inter = wh[:, 0] * wh[:, 1] # [N] union = area1 + area2 - inter iou = inter / union return iou, union def generalized_box_iou_pairwise(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/ Input: - boxes1, boxes2: N,4 Output: - giou: N, 4 """ # degenerate boxes gives inf / nan results # so do an early check assert (boxes1[:, 2:] >= boxes1[:, :2]).all() assert (boxes2[:, 2:] >= boxes2[:, :2]).all() assert boxes1.shape == boxes2.shape iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 lt = torch.min(boxes1[:, :2], boxes2[:, :2]) rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) wh = (rb - lt).clamp(min=0) # [N,2] area = wh[:, 0] * wh[:, 1] return iou - (area - union) / area def masks_to_boxes(masks): """Compute the bounding boxes around the provided masks The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. Returns a [N, 4] tensors, with the boxes in xyxy format """ if masks.numel() == 0: return torch.zeros((0, 4), device=masks.device) h, w = masks.shape[-2:] y = torch.arange(0, h, dtype=torch.float) x = torch.arange(0, w, dtype=torch.float) y, x = torch.meshgrid(y, x) x_mask = (masks * x.unsqueeze(0)) x_max = x_mask.flatten(1).max(-1)[0] x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] y_mask = (masks * y.unsqueeze(0)) y_max = y_mask.flatten(1).max(-1)[0] y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] return torch.stack([x_min, y_min, x_max, y_max], 1) if __name__ == '__main__': x = torch.rand(5, 4) y = torch.rand(3, 4) iou, union = box_iou(x, y) ================================================ FILE: annotator/oneformer/oneformer/utils/events.py ================================================ import os import wandb from annotator.oneformer.detectron2.utils import comm from annotator.oneformer.detectron2.utils.events import EventWriter, get_event_storage def setup_wandb(cfg, args): if comm.is_main_process(): init_args = { k.lower(): v for k, v in cfg.WANDB.items() if isinstance(k, str) and k not in ["config"] } # only include most related part to avoid too big table # TODO: add configurable params to select which part of `cfg` should be saved in config if "config_exclude_keys" in init_args: init_args["config"] = cfg init_args["config"]["cfg_file"] = args.config_file else: init_args["config"] = { "model": cfg.MODEL, "solver": cfg.SOLVER, "cfg_file": args.config_file, } if ("name" not in init_args) or (init_args["name"] is None): init_args["name"] = os.path.basename(args.config_file) else: init_args["name"] = init_args["name"] + '_' + os.path.basename(args.config_file) wandb.init(**init_args) class BaseRule(object): def __call__(self, target): return target class IsIn(BaseRule): def __init__(self, keyword: str): self.keyword = keyword def __call__(self, target): return self.keyword in target class Prefix(BaseRule): def __init__(self, keyword: str): self.keyword = keyword def __call__(self, target): return "/".join([self.keyword, target]) class WandbWriter(EventWriter): """ Write all scalars to a tensorboard file. """ def __init__(self): """ Args: log_dir (str): the directory to save the output events kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)` """ self._last_write = -1 self._group_rules = [ (IsIn("/"), BaseRule()), (IsIn("loss"), Prefix("train")), ] def write(self): storage = get_event_storage() def _group_name(scalar_name): for (rule, op) in self._group_rules: if rule(scalar_name): return op(scalar_name) return scalar_name stats = { _group_name(name): scalars[0] for name, scalars in storage.latest().items() if scalars[1] > self._last_write } if len(stats) > 0: self._last_write = max([v[1] for k, v in storage.latest().items()]) # storage.put_{image,histogram} is only meant to be used by # tensorboard writer. So we access its internal fields directly from here. if len(storage._vis_data) >= 1: stats["image"] = [ wandb.Image(img, caption=img_name) for img_name, img, step_num in storage._vis_data ] # Storage stores all image data and rely on this writer to clear them. # As a result it assumes only one writer will use its image data. # An alternative design is to let storage store limited recent # data (e.g. only the most recent image) that all writers can access. # In that case a writer may not see all image data if its period is long. storage.clear_images() if len(storage._histograms) >= 1: def create_bar(tag, bucket_limits, bucket_counts, **kwargs): data = [ [label, val] for (label, val) in zip(bucket_limits, bucket_counts) ] table = wandb.Table(data=data, columns=["label", "value"]) return wandb.plot.bar(table, "label", "value", title=tag) stats["hist"] = [create_bar(**params) for params in storage._histograms] storage.clear_histograms() if len(stats) == 0: return wandb.log(stats, step=storage.iter) def close(self): wandb.finish() ================================================ FILE: annotator/oneformer/oneformer/utils/misc.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py """ Misc functions, including distributed helpers. Mostly copy-paste from torchvision references. """ from typing import List, Optional import torch import torch.distributed as dist import torchvision from torch import Tensor import warnings import torch.nn.functional as F import math def inverse_sigmoid(x, eps=1e-3): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1/x2) def _no_grad_trunc_normal_(tensor, mean, std, a, b): # Cut & paste from PyTorch official master until it's in a few official releases - RW # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf def norm_cdf(x): # Computes standard normal cumulative distribution function return (1. + math.erf(x / math.sqrt(2.))) / 2. if (mean < a - 2 * std) or (mean > b + 2 * std): warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " "The distribution of values may be incorrect.", stacklevel=2) with torch.no_grad(): # Values are generated by using a truncated uniform distribution and # then using the inverse CDF for the normal distribution. # Get upper and lower cdf values l = norm_cdf((a - mean) / std) u = norm_cdf((b - mean) / std) # Uniformly fill tensor with values from [l, u], then translate to # [2l-1, 2u-1]. tensor.uniform_(2 * l - 1, 2 * u - 1) # Use inverse cdf transform for normal distribution to get truncated # standard normal tensor.erfinv_() # Transform to proper mean, std tensor.mul_(std * math.sqrt(2.)) tensor.add_(mean) # Clamp to ensure it's in the proper range tensor.clamp_(min=a, max=b) return tensor def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): # type: (Tensor, float, float, float, float) -> Tensor r"""Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for generating the random values works best when :math:`a \leq \text{mean} \leq b`. Args: tensor: an n-dimensional `torch.Tensor` mean: the mean of the normal distribution std: the standard deviation of the normal distribution a: the minimum cutoff value b: the maximum cutoff value Examples: >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w) """ return _no_grad_trunc_normal_(tensor, mean, std, a, b) def resize(input, size=None, scale_factor=None, mode='nearest', align_corners=None, warning=True): if warning: if size is not None and align_corners: input_h, input_w = tuple(int(x) for x in input.shape[2:]) output_h, output_w = tuple(int(x) for x in size) if output_h > input_h or output_w > output_h: if ((output_h > 1 and output_w > 1 and input_h > 1 and input_w > 1) and (output_h - 1) % (input_h - 1) and (output_w - 1) % (input_w - 1)): warnings.warn( f'When align_corners={align_corners}, ' 'the output would more aligned if ' f'input size {(input_h, input_w)} is `x+1` and ' f'out size {(output_h, output_w)} is `nx+1`') if isinstance(size, torch.Size): size = tuple(int(x) for x in size) return F.interpolate(input, size, scale_factor, mode, align_corners) def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] for sublist in the_list[1:]: for index, item in enumerate(sublist): maxes[index] = max(maxes[index], item) return maxes class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask def to(self, device): # type: (Device) -> NestedTensor # noqa cast_tensor = self.tensors.to(device) mask = self.mask if mask is not None: assert mask is not None cast_mask = mask.to(device) else: cast_mask = None return NestedTensor(cast_tensor, cast_mask) def decompose(self): return self.tensors, self.mask def __repr__(self): return str(self.tensors) def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): # TODO make this more general if tensor_list[0].ndim == 3: if torchvision._is_tracing(): # nested_tensor_from_tensor_list() does not export well to ONNX # call _onnx_nested_tensor_from_tensor_list() instead return _onnx_nested_tensor_from_tensor_list(tensor_list) # TODO make it support different-sized images max_size = _max_by_axis([list(img.shape) for img in tensor_list]) # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) batch_shape = [len(tensor_list)] + max_size b, c, h, w = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) mask = torch.ones((b, h, w), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], : img.shape[2]] = False else: raise ValueError("not supported") return NestedTensor(tensor, mask) # _onnx_nested_tensor_from_tensor_list() is an implementation of # nested_tensor_from_tensor_list() that is supported by ONNX tracing. @torch.jit.unused def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: max_size = [] for i in range(tensor_list[0].dim()): max_size_i = torch.max( torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) ).to(torch.int64) max_size.append(max_size_i) max_size = tuple(max_size) # work around for # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) # m[: img.shape[1], :img.shape[2]] = False # which is not yet supported in onnx padded_imgs = [] padded_masks = [] for img in tensor_list: padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) padded_imgs.append(padded_img) m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) padded_masks.append(padded_mask.to(torch.bool)) tensor = torch.stack(padded_imgs) mask = torch.stack(padded_masks) return NestedTensor(tensor, mask=mask) def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True ================================================ FILE: annotator/oneformer/oneformer/utils/pos_embed.py ================================================ # -------------------------------------------------------- # Position embedding utils # -------------------------------------------------------- from typing import Tuple import numpy as np import torch # -------------------------------------------------------- # 2D sine-cosine position embedding # References: # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py # MoCo v3: https://github.com/facebookresearch/moco-v3 # -------------------------------------------------------- def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): """ grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) """ grid_h = np.arange(grid_size, dtype=np.float32) grid_w = np.arange(grid_size, dtype=np.float32) grid = np.meshgrid(grid_w, grid_h) # here w goes first grid = np.stack(grid, axis=0) grid = grid.reshape([2, 1, grid_size, grid_size]) pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) if cls_token: pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) return pos_embed def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): assert embed_dim % 2 == 0 # use half of dimensions to encode grid_h emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) return emb def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): """ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D) """ assert embed_dim % 2 == 0 omega = np.arange(embed_dim // 2, dtype=np.float) omega /= embed_dim / 2.0 omega = 1.0 / 10000 ** omega # (D/2,) pos = pos.reshape(-1) # (M,) out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product emb_sin = np.sin(out) # (M, D/2) emb_cos = np.cos(out) # (M, D/2) emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) return emb # -------------------------------------------------------- # Interpolate position embeddings for high-resolution # References: # DeiT: https://github.com/facebookresearch/deit # -------------------------------------------------------- def interpolate_pos_embed(model, checkpoint_model, pos_embed_key): if pos_embed_key in checkpoint_model: pos_embed_checkpoint = checkpoint_model[pos_embed_key] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = model.num_patches if pos_embed_key.startswith("decoder"): num_extra_tokens = model.decoder_pos_embed.shape[-2] - num_patches else: num_extra_tokens = model.pos_embed.shape[-2] - num_patches # height (== width) for the checkpoint position embedding orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) # height (== width) for the new position embedding new_size = int(num_patches ** 0.5) # class_token and dist_token are kept unchanged if orig_size != new_size: print( "Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size) ) extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] pos_tokens = pos_tokens.reshape( -1, orig_size, orig_size, embedding_size ).permute(0, 3, 1, 2) pos_tokens = torch.nn.functional.interpolate( pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False, ) pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) checkpoint_model[pos_embed_key] = new_pos_embed def interpolate_pos_embed_online( pos_embed, orig_size: Tuple[int], new_size: Tuple[int], num_extra_tokens: int ): extra_tokens = pos_embed[:, :num_extra_tokens] pos_tokens = pos_embed[:, num_extra_tokens:] embedding_size = pos_tokens.shape[-1] pos_tokens = pos_tokens.reshape( -1, orig_size[0], orig_size[1], embedding_size ).permute(0, 3, 1, 2) pos_tokens = torch.nn.functional.interpolate( pos_tokens, size=new_size, mode="bicubic", align_corners=False, ) pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) return new_pos_embed ================================================ FILE: annotator/oneformer/pycocotools/__init__.py ================================================ __author__ = 'tylin' ================================================ FILE: annotator/oneformer/pycocotools/coco.py ================================================ __author__ = 'tylin' __version__ = '2.0' # Interface for accessing the Microsoft COCO dataset. # Microsoft COCO is a large image dataset designed for object detection, # segmentation, and caption generation. annotator.oneformer.pycocotools is a Python API that # assists in loading, parsing and visualizing the annotations in COCO. # Please visit http://mscoco.org/ for more information on COCO, including # for the data, paper, and tutorials. The exact format of the annotations # is also described on the COCO website. For example usage of the annotator.oneformer.pycocotools # please see annotator.oneformer.pycocotools_demo.ipynb. In addition to this API, please download both # the COCO images and annotations in order to run the demo. # An alternative to using the API is to load the annotations directly # into Python dictionary # Using the API provides additional utility functions. Note that this API # supports both *instance* and *caption* annotations. In the case of # captions not all functions are defined (e.g. categories are undefined). # The following API functions are defined: # COCO - COCO api class that loads COCO annotation file and prepare data structures. # decodeMask - Decode binary mask M encoded via run-length encoding. # encodeMask - Encode binary mask M using run-length encoding. # getAnnIds - Get ann ids that satisfy given filter conditions. # getCatIds - Get cat ids that satisfy given filter conditions. # getImgIds - Get img ids that satisfy given filter conditions. # loadAnns - Load anns with the specified ids. # loadCats - Load cats with the specified ids. # loadImgs - Load imgs with the specified ids. # annToMask - Convert segmentation in an annotation to binary mask. # showAnns - Display the specified annotations. # loadRes - Load algorithm results and create API for accessing them. # download - Download COCO images from mscoco.org server. # Throughout the API "ann"=annotation, "cat"=category, and "img"=image. # Help on each functions can be accessed by: "help COCO>function". # See also COCO>decodeMask, # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, # COCO>loadImgs, COCO>annToMask, COCO>showAnns # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. # Licensed under the Simplified BSD License [see bsd.txt] import json import time import numpy as np import copy import itertools from . import mask as maskUtils import os from collections import defaultdict import sys PYTHON_VERSION = sys.version_info[0] if PYTHON_VERSION == 2: from urllib import urlretrieve elif PYTHON_VERSION == 3: from urllib.request import urlretrieve def _isArrayLike(obj): return hasattr(obj, '__iter__') and hasattr(obj, '__len__') class COCO: def __init__(self, annotation_file=None): """ Constructor of Microsoft COCO helper class for reading and visualizing annotations. :param annotation_file (str): location of annotation file :param image_folder (str): location to the folder that hosts images. :return: """ # load dataset self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) if not annotation_file == None: print('loading annotations into memory...') tic = time.time() with open(annotation_file, 'r') as f: dataset = json.load(f) assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) print('Done (t={:0.2f}s)'.format(time.time()- tic)) self.dataset = dataset self.createIndex() def createIndex(self): # create index print('creating index...') anns, cats, imgs = {}, {}, {} imgToAnns,catToImgs = defaultdict(list),defaultdict(list) if 'annotations' in self.dataset: for ann in self.dataset['annotations']: imgToAnns[ann['image_id']].append(ann) anns[ann['id']] = ann if 'images' in self.dataset: for img in self.dataset['images']: imgs[img['id']] = img if 'categories' in self.dataset: for cat in self.dataset['categories']: cats[cat['id']] = cat if 'annotations' in self.dataset and 'categories' in self.dataset: for ann in self.dataset['annotations']: catToImgs[ann['category_id']].append(ann['image_id']) print('index created!') # create class members self.anns = anns self.imgToAnns = imgToAnns self.catToImgs = catToImgs self.imgs = imgs self.cats = cats def info(self): """ Print information about the annotation file. :return: """ for key, value in self.dataset['info'].items(): print('{}: {}'.format(key, value)) def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): """ Get ann ids that satisfy given filter conditions. default skips that filter :param imgIds (int array) : get anns for given imgs catIds (int array) : get anns for given cats areaRng (float array) : get anns for given area range (e.g. [0 inf]) iscrowd (boolean) : get anns for given crowd label (False or True) :return: ids (int array) : integer array of ann ids """ imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(imgIds) == len(catIds) == len(areaRng) == 0: anns = self.dataset['annotations'] else: if not len(imgIds) == 0: lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] anns = list(itertools.chain.from_iterable(lists)) else: anns = self.dataset['annotations'] anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] if not iscrowd == None: ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] else: ids = [ann['id'] for ann in anns] return ids def getCatIds(self, catNms=[], supNms=[], catIds=[]): """ filtering parameters. default skips that filter. :param catNms (str array) : get cats for given cat names :param supNms (str array) : get cats for given supercategory names :param catIds (int array) : get cats for given cat ids :return: ids (int array) : integer array of cat ids """ catNms = catNms if _isArrayLike(catNms) else [catNms] supNms = supNms if _isArrayLike(supNms) else [supNms] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(catNms) == len(supNms) == len(catIds) == 0: cats = self.dataset['categories'] else: cats = self.dataset['categories'] cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] ids = [cat['id'] for cat in cats] return ids def getImgIds(self, imgIds=[], catIds=[]): ''' Get img ids that satisfy given filter conditions. :param imgIds (int array) : get imgs for given ids :param catIds (int array) : get imgs with all given cats :return: ids (int array) : integer array of img ids ''' imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] catIds = catIds if _isArrayLike(catIds) else [catIds] if len(imgIds) == len(catIds) == 0: ids = self.imgs.keys() else: ids = set(imgIds) for i, catId in enumerate(catIds): if i == 0 and len(ids) == 0: ids = set(self.catToImgs[catId]) else: ids &= set(self.catToImgs[catId]) return list(ids) def loadAnns(self, ids=[]): """ Load anns with the specified ids. :param ids (int array) : integer ids specifying anns :return: anns (object array) : loaded ann objects """ if _isArrayLike(ids): return [self.anns[id] for id in ids] elif type(ids) == int: return [self.anns[ids]] def loadCats(self, ids=[]): """ Load cats with the specified ids. :param ids (int array) : integer ids specifying cats :return: cats (object array) : loaded cat objects """ if _isArrayLike(ids): return [self.cats[id] for id in ids] elif type(ids) == int: return [self.cats[ids]] def loadImgs(self, ids=[]): """ Load anns with the specified ids. :param ids (int array) : integer ids specifying img :return: imgs (object array) : loaded img objects """ if _isArrayLike(ids): return [self.imgs[id] for id in ids] elif type(ids) == int: return [self.imgs[ids]] def showAnns(self, anns, draw_bbox=False): """ Display the specified annotations. :param anns (array of object): annotations to display :return: None """ if len(anns) == 0: return 0 if 'segmentation' in anns[0] or 'keypoints' in anns[0]: datasetType = 'instances' elif 'caption' in anns[0]: datasetType = 'captions' else: raise Exception('datasetType not supported') if datasetType == 'instances': import matplotlib.pyplot as plt from matplotlib.collections import PatchCollection from matplotlib.patches import Polygon ax = plt.gca() ax.set_autoscale_on(False) polygons = [] color = [] for ann in anns: c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] if 'segmentation' in ann: if type(ann['segmentation']) == list: # polygon for seg in ann['segmentation']: poly = np.array(seg).reshape((int(len(seg)/2), 2)) polygons.append(Polygon(poly)) color.append(c) else: # mask t = self.imgs[ann['image_id']] if type(ann['segmentation']['counts']) == list: rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) else: rle = [ann['segmentation']] m = maskUtils.decode(rle) img = np.ones( (m.shape[0], m.shape[1], 3) ) if ann['iscrowd'] == 1: color_mask = np.array([2.0,166.0,101.0])/255 if ann['iscrowd'] == 0: color_mask = np.random.random((1, 3)).tolist()[0] for i in range(3): img[:,:,i] = color_mask[i] ax.imshow(np.dstack( (img, m*0.5) )) if 'keypoints' in ann and type(ann['keypoints']) == list: # turn skeleton into zero-based index sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 kp = np.array(ann['keypoints']) x = kp[0::3] y = kp[1::3] v = kp[2::3] for sk in sks: if np.all(v[sk]>0): plt.plot(x[sk],y[sk], linewidth=3, color=c) plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) if draw_bbox: [bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox'] poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]] np_poly = np.array(poly).reshape((4,2)) polygons.append(Polygon(np_poly)) color.append(c) p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) ax.add_collection(p) p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) ax.add_collection(p) elif datasetType == 'captions': for ann in anns: print(ann['caption']) def loadRes(self, resFile): """ Load result file and return a result api object. :param resFile (str) : file name of result file :return: res (obj) : result api object """ res = COCO() res.dataset['images'] = [img for img in self.dataset['images']] print('Loading and preparing results...') tic = time.time() if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode): with open(resFile) as f: anns = json.load(f) elif type(resFile) == np.ndarray: anns = self.loadNumpyAnnotations(resFile) else: anns = resFile assert type(anns) == list, 'results in not an array of objects' annsImgIds = [ann['image_id'] for ann in anns] assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ 'Results do not correspond to current coco set' if 'caption' in anns[0]: imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] for id, ann in enumerate(anns): ann['id'] = id+1 elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) for id, ann in enumerate(anns): bb = ann['bbox'] x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] if not 'segmentation' in ann: ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] ann['area'] = bb[2]*bb[3] ann['id'] = id+1 ann['iscrowd'] = 0 elif 'segmentation' in anns[0]: res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) for id, ann in enumerate(anns): # now only support compressed RLE format as segmentation results ann['area'] = maskUtils.area(ann['segmentation']) if not 'bbox' in ann: ann['bbox'] = maskUtils.toBbox(ann['segmentation']) ann['id'] = id+1 ann['iscrowd'] = 0 elif 'keypoints' in anns[0]: res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) for id, ann in enumerate(anns): s = ann['keypoints'] x = s[0::3] y = s[1::3] x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) ann['area'] = (x1-x0)*(y1-y0) ann['id'] = id + 1 ann['bbox'] = [x0,y0,x1-x0,y1-y0] print('DONE (t={:0.2f}s)'.format(time.time()- tic)) res.dataset['annotations'] = anns res.createIndex() return res def download(self, tarDir = None, imgIds = [] ): ''' Download COCO images from mscoco.org server. :param tarDir (str): COCO results directory name imgIds (list): images to be downloaded :return: ''' if tarDir is None: print('Please specify target directory') return -1 if len(imgIds) == 0: imgs = self.imgs.values() else: imgs = self.loadImgs(imgIds) N = len(imgs) if not os.path.exists(tarDir): os.makedirs(tarDir) for i, img in enumerate(imgs): tic = time.time() fname = os.path.join(tarDir, img['file_name']) if not os.path.exists(fname): urlretrieve(img['coco_url'], fname) print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) def loadNumpyAnnotations(self, data): """ Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} :param data (numpy.ndarray) :return: annotations (python nested list) """ print('Converting ndarray to lists...') assert(type(data) == np.ndarray) print(data.shape) assert(data.shape[1] == 7) N = data.shape[0] ann = [] for i in range(N): if i % 1000000 == 0: print('{}/{}'.format(i,N)) ann += [{ 'image_id' : int(data[i, 0]), 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], 'score' : data[i, 5], 'category_id': int(data[i, 6]), }] return ann def annToRLE(self, ann): """ Convert annotation which can be polygons, uncompressed RLE to RLE. :return: binary mask (numpy 2D array) """ t = self.imgs[ann['image_id']] h, w = t['height'], t['width'] segm = ann['segmentation'] if type(segm) == list: # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = maskUtils.frPyObjects(segm, h, w) rle = maskUtils.merge(rles) elif type(segm['counts']) == list: # uncompressed RLE rle = maskUtils.frPyObjects(segm, h, w) else: # rle rle = ann['segmentation'] return rle def annToMask(self, ann): """ Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. :return: binary mask (numpy 2D array) """ rle = self.annToRLE(ann) m = maskUtils.decode(rle) return m ================================================ FILE: annotator/oneformer/pycocotools/cocoeval.py ================================================ __author__ = 'tsungyi' import numpy as np import datetime import time from collections import defaultdict from . import mask as maskUtils import copy class COCOeval: # Interface for evaluating detection on the Microsoft COCO dataset. # # The usage for CocoEval is as follows: # cocoGt=..., cocoDt=... # load dataset and results # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object # E.params.recThrs = ...; # set parameters as desired # E.evaluate(); # run per image evaluation # E.accumulate(); # accumulate per image results # E.summarize(); # display summary metrics of results # For example usage see evalDemo.m and http://mscoco.org/. # # The evaluation parameters are as follows (defaults in brackets): # imgIds - [all] N img ids to use for evaluation # catIds - [all] K cat ids to use for evaluation # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation # recThrs - [0:.01:1] R=101 recall thresholds for evaluation # areaRng - [...] A=4 object area ranges for evaluation # maxDets - [1 10 100] M=3 thresholds on max detections per image # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' # iouType replaced the now DEPRECATED useSegm parameter. # useCats - [1] if true use category labels for evaluation # Note: if useCats=0 category labels are ignored as in proposal scoring. # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. # # evaluate(): evaluates detections on every image and every category and # concats the results into the "evalImgs" with fields: # dtIds - [1xD] id for each of the D detections (dt) # gtIds - [1xG] id for each of the G ground truths (gt) # dtMatches - [TxD] matching gt id at each IoU or 0 # gtMatches - [TxG] matching dt id at each IoU or 0 # dtScores - [1xD] confidence of each dt # gtIgnore - [1xG] ignore flag for each gt # dtIgnore - [TxD] ignore flag for each dt at each IoU # # accumulate(): accumulates the per-image, per-category evaluation # results in "evalImgs" into the dictionary "eval" with fields: # params - parameters used for evaluation # date - date evaluation was performed # counts - [T,R,K,A,M] parameter dimensions (see above) # precision - [TxRxKxAxM] precision for every evaluation setting # recall - [TxKxAxM] max recall for every evaluation setting # Note: precision and recall==-1 for settings with no gt objects. # # See also coco, mask, pycocoDemo, pycocoEvalDemo # # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): ''' Initialize CocoEval using coco APIs for gt and dt :param cocoGt: coco object with ground truth annotations :param cocoDt: coco object with detection results :return: None ''' if not iouType: print('iouType not specified. use default iouType segm') self.cocoGt = cocoGt # ground truth COCO API self.cocoDt = cocoDt # detections COCO API self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements self.eval = {} # accumulated evaluation results self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation self.params = Params(iouType=iouType) # parameters self._paramsEval = {} # parameters for evaluation self.stats = [] # result summarization self.ious = {} # ious between all gts and dts if not cocoGt is None: self.params.imgIds = sorted(cocoGt.getImgIds()) self.params.catIds = sorted(cocoGt.getCatIds()) def _prepare(self): ''' Prepare ._gts and ._dts for evaluation based on params :return: None ''' def _toMask(anns, coco): # modify ann['segmentation'] by reference for ann in anns: rle = coco.annToRLE(ann) ann['segmentation'] = rle p = self.params if p.useCats: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) else: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) # convert ground truth to mask if iouType == 'segm' if p.iouType == 'segm': _toMask(gts, self.cocoGt) _toMask(dts, self.cocoDt) # set ignore flag for gt in gts: gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] if p.iouType == 'keypoints': gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation for gt in gts: self._gts[gt['image_id'], gt['category_id']].append(gt) for dt in dts: self._dts[dt['image_id'], dt['category_id']].append(dt) self.evalImgs = defaultdict(list) # per-image per-category evaluation results self.eval = {} # accumulated evaluation results def evaluate(self): ''' Run per image evaluation on given images and store results (a list of dict) in self.evalImgs :return: None ''' tic = time.time() print('Running per image evaluation...') p = self.params # add backward compatibility if useSegm is specified in params if not p.useSegm is None: p.iouType = 'segm' if p.useSegm == 1 else 'bbox' print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) print('Evaluate annotation type *{}*'.format(p.iouType)) p.imgIds = list(np.unique(p.imgIds)) if p.useCats: p.catIds = list(np.unique(p.catIds)) p.maxDets = sorted(p.maxDets) self.params=p self._prepare() # loop through images, area range, max detection number catIds = p.catIds if p.useCats else [-1] if p.iouType == 'segm' or p.iouType == 'bbox': computeIoU = self.computeIoU elif p.iouType == 'keypoints': computeIoU = self.computeOks self.ious = {(imgId, catId): computeIoU(imgId, catId) \ for imgId in p.imgIds for catId in catIds} evaluateImg = self.evaluateImg maxDet = p.maxDets[-1] self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet) for catId in catIds for areaRng in p.areaRng for imgId in p.imgIds ] self._paramsEval = copy.deepcopy(self.params) toc = time.time() print('DONE (t={:0.2f}s).'.format(toc-tic)) def computeIoU(self, imgId, catId): p = self.params if p.useCats: gt = self._gts[imgId,catId] dt = self._dts[imgId,catId] else: gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] if len(gt) == 0 and len(dt) ==0: return [] inds = np.argsort([-d['score'] for d in dt], kind='mergesort') dt = [dt[i] for i in inds] if len(dt) > p.maxDets[-1]: dt=dt[0:p.maxDets[-1]] if p.iouType == 'segm': g = [g['segmentation'] for g in gt] d = [d['segmentation'] for d in dt] elif p.iouType == 'bbox': g = [g['bbox'] for g in gt] d = [d['bbox'] for d in dt] else: raise Exception('unknown iouType for iou computation') # compute iou between each dt and gt region iscrowd = [int(o['iscrowd']) for o in gt] ious = maskUtils.iou(d,g,iscrowd) return ious def computeOks(self, imgId, catId): p = self.params # dimention here should be Nxm gts = self._gts[imgId, catId] dts = self._dts[imgId, catId] inds = np.argsort([-d['score'] for d in dts], kind='mergesort') dts = [dts[i] for i in inds] if len(dts) > p.maxDets[-1]: dts = dts[0:p.maxDets[-1]] # if len(gts) == 0 and len(dts) == 0: if len(gts) == 0 or len(dts) == 0: return [] ious = np.zeros((len(dts), len(gts))) sigmas = p.kpt_oks_sigmas vars = (sigmas * 2)**2 k = len(sigmas) # compute oks between each detection and ground truth object for j, gt in enumerate(gts): # create bounds for ignore regions(double the gt bbox) g = np.array(gt['keypoints']) xg = g[0::3]; yg = g[1::3]; vg = g[2::3] k1 = np.count_nonzero(vg > 0) bb = gt['bbox'] x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2 y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2 for i, dt in enumerate(dts): d = np.array(dt['keypoints']) xd = d[0::3]; yd = d[1::3] if k1>0: # measure the per-keypoint distance if keypoints visible dx = xd - xg dy = yd - yg else: # measure minimum distance to keypoints in (x0,y0) & (x1,y1) z = np.zeros((k)) dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0) dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0) e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2 if k1 > 0: e=e[vg > 0] ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] return ious def evaluateImg(self, imgId, catId, aRng, maxDet): ''' perform evaluation for single category and image :return: dict (single image results) ''' p = self.params if p.useCats: gt = self._gts[imgId,catId] dt = self._dts[imgId,catId] else: gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] if len(gt) == 0 and len(dt) ==0: return None for g in gt: if g['ignore'] or (g['area']aRng[1]): g['_ignore'] = 1 else: g['_ignore'] = 0 # sort dt highest score first, sort gt ignore last gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') gt = [gt[i] for i in gtind] dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') dt = [dt[i] for i in dtind[0:maxDet]] iscrowd = [int(o['iscrowd']) for o in gt] # load computed ious ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] T = len(p.iouThrs) G = len(gt) D = len(dt) gtm = np.zeros((T,G)) dtm = np.zeros((T,D)) gtIg = np.array([g['_ignore'] for g in gt]) dtIg = np.zeros((T,D)) if not len(ious)==0: for tind, t in enumerate(p.iouThrs): for dind, d in enumerate(dt): # information about best match so far (m=-1 -> unmatched) iou = min([t,1-1e-10]) m = -1 for gind, g in enumerate(gt): # if this gt already matched, and not a crowd, continue if gtm[tind,gind]>0 and not iscrowd[gind]: continue # if dt matched to reg gt, and on ignore gt, stop if m>-1 and gtIg[m]==0 and gtIg[gind]==1: break # continue to next gt unless better match made if ious[dind,gind] < iou: continue # if match successful and best so far, store appropriately iou=ious[dind,gind] m=gind # if match made store id of match for both dt and gt if m ==-1: continue dtIg[tind,dind] = gtIg[m] dtm[tind,dind] = gt[m]['id'] gtm[tind,m] = d['id'] # set unmatched detections outside of area range to ignore a = np.array([d['area']aRng[1] for d in dt]).reshape((1, len(dt))) dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0))) # store results for given image and category return { 'image_id': imgId, 'category_id': catId, 'aRng': aRng, 'maxDet': maxDet, 'dtIds': [d['id'] for d in dt], 'gtIds': [g['id'] for g in gt], 'dtMatches': dtm, 'gtMatches': gtm, 'dtScores': [d['score'] for d in dt], 'gtIgnore': gtIg, 'dtIgnore': dtIg, } def accumulate(self, p = None): ''' Accumulate per image evaluation results and store the result in self.eval :param p: input params for evaluation :return: None ''' print('Accumulating evaluation results...') tic = time.time() if not self.evalImgs: print('Please run evaluate() first') # allows input customized parameters if p is None: p = self.params p.catIds = p.catIds if p.useCats == 1 else [-1] T = len(p.iouThrs) R = len(p.recThrs) K = len(p.catIds) if p.useCats else 1 A = len(p.areaRng) M = len(p.maxDets) precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories recall = -np.ones((T,K,A,M)) scores = -np.ones((T,R,K,A,M)) # create dictionary for future indexing _pe = self._paramsEval catIds = _pe.catIds if _pe.useCats else [-1] setK = set(catIds) setA = set(map(tuple, _pe.areaRng)) setM = set(_pe.maxDets) setI = set(_pe.imgIds) # get inds to evaluate k_list = [n for n, k in enumerate(p.catIds) if k in setK] m_list = [m for n, m in enumerate(p.maxDets) if m in setM] a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] i_list = [n for n, i in enumerate(p.imgIds) if i in setI] I0 = len(_pe.imgIds) A0 = len(_pe.areaRng) # retrieve E at each category, area range, and max number of detections for k, k0 in enumerate(k_list): Nk = k0*A0*I0 for a, a0 in enumerate(a_list): Na = a0*I0 for m, maxDet in enumerate(m_list): E = [self.evalImgs[Nk + Na + i] for i in i_list] E = [e for e in E if not e is None] if len(E) == 0: continue dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E]) # different sorting method generates slightly different results. # mergesort is used to be consistent as Matlab implementation. inds = np.argsort(-dtScores, kind='mergesort') dtScoresSorted = dtScores[inds] dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds] dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds] gtIg = np.concatenate([e['gtIgnore'] for e in E]) npig = np.count_nonzero(gtIg==0 ) if npig == 0: continue tps = np.logical_and( dtm, np.logical_not(dtIg) ) fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) ) tp_sum = np.cumsum(tps, axis=1).astype(dtype=float) fp_sum = np.cumsum(fps, axis=1).astype(dtype=float) for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): tp = np.array(tp) fp = np.array(fp) nd = len(tp) rc = tp / npig pr = tp / (fp+tp+np.spacing(1)) q = np.zeros((R,)) ss = np.zeros((R,)) if nd: recall[t,k,a,m] = rc[-1] else: recall[t,k,a,m] = 0 # numpy is slow without cython optimization for accessing elements # use python array gets significant speed improvement pr = pr.tolist(); q = q.tolist() for i in range(nd-1, 0, -1): if pr[i] > pr[i-1]: pr[i-1] = pr[i] inds = np.searchsorted(rc, p.recThrs, side='left') try: for ri, pi in enumerate(inds): q[ri] = pr[pi] ss[ri] = dtScoresSorted[pi] except: pass precision[t,:,k,a,m] = np.array(q) scores[t,:,k,a,m] = np.array(ss) self.eval = { 'params': p, 'counts': [T, R, K, A, M], 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'precision': precision, 'recall': recall, 'scores': scores, } toc = time.time() print('DONE (t={:0.2f}s).'.format( toc-tic)) def summarize(self): ''' Compute and display summary metrics for evaluation results. Note this functin can *only* be applied on the default parameter setting ''' def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ): p = self.params iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' titleStr = 'Average Precision' if ap == 1 else 'Average Recall' typeStr = '(AP)' if ap==1 else '(AR)' iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ if iouThr is None else '{:0.2f}'.format(iouThr) aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] if ap == 1: # dimension of precision: [TxRxKxAxM] s = self.eval['precision'] # IoU if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:,:,:,aind,mind] else: # dimension of recall: [TxKxAxM] s = self.eval['recall'] if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] s = s[:,:,aind,mind] if len(s[s>-1])==0: mean_s = -1 else: mean_s = np.mean(s[s>-1]) print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) return mean_s def _summarizeDets(): stats = np.zeros((12,)) stats[0] = _summarize(1) stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2]) stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2]) stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2]) stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2]) stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2]) stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2]) stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2]) return stats def _summarizeKps(): stats = np.zeros((10,)) stats[0] = _summarize(1, maxDets=20) stats[1] = _summarize(1, maxDets=20, iouThr=.5) stats[2] = _summarize(1, maxDets=20, iouThr=.75) stats[3] = _summarize(1, maxDets=20, areaRng='medium') stats[4] = _summarize(1, maxDets=20, areaRng='large') stats[5] = _summarize(0, maxDets=20) stats[6] = _summarize(0, maxDets=20, iouThr=.5) stats[7] = _summarize(0, maxDets=20, iouThr=.75) stats[8] = _summarize(0, maxDets=20, areaRng='medium') stats[9] = _summarize(0, maxDets=20, areaRng='large') return stats if not self.eval: raise Exception('Please run accumulate() first') iouType = self.params.iouType if iouType == 'segm' or iouType == 'bbox': summarize = _summarizeDets elif iouType == 'keypoints': summarize = _summarizeKps self.stats = summarize() def __str__(self): self.summarize() class Params: ''' Params for coco evaluation api ''' def setDetParams(self): self.imgIds = [] self.catIds = [] # np.arange causes trouble. the data point on arange is slightly larger than the true value self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) self.maxDets = [1, 10, 100] self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] self.areaRngLbl = ['all', 'small', 'medium', 'large'] self.useCats = 1 def setKpParams(self): self.imgIds = [] self.catIds = [] # np.arange causes trouble. the data point on arange is slightly larger than the true value self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) self.maxDets = [20] self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] self.areaRngLbl = ['all', 'medium', 'large'] self.useCats = 1 self.kpt_oks_sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0 def __init__(self, iouType='segm'): if iouType == 'segm' or iouType == 'bbox': self.setDetParams() elif iouType == 'keypoints': self.setKpParams() else: raise Exception('iouType not supported') self.iouType = iouType # useSegm is deprecated self.useSegm = None ================================================ FILE: annotator/oneformer/pycocotools/mask.py ================================================ __author__ = 'tsungyi' # import annotator.oneformer.pycocotools._mask as _mask # Interface for manipulating masks stored in RLE format. # # RLE is a simple yet efficient format for storing binary masks. RLE # first divides a vector (or vectorized image) into a series of piecewise # constant regions and then for each piece simply stores the length of # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] # (note that the odd counts are always the numbers of zeros). Instead of # storing the counts directly, additional compression is achieved with a # variable bitrate representation based on a common scheme called LEB128. # # Compression is greatest given large piecewise constant regions. # Specifically, the size of the RLE is proportional to the number of # *boundaries* in M (or for an image the number of boundaries in the y # direction). Assuming fairly simple shapes, the RLE representation is # O(sqrt(n)) where n is number of pixels in the object. Hence space usage # is substantially lower, especially for large simple objects (large n). # # Many common operations on masks can be computed directly using the RLE # (without need for decoding). This includes computations such as area, # union, intersection, etc. All of these operations are linear in the # size of the RLE, in other words they are O(sqrt(n)) where n is the area # of the object. Computing these operations on the original mask is O(n). # Thus, using the RLE can result in substantial computational savings. # # The following API functions are defined: # encode - Encode binary masks using RLE. # decode - Decode binary masks encoded via RLE. # merge - Compute union or intersection of encoded masks. # iou - Compute intersection over union between masks. # area - Compute area of encoded masks. # toBbox - Get bounding boxes surrounding encoded masks. # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. # # Usage: # Rs = encode( masks ) # masks = decode( Rs ) # R = merge( Rs, intersect=false ) # o = iou( dt, gt, iscrowd ) # a = area( Rs ) # bbs = toBbox( Rs ) # Rs = frPyObjects( [pyObjects], h, w ) # # In the API the following formats are used: # Rs - [dict] Run-length encoding of binary masks # R - dict Run-length encoding of binary mask # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore # bbs - [nx4] Bounding box(es) stored as [x y w h] # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) # dt,gt - May be either bounding boxes or encoded masks # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). # # Finally, a note about the intersection over union (iou) computation. # The standard iou of a ground truth (gt) and detected (dt) object is # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) # For "crowd" regions, we use a modified criteria. If a gt object is # marked as "iscrowd", we allow a dt to match any subregion of the gt. # Choosing gt' in the crowd gt that best matches the dt can be done using # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) # For crowd gt regions we use this modified criteria above for the iou. # # To compile run "python setup.py build_ext --inplace" # Please do not contact us for help with compiling. # # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] # iou = _mask.iou # merge = _mask.merge # frPyObjects = _mask.frPyObjects def encode(bimask): pass # if len(bimask.shape) == 3: # return _mask.encode(bimask) # elif len(bimask.shape) == 2: # h, w = bimask.shape # return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] def decode(rleObjs): pass # if type(rleObjs) == list: # return _mask.decode(rleObjs) # else: # return _mask.decode([rleObjs])[:,:,0] def area(rleObjs): pass # if type(rleObjs) == list: # return _mask.area(rleObjs) # else: # return _mask.area([rleObjs])[0] def toBbox(rleObjs): pass # if type(rleObjs) == list: # return _mask.toBbox(rleObjs) # else: # return _mask.toBbox([rleObjs])[0] ================================================ FILE: annotator/openpose/LICENSE ================================================ OPENPOSE: MULTIPERSON KEYPOINT DETECTION SOFTWARE LICENSE AGREEMENT ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE. This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor. RESERVATION OF OWNERSHIP AND GRANT OF LICENSE: Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive, non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i). CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication. COPYRIGHT: The Software is owned by Licensor and is protected by United States copyright laws and applicable international treaties and/or conventions. PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto. DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement. BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies. USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor. You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software. ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void. TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below. The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement. FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement. DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS. SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement. EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage. EXPORT REGULATION: Licensee agrees to comply with any and all applicable U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control. SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby. NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor. GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania. ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto. ************************************************************************ THIRD-PARTY SOFTWARE NOTICES AND INFORMATION This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise. 1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/) COPYRIGHT All contributions by the University of California: Copyright (c) 2014-2017 The Regents of the University of California (Regents) All rights reserved. All other contributions: Copyright (c) 2014-2017, the respective contributors All rights reserved. Caffe uses a shared copyright model: each contributor holds copyright over their contributions to Caffe. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. LICENSE Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. CONTRIBUTION AGREEMENT By contributing to the BVLC/caffe repository through pull-request, comment, or otherwise, the contributor releases their content to the license and copyright terms herein. ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION********** ================================================ FILE: annotator/openpose/__init__.py ================================================ # Openpose # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose # 2nd Edited by https://github.com/Hzzone/pytorch-openpose # 3rd Edited by ControlNet # 4th Edited by ControlNet (added face and correct hands) # 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixs) # This preprocessor is licensed by CMU for non-commercial use only. import os os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" import torch import numpy as np from . import util from .body import Body, BodyResult, Keypoint from .hand import Hand from .face import Face from .types import HandResult, FaceResult, HumanPoseResult, AnimalPoseResult from modules import devices from annotator.annotator_path import models_path from .animalpose import draw_animalposes from typing import Tuple, List, Callable, Union, Optional body_model_path = ( "https://huggingface.co/lllyasviel/Annotators/resolve/main/body_pose_model.pth" ) hand_model_path = ( "https://huggingface.co/lllyasviel/Annotators/resolve/main/hand_pose_model.pth" ) face_model_path = ( "https://huggingface.co/lllyasviel/Annotators/resolve/main/facenet.pth" ) remote_onnx_det = "https://huggingface.co/yzd-v/DWPose/resolve/main/yolox_l.onnx" remote_onnx_pose = ( "https://huggingface.co/yzd-v/DWPose/resolve/main/dw-ll_ucoco_384.onnx" ) animal_onnx_pose = "https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/resolve/main/Annotators/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.onnx" def draw_poses( poses: List[HumanPoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True ): """ Draw the detected poses on an empty canvas. Args: poses (List[HumanPoseResult]): A list of HumanPoseResult objects containing the detected poses. H (int): The height of the canvas. W (int): The width of the canvas. draw_body (bool, optional): Whether to draw body keypoints. Defaults to True. draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True. draw_face (bool, optional): Whether to draw face keypoints. Defaults to True. Returns: numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses. """ canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8) for pose in poses: if draw_body: canvas = util.draw_bodypose(canvas, pose.body.keypoints) if draw_hand: canvas = util.draw_handpose(canvas, pose.left_hand) canvas = util.draw_handpose(canvas, pose.right_hand) if draw_face: canvas = util.draw_facepose(canvas, pose.face) return canvas def decode_json_as_poses( pose_json: dict, ) -> Tuple[List[HumanPoseResult], List[AnimalPoseResult], int, int]: """Decode the json_string complying with the openpose JSON output format to poses that controlnet recognizes. https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md Args: json_string: The json string to decode. Returns: human_poses animal_poses canvas_height canvas_width """ height = pose_json["canvas_height"] width = pose_json["canvas_width"] def chunks(lst, n): """Yield successive n-sized chunks from lst.""" for i in range(0, len(lst), n): yield lst[i : i + n] def decompress_keypoints( numbers: Optional[List[float]], ) -> Optional[List[Optional[Keypoint]]]: if not numbers: return None assert len(numbers) % 3 == 0 def create_keypoint(x, y, c): if c < 1.0: return None keypoint = Keypoint(x, y) return keypoint return [create_keypoint(x, y, c) for x, y, c in chunks(numbers, n=3)] return ( [ HumanPoseResult( body=BodyResult( keypoints=decompress_keypoints(pose.get("pose_keypoints_2d")) ), left_hand=decompress_keypoints(pose.get("hand_left_keypoints_2d")), right_hand=decompress_keypoints(pose.get("hand_right_keypoints_2d")), face=decompress_keypoints(pose.get("face_keypoints_2d")), ) for pose in pose_json.get("people", []) ], [decompress_keypoints(pose) for pose in pose_json.get("animals", [])], height, width, ) def encode_poses_as_json( poses: List[HumanPoseResult], animals: List[AnimalPoseResult], canvas_height: int, canvas_width: int, ) -> dict: """Encode the pose as a JSON compatible dict following openpose JSON output format: https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md """ def compress_keypoints( keypoints: Union[List[Keypoint], None] ) -> Union[List[float], None]: if not keypoints: return None return [ value for keypoint in keypoints for value in ( [float(keypoint.x), float(keypoint.y), 1.0] if keypoint is not None else [0.0, 0.0, 0.0] ) ] return { "people": [ { "pose_keypoints_2d": compress_keypoints(pose.body.keypoints), "face_keypoints_2d": compress_keypoints(pose.face), "hand_left_keypoints_2d": compress_keypoints(pose.left_hand), "hand_right_keypoints_2d": compress_keypoints(pose.right_hand), } for pose in poses ], "animals": [compress_keypoints(animal) for animal in animals], "canvas_height": canvas_height, "canvas_width": canvas_width, } class OpenposeDetector: """ A class for detecting human poses in images using the Openpose model. Attributes: model_dir (str): Path to the directory where the pose models are stored. """ model_dir = os.path.join(models_path, "openpose") def __init__(self): self.device = devices.get_device_for("controlnet") self.body_estimation = None self.hand_estimation = None self.face_estimation = None self.dw_pose_estimation = None self.animal_pose_estimation = None def load_model(self): """ Load the Openpose body, hand, and face models. """ body_modelpath = os.path.join(self.model_dir, "body_pose_model.pth") hand_modelpath = os.path.join(self.model_dir, "hand_pose_model.pth") face_modelpath = os.path.join(self.model_dir, "facenet.pth") if not os.path.exists(body_modelpath): from scripts.utils import load_file_from_url load_file_from_url(body_model_path, model_dir=self.model_dir) if not os.path.exists(hand_modelpath): from scripts.utils import load_file_from_url load_file_from_url(hand_model_path, model_dir=self.model_dir) if not os.path.exists(face_modelpath): from scripts.utils import load_file_from_url load_file_from_url(face_model_path, model_dir=self.model_dir) self.body_estimation = Body(body_modelpath) self.hand_estimation = Hand(hand_modelpath) self.face_estimation = Face(face_modelpath) def load_dw_model(self): from .wholebody import Wholebody # DW Pose def load_model(filename: str, remote_url: str): local_path = os.path.join(self.model_dir, filename) if not os.path.exists(local_path): from scripts.utils import load_file_from_url load_file_from_url(remote_url, model_dir=self.model_dir) return local_path onnx_det = load_model("yolox_l.onnx", remote_onnx_det) onnx_pose = load_model("dw-ll_ucoco_384.onnx", remote_onnx_pose) self.dw_pose_estimation = Wholebody(onnx_det, onnx_pose) def load_animalpose_model(self): from .animalpose import AnimalPose # Animalpose def load_model(filename: str, remote_url: str): """ Load the model from the specified filename and remote URL if it doesn't exist locally. Args: filename (str): The filename of the model. remote_url (str): The remote URL of the model. """ local_path = os.path.join(self.model_dir, filename) if not os.path.exists(local_path): from scripts.utils import load_file_from_url load_file_from_url(remote_url, model_dir=self.model_dir) return local_path onnx_det = load_model("yolox_l.onnx", remote_onnx_det) onnx_pose = load_model( "rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.onnx", animal_onnx_pose, ) self.animal_pose_estimation = AnimalPose(onnx_det, onnx_pose) def unload_model(self): """ Unload the Openpose models by moving them to the CPU. Note: DW Pose models always run on CPU, so no need to `unload` them. """ if self.body_estimation is not None: self.body_estimation.model.to("cpu") self.hand_estimation.model.to("cpu") self.face_estimation.model.to("cpu") def detect_hands( self, body: BodyResult, oriImg ) -> Tuple[Union[HandResult, None], Union[HandResult, None]]: left_hand = None right_hand = None H, W, _ = oriImg.shape for x, y, w, is_left in util.handDetect(body, oriImg): peaks = self.hand_estimation(oriImg[y : y + w, x : x + w, :]).astype( np.float32 ) if peaks.ndim == 2 and peaks.shape[1] == 2: peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float( W ) peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float( H ) hand_result = [Keypoint(x=peak[0], y=peak[1]) for peak in peaks] if is_left: left_hand = hand_result else: right_hand = hand_result return left_hand, right_hand def detect_face(self, body: BodyResult, oriImg) -> Union[FaceResult, None]: face = util.faceDetect(body, oriImg) if face is None: return None x, y, w = face H, W, _ = oriImg.shape heatmaps = self.face_estimation(oriImg[y : y + w, x : x + w, :]) peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype( np.float32 ) if peaks.ndim == 2 and peaks.shape[1] == 2: peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W) peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H) return [Keypoint(x=peak[0], y=peak[1]) for peak in peaks] return None def detect_poses( self, oriImg, include_hand=False, include_face=False ) -> List[HumanPoseResult]: """ Detect poses in the given image. Args: oriImg (numpy.ndarray): The input image for pose detection. include_hand (bool, optional): Whether to include hand detection. Defaults to False. include_face (bool, optional): Whether to include face detection. Defaults to False. Returns: List[HumanPoseResult]: A list of HumanPoseResult objects containing the detected poses. """ if self.body_estimation is None: self.load_model() self.body_estimation.model.to(self.device) self.hand_estimation.model.to(self.device) self.face_estimation.model.to(self.device) self.body_estimation.cn_device = self.device self.hand_estimation.cn_device = self.device self.face_estimation.cn_device = self.device oriImg = oriImg[:, :, ::-1].copy() H, W, C = oriImg.shape with torch.no_grad(): candidate, subset = self.body_estimation(oriImg) bodies = self.body_estimation.format_body_result(candidate, subset) results = [] for body in bodies: left_hand, right_hand, face = (None,) * 3 if include_hand: left_hand, right_hand = self.detect_hands(body, oriImg) if include_face: face = self.detect_face(body, oriImg) results.append( HumanPoseResult( BodyResult( keypoints=[ Keypoint( x=keypoint.x / float(W), y=keypoint.y / float(H) ) if keypoint is not None else None for keypoint in body.keypoints ], total_score=body.total_score, total_parts=body.total_parts, ), left_hand, right_hand, face, ) ) return results def detect_poses_dw(self, oriImg) -> List[HumanPoseResult]: """ Detect poses in the given image using DW Pose: https://github.com/IDEA-Research/DWPose Args: oriImg (numpy.ndarray): The input image for pose detection. Returns: List[HumanPoseResult]: A list of HumanPoseResult objects containing the detected poses. """ from .wholebody import Wholebody # DW Pose self.load_dw_model() with torch.no_grad(): keypoints_info = self.dw_pose_estimation(oriImg.copy()) return Wholebody.format_result(keypoints_info) def detect_poses_animal(self, oriImg) -> List[AnimalPoseResult]: """ Detect poses in the given image using RTMPose AP10k model: https://github.com/abehonest/ControlNet_AnimalPose Args: oriImg (numpy.ndarray): The input image for pose detection. Returns: A list of AnimalPoseResult objects containing the detected animal poses. """ self.load_animalpose_model() with torch.no_grad(): return self.animal_pose_estimation(oriImg.copy()) def __call__( self, oriImg, include_body=True, include_hand=False, include_face=False, use_dw_pose=False, use_animal_pose=False, json_pose_callback: Callable[[str], None] = None, ): """ Detect and draw poses in the given image. Args: oriImg (numpy.ndarray): The input image for pose detection and drawing. include_body (bool, optional): Whether to include body keypoints. Defaults to True. include_hand (bool, optional): Whether to include hand keypoints. Defaults to False. include_face (bool, optional): Whether to include face keypoints. Defaults to False. use_dw_pose (bool, optional): Whether to use DW pose detection algorithm. Defaults to False. json_pose_callback (Callable, optional): A callback that accepts the pose JSON string. Returns: numpy.ndarray: The image with detected and drawn poses. """ H, W, _ = oriImg.shape animals = [] poses = [] if use_animal_pose: animals = self.detect_poses_animal(oriImg) elif use_dw_pose: poses = self.detect_poses_dw(oriImg) else: poses = self.detect_poses(oriImg, include_hand, include_face) if json_pose_callback: json_pose_callback(encode_poses_as_json(poses, animals, H, W)) if poses: assert len(animals) == 0 return draw_poses( poses, H, W, draw_body=include_body, draw_hand=include_hand, draw_face=include_face, ) else: return draw_animalposes(animals, H, W) ================================================ FILE: annotator/openpose/animalpose.py ================================================ import cv2 import numpy as np from typing import List from .cv_ox_det import inference_detector from .cv_ox_pose import inference_pose from .types import AnimalPoseResult, Keypoint def draw_animalposes(animals: List[List[Keypoint]], H: int, W: int) -> np.ndarray: canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8) for animal_pose in animals: canvas = draw_animalpose(canvas, animal_pose) return canvas def draw_animalpose(canvas: np.ndarray, keypoints: List[Keypoint]) -> np.ndarray: # order of the keypoints for AP10k and a standardized list of colors for limbs keypointPairsList = [ (1, 2), (2, 3), (1, 3), (3, 4), (4, 9), (9, 10), (10, 11), (4, 6), (6, 7), (7, 8), (4, 5), (5, 15), (15, 16), (16, 17), (5, 12), (12, 13), (13, 14), ] colorsList = [ (255, 255, 255), (100, 255, 100), (150, 255, 255), (100, 50, 255), (50, 150, 200), (0, 255, 255), (0, 150, 0), (0, 0, 255), (0, 0, 150), (255, 50, 255), (255, 0, 255), (255, 0, 0), (150, 0, 0), (255, 255, 100), (0, 150, 0), (255, 255, 0), (150, 150, 150), ] # 16 colors needed for ind, (i, j) in enumerate(keypointPairsList): p1 = keypoints[i - 1] p2 = keypoints[j - 1] if p1 is not None and p2 is not None: cv2.line( canvas, (int(p1.x), int(p1.y)), (int(p2.x), int(p2.y)), colorsList[ind], 5, ) return canvas class AnimalPose: def __init__( self, onnx_det: str, onnx_pose: str, ): self.onnx_det = onnx_det self.onnx_pose = onnx_pose self.model_input_size = (256, 256) # Always loads to CPU to avoid building OpenCV. device = 'cpu' backend = cv2.dnn.DNN_BACKEND_OPENCV if device == 'cpu' else cv2.dnn.DNN_BACKEND_CUDA # You need to manually build OpenCV through cmake to work with your GPU. providers = cv2.dnn.DNN_TARGET_CPU if device == 'cpu' else cv2.dnn.DNN_TARGET_CUDA self.session_det = cv2.dnn.readNetFromONNX(onnx_det) self.session_det.setPreferableBackend(backend) self.session_det.setPreferableTarget(providers) self.session_pose = cv2.dnn.readNetFromONNX(onnx_pose) self.session_pose.setPreferableBackend(backend) self.session_pose.setPreferableTarget(providers) def __call__(self, oriImg) -> List[AnimalPoseResult]: detect_classes = list( range(14, 23 + 1) ) # https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/datasets/coco.yaml det_result = inference_detector( self.session_det, oriImg, detect_classes=detect_classes, ) if (det_result is None) or (det_result.shape[0] == 0): return [] keypoint_sets, scores = inference_pose( self.session_pose, det_result, oriImg, self.model_input_size, ) animals = [] for idx, keypoints in enumerate(keypoint_sets): score = scores[idx, ..., None] score[score > 1.0] = 1.0 score[score < 0.0] = 0.0 animals.append( [ Keypoint(x, y, c) for x, y, c in np.concatenate((keypoints, score), axis=-1).tolist() ] ) return animals ================================================ FILE: annotator/openpose/body.py ================================================ import cv2 import numpy as np import math import time from scipy.ndimage import gaussian_filter import matplotlib.pyplot as plt import matplotlib import torch from torchvision import transforms from typing import NamedTuple, List, Union from . import util from .model import bodypose_model from .types import Keypoint, BodyResult class Body(object): def __init__(self, model_path): self.model = bodypose_model() model_dict = util.transfer(self.model, torch.load(model_path)) self.model.load_state_dict(model_dict) self.model.eval() def __call__(self, oriImg): # scale_search = [0.5, 1.0, 1.5, 2.0] scale_search = [0.5] boxsize = 368 stride = 8 padValue = 128 thre1 = 0.1 thre2 = 0.05 multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19)) paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) for m in range(len(multiplier)): scale = multiplier[m] imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale) imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue) im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5 im = np.ascontiguousarray(im) data = torch.from_numpy(im).float() if torch.cuda.is_available(): data = data.cuda() # data = data.permute([2, 0, 1]).unsqueeze(0).float() with torch.no_grad(): data = data.to(self.cn_device) Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data) Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy() Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy() # extract outputs, resize, and remove padding # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride) heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :] heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1])) # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs paf = util.smart_resize_k(paf, fx=stride, fy=stride) paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :] paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1])) heatmap_avg += heatmap_avg + heatmap / len(multiplier) paf_avg += + paf / len(multiplier) all_peaks = [] peak_counter = 0 for part in range(18): map_ori = heatmap_avg[:, :, part] one_heatmap = gaussian_filter(map_ori, sigma=3) map_left = np.zeros(one_heatmap.shape) map_left[1:, :] = one_heatmap[:-1, :] map_right = np.zeros(one_heatmap.shape) map_right[:-1, :] = one_heatmap[1:, :] map_up = np.zeros(one_heatmap.shape) map_up[:, 1:] = one_heatmap[:, :-1] map_down = np.zeros(one_heatmap.shape) map_down[:, :-1] = one_heatmap[:, 1:] peaks_binary = np.logical_and.reduce( (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1)) peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks] peak_id = range(peak_counter, peak_counter + len(peaks)) peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))] all_peaks.append(peaks_with_score_and_id) peak_counter += len(peaks) # find connection in the specified sequence, center 29 is in the position 15 limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \ [1, 16], [16, 18], [3, 17], [6, 18]] # the middle joints heatmap correpondence mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \ [55, 56], [37, 38], [45, 46]] connection_all = [] special_k = [] mid_num = 10 for k in range(len(mapIdx)): score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]] candA = all_peaks[limbSeq[k][0] - 1] candB = all_peaks[limbSeq[k][1] - 1] nA = len(candA) nB = len(candB) indexA, indexB = limbSeq[k] if (nA != 0 and nB != 0): connection_candidate = [] for i in range(nA): for j in range(nB): vec = np.subtract(candB[j][:2], candA[i][:2]) norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1]) norm = max(0.001, norm) vec = np.divide(vec, norm) startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \ np.linspace(candA[i][1], candB[j][1], num=mid_num))) vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \ for I in range(len(startend))]) vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \ for I in range(len(startend))]) score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1]) score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min( 0.5 * oriImg.shape[0] / norm - 1, 0) criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts) criterion2 = score_with_dist_prior > 0 if criterion1 and criterion2: connection_candidate.append( [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]]) connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True) connection = np.zeros((0, 5)) for c in range(len(connection_candidate)): i, j, s = connection_candidate[c][0:3] if (i not in connection[:, 3] and j not in connection[:, 4]): connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]]) if (len(connection) >= min(nA, nB)): break connection_all.append(connection) else: special_k.append(k) connection_all.append([]) # last number in each row is the total parts number of that person # the second last number in each row is the score of the overall configuration subset = -1 * np.ones((0, 20)) candidate = np.array([item for sublist in all_peaks for item in sublist]) for k in range(len(mapIdx)): if k not in special_k: partAs = connection_all[k][:, 0] partBs = connection_all[k][:, 1] indexA, indexB = np.array(limbSeq[k]) - 1 for i in range(len(connection_all[k])): # = 1:size(temp,1) found = 0 subset_idx = [-1, -1] for j in range(len(subset)): # 1:size(subset,1): if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]: subset_idx[found] = j found += 1 if found == 1: j = subset_idx[0] if subset[j][indexB] != partBs[i]: subset[j][indexB] = partBs[i] subset[j][-1] += 1 subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2] elif found == 2: # if found 2 and disjoint, merge them j1, j2 = subset_idx membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2] if len(np.nonzero(membership == 2)[0]) == 0: # merge subset[j1][:-2] += (subset[j2][:-2] + 1) subset[j1][-2:] += subset[j2][-2:] subset[j1][-2] += connection_all[k][i][2] subset = np.delete(subset, j2, 0) else: # as like found == 1 subset[j1][indexB] = partBs[i] subset[j1][-1] += 1 subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2] # if find no partA in the subset, create a new subset elif not found and k < 17: row = -1 * np.ones(20) row[indexA] = partAs[i] row[indexB] = partBs[i] row[-1] = 2 row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2] subset = np.vstack([subset, row]) # delete some rows of subset which has few parts occur deleteIdx = [] for i in range(len(subset)): if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4: deleteIdx.append(i) subset = np.delete(subset, deleteIdx, axis=0) # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts # candidate: x, y, score, id return candidate, subset @staticmethod def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]: """ Format the body results from the candidate and subset arrays into a list of BodyResult objects. Args: candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id for each body part. subset (np.ndarray): An array of subsets containing indices to the candidate array for each person detected. The last two columns of each row hold the total score and total parts of the person. Returns: List[BodyResult]: A list of BodyResult objects, where each object represents a person with detected keypoints, total score, and total parts. """ return [ BodyResult( keypoints=[ Keypoint( x=candidate[candidate_index][0], y=candidate[candidate_index][1], score=candidate[candidate_index][2], id=candidate[candidate_index][3] ) if candidate_index != -1 else None for candidate_index in person[:18].astype(int) ], total_score=person[18], total_parts=person[19] ) for person in subset ] if __name__ == "__main__": body_estimation = Body('../model/body_pose_model.pth') test_image = '../images/ski.jpg' oriImg = cv2.imread(test_image) # B,G,R order candidate, subset = body_estimation(oriImg) bodies = body_estimation.format_body_result(candidate, subset) canvas = oriImg for body in bodies: canvas = util.draw_bodypose(canvas, body) plt.imshow(canvas[:, :, [2, 1, 0]]) plt.show() ================================================ FILE: annotator/openpose/cv_ox_det.py ================================================ import cv2 import numpy as np def nms(boxes, scores, nms_thr): """Single class NMS implemented in Numpy.""" x1 = boxes[:, 0] y1 = boxes[:, 1] x2 = boxes[:, 2] y2 = boxes[:, 3] areas = (x2 - x1 + 1) * (y2 - y1 + 1) order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) xx1 = np.maximum(x1[i], x1[order[1:]]) yy1 = np.maximum(y1[i], y1[order[1:]]) xx2 = np.minimum(x2[i], x2[order[1:]]) yy2 = np.minimum(y2[i], y2[order[1:]]) w = np.maximum(0.0, xx2 - xx1 + 1) h = np.maximum(0.0, yy2 - yy1 + 1) inter = w * h ovr = inter / (areas[i] + areas[order[1:]] - inter) inds = np.where(ovr <= nms_thr)[0] order = order[inds + 1] return keep def multiclass_nms(boxes, scores, nms_thr, score_thr): """Multiclass NMS implemented in Numpy. Class-aware version.""" final_dets = [] num_classes = scores.shape[1] for cls_ind in range(num_classes): cls_scores = scores[:, cls_ind] valid_score_mask = cls_scores > score_thr if valid_score_mask.sum() == 0: continue else: valid_scores = cls_scores[valid_score_mask] valid_boxes = boxes[valid_score_mask] keep = nms(valid_boxes, valid_scores, nms_thr) if len(keep) > 0: cls_inds = np.ones((len(keep), 1)) * cls_ind dets = np.concatenate( [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 ) final_dets.append(dets) if len(final_dets) == 0: return None return np.concatenate(final_dets, 0) def demo_postprocess(outputs, img_size, p6=False): grids = [] expanded_strides = [] strides = [8, 16, 32] if not p6 else [8, 16, 32, 64] hsizes = [img_size[0] // stride for stride in strides] wsizes = [img_size[1] // stride for stride in strides] for hsize, wsize, stride in zip(hsizes, wsizes, strides): xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) grid = np.stack((xv, yv), 2).reshape(1, -1, 2) grids.append(grid) shape = grid.shape[:2] expanded_strides.append(np.full((*shape, 1), stride)) grids = np.concatenate(grids, 1) expanded_strides = np.concatenate(expanded_strides, 1) outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides return outputs def preprocess(img, input_size, swap=(2, 0, 1)): if len(img.shape) == 3: padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 else: padded_img = np.ones(input_size, dtype=np.uint8) * 114 r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) resized_img = cv2.resize( img, (int(img.shape[1] * r), int(img.shape[0] * r)), interpolation=cv2.INTER_LINEAR, ).astype(np.uint8) padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img padded_img = padded_img.transpose(swap) padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) return padded_img, r def inference_detector(session, oriImg, detect_classes=[0]): input_shape = (640,640) img, ratio = preprocess(oriImg, input_shape) input = img[None, :, :, :] if "InferenceSession" in type(session).__name__: input_name = session.get_inputs()[0].name output = session.run(None, {input_name: input}) else: outNames = session.getUnconnectedOutLayersNames() session.setInput(input) output = session.forward(outNames) predictions = demo_postprocess(output[0], input_shape)[0] boxes = predictions[:, :4] scores = predictions[:, 4:5] * predictions[:, 5:] boxes_xyxy = np.ones_like(boxes) boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2. boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2. boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2. boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2. boxes_xyxy /= ratio dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1) if dets is None: return None final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] isscore = final_scores>0.3 iscat = np.isin(final_cls_inds, detect_classes) isbbox = [ i and j for (i, j) in zip(isscore, iscat)] final_boxes = final_boxes[isbbox] return final_boxes ================================================ FILE: annotator/openpose/cv_ox_pose.py ================================================ from typing import List, Tuple import cv2 import numpy as np def preprocess( img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256) ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """Do preprocessing for DWPose model inference. Args: img (np.ndarray): Input image in shape. input_size (tuple): Input image size in shape (w, h). Returns: tuple: - resized_img (np.ndarray): Preprocessed image. - center (np.ndarray): Center of image. - scale (np.ndarray): Scale of image. """ # get shape of image img_shape = img.shape[:2] out_img, out_center, out_scale = [], [], [] if len(out_bbox) == 0: out_bbox = [[0, 0, img_shape[1], img_shape[0]]] for i in range(len(out_bbox)): x0 = out_bbox[i][0] y0 = out_bbox[i][1] x1 = out_bbox[i][2] y1 = out_bbox[i][3] bbox = np.array([x0, y0, x1, y1]) # get center and scale center, scale = bbox_xyxy2cs(bbox, padding=1.25) # do affine transformation resized_img, scale = top_down_affine(input_size, scale, center, img) # normalize image mean = np.array([123.675, 116.28, 103.53]) std = np.array([58.395, 57.12, 57.375]) resized_img = (resized_img - mean) / std out_img.append(resized_img) out_center.append(center) out_scale.append(scale) return out_img, out_center, out_scale def inference(sess, img): """Inference DWPose model. Args: sess : ONNXRuntime session. img : Input image in shape. Returns: outputs : Output of DWPose model. """ all_out = [] # build input input = np.stack(img, axis=0).transpose(0, 3, 1, 2) input = input.astype(np.float32) if "InferenceSession" in type(sess).__name__: input_name = sess.get_inputs()[0].name all_outputs = sess.run(None, {input_name: input}) for batch_idx in range(len(all_outputs[0])): outputs = [all_outputs[i][batch_idx:batch_idx+1,...] for i in range(len(all_outputs))] all_out.append(outputs) return all_out for i in range(len(img)): input = img[i].transpose(2, 0, 1) input = input[None, :, :, :] outNames = sess.getUnconnectedOutLayersNames() sess.setInput(input) outputs = sess.forward(outNames) all_out.append(outputs) return all_out def postprocess(outputs: List[np.ndarray], model_input_size: Tuple[int, int], center: Tuple[int, int], scale: Tuple[int, int], simcc_split_ratio: float = 2.0 ) -> Tuple[np.ndarray, np.ndarray]: """Postprocess for DWPose model output. Args: outputs (np.ndarray): Output of RTMPose model. model_input_size (tuple): RTMPose model Input image size. center (tuple): Center of bbox in shape (x, y). scale (tuple): Scale of bbox in shape (w, h). simcc_split_ratio (float): Split ratio of simcc. Returns: tuple: - keypoints (np.ndarray): Rescaled keypoints. - scores (np.ndarray): Model predict scores. """ all_key = [] all_score = [] for i in range(len(outputs)): # use simcc to decode simcc_x, simcc_y = outputs[i] keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio) # rescale keypoints keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2 all_key.append(keypoints[0]) all_score.append(scores[0]) return np.array(all_key), np.array(all_score) def bbox_xyxy2cs(bbox: np.ndarray, padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]: """Transform the bbox format from (x,y,w,h) into (center, scale) Args: bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted as (left, top, right, bottom) padding (float): BBox padding factor that will be multilied to scale. Default: 1.0 Returns: tuple: A tuple containing center and scale. - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or (n, 2) - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or (n, 2) """ # convert single bbox from (4, ) to (1, 4) dim = bbox.ndim if dim == 1: bbox = bbox[None, :] # get bbox center and scale x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3]) center = np.hstack([x1 + x2, y1 + y2]) * 0.5 scale = np.hstack([x2 - x1, y2 - y1]) * padding if dim == 1: center = center[0] scale = scale[0] return center, scale def _fix_aspect_ratio(bbox_scale: np.ndarray, aspect_ratio: float) -> np.ndarray: """Extend the scale to match the given aspect ratio. Args: scale (np.ndarray): The image scale (w, h) in shape (2, ) aspect_ratio (float): The ratio of ``w/h`` Returns: np.ndarray: The reshaped image scale in (2, ) """ w, h = np.hsplit(bbox_scale, [1]) bbox_scale = np.where(w > h * aspect_ratio, np.hstack([w, w / aspect_ratio]), np.hstack([h * aspect_ratio, h])) return bbox_scale def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray: """Rotate a point by an angle. Args: pt (np.ndarray): 2D point coordinates (x, y) in shape (2, ) angle_rad (float): rotation angle in radian Returns: np.ndarray: Rotated point in shape (2, ) """ sn, cs = np.sin(angle_rad), np.cos(angle_rad) rot_mat = np.array([[cs, -sn], [sn, cs]]) return rot_mat @ pt def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray: """To calculate the affine matrix, three pairs of points are required. This function is used to get the 3rd point, given 2D points a & b. The 3rd point is defined by rotating vector `a - b` by 90 degrees anticlockwise, using b as the rotation center. Args: a (np.ndarray): The 1st point (x,y) in shape (2, ) b (np.ndarray): The 2nd point (x,y) in shape (2, ) Returns: np.ndarray: The 3rd point. """ direction = a - b c = b + np.r_[-direction[1], direction[0]] return c def get_warp_matrix(center: np.ndarray, scale: np.ndarray, rot: float, output_size: Tuple[int, int], shift: Tuple[float, float] = (0., 0.), inv: bool = False) -> np.ndarray: """Calculate the affine transformation matrix that can warp the bbox area in the input image to the output size. Args: center (np.ndarray[2, ]): Center of the bounding box (x, y). scale (np.ndarray[2, ]): Scale of the bounding box wrt [width, height]. rot (float): Rotation angle (degree). output_size (np.ndarray[2, ] | list(2,)): Size of the destination heatmaps. shift (0-100%): Shift translation ratio wrt the width/height. Default (0., 0.). inv (bool): Option to inverse the affine transform direction. (inv=False: src->dst or inv=True: dst->src) Returns: np.ndarray: A 2x3 transformation matrix """ shift = np.array(shift) src_w = scale[0] dst_w = output_size[0] dst_h = output_size[1] # compute transformation matrix rot_rad = np.deg2rad(rot) src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad) dst_dir = np.array([0., dst_w * -0.5]) # get four corners of the src rectangle in the original image src = np.zeros((3, 2), dtype=np.float32) src[0, :] = center + scale * shift src[1, :] = center + src_dir + scale * shift src[2, :] = _get_3rd_point(src[0, :], src[1, :]) # get four corners of the dst rectangle in the input image dst = np.zeros((3, 2), dtype=np.float32) dst[0, :] = [dst_w * 0.5, dst_h * 0.5] dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) if inv: warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src)) else: warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst)) return warp_mat def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict, img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Get the bbox image as the model input by affine transform. Args: input_size (dict): The input size of the model. bbox_scale (dict): The bbox scale of the img. bbox_center (dict): The bbox center of the img. img (np.ndarray): The original image. Returns: tuple: A tuple containing center and scale. - np.ndarray[float32]: img after affine transform. - np.ndarray[float32]: bbox scale after affine transform. """ w, h = input_size warp_size = (int(w), int(h)) # reshape bbox to fixed aspect ratio bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h) # get the affine matrix center = bbox_center scale = bbox_scale rot = 0 warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h)) # do affine transform img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR) return img, bbox_scale def get_simcc_maximum(simcc_x: np.ndarray, simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Get maximum response location and value from simcc representations. Note: instance number: N num_keypoints: K heatmap height: H heatmap width: W Args: simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx) simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy) Returns: tuple: - locs (np.ndarray): locations of maximum heatmap responses in shape (K, 2) or (N, K, 2) - vals (np.ndarray): values of maximum heatmap responses in shape (K,) or (N, K) """ N, K, Wx = simcc_x.shape simcc_x = simcc_x.reshape(N * K, -1) simcc_y = simcc_y.reshape(N * K, -1) # get maximum value locations x_locs = np.argmax(simcc_x, axis=1) y_locs = np.argmax(simcc_y, axis=1) locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32) max_val_x = np.amax(simcc_x, axis=1) max_val_y = np.amax(simcc_y, axis=1) # get maximum value across x and y axis mask = max_val_x > max_val_y max_val_x[mask] = max_val_y[mask] vals = max_val_x locs[vals <= 0.] = -1 # reshape locs = locs.reshape(N, K, 2) vals = vals.reshape(N, K) return locs, vals def decode(simcc_x: np.ndarray, simcc_y: np.ndarray, simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]: """Modulate simcc distribution with Gaussian. Args: simcc_x (np.ndarray[K, Wx]): model predicted simcc in x. simcc_y (np.ndarray[K, Wy]): model predicted simcc in y. simcc_split_ratio (int): The split ratio of simcc. Returns: tuple: A tuple containing center and scale. - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2) - np.ndarray[float32]: scores in shape (K,) or (n, K) """ keypoints, scores = get_simcc_maximum(simcc_x, simcc_y) keypoints /= simcc_split_ratio return keypoints, scores def inference_pose(session, out_bbox, oriImg, model_input_size: Tuple[int, int]= (288, 384) ): resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size) outputs = inference(session, resized_img) keypoints, scores = postprocess(outputs, model_input_size, center, scale) return keypoints, scores ================================================ FILE: annotator/openpose/face.py ================================================ import logging import numpy as np from torchvision.transforms import ToTensor, ToPILImage import torch import torch.nn.functional as F import cv2 from . import util from torch.nn import Conv2d, Module, ReLU, MaxPool2d, init class FaceNet(Module): """Model the cascading heatmaps. """ def __init__(self): super(FaceNet, self).__init__() # cnn to make feature map self.relu = ReLU() self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2) self.conv1_1 = Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1) self.conv1_2 = Conv2d( in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1) self.conv2_1 = Conv2d( in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1) self.conv2_2 = Conv2d( in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1) self.conv3_1 = Conv2d( in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1) self.conv3_2 = Conv2d( in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) self.conv3_3 = Conv2d( in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) self.conv3_4 = Conv2d( in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1) self.conv4_1 = Conv2d( in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1) self.conv4_2 = Conv2d( in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) self.conv4_3 = Conv2d( in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) self.conv4_4 = Conv2d( in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) self.conv5_1 = Conv2d( in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) self.conv5_2 = Conv2d( in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1) self.conv5_3_CPM = Conv2d( in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1) # stage1 self.conv6_1_CPM = Conv2d( in_channels=128, out_channels=512, kernel_size=1, stride=1, padding=0) self.conv6_2_CPM = Conv2d( in_channels=512, out_channels=71, kernel_size=1, stride=1, padding=0) # stage2 self.Mconv1_stage2 = Conv2d( in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv2_stage2 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv3_stage2 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv4_stage2 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv5_stage2 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv6_stage2 = Conv2d( in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0) self.Mconv7_stage2 = Conv2d( in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0) # stage3 self.Mconv1_stage3 = Conv2d( in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv2_stage3 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv3_stage3 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv4_stage3 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv5_stage3 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv6_stage3 = Conv2d( in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0) self.Mconv7_stage3 = Conv2d( in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0) # stage4 self.Mconv1_stage4 = Conv2d( in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv2_stage4 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv3_stage4 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv4_stage4 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv5_stage4 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv6_stage4 = Conv2d( in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0) self.Mconv7_stage4 = Conv2d( in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0) # stage5 self.Mconv1_stage5 = Conv2d( in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv2_stage5 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv3_stage5 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv4_stage5 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv5_stage5 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv6_stage5 = Conv2d( in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0) self.Mconv7_stage5 = Conv2d( in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0) # stage6 self.Mconv1_stage6 = Conv2d( in_channels=199, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv2_stage6 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv3_stage6 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv4_stage6 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv5_stage6 = Conv2d( in_channels=128, out_channels=128, kernel_size=7, stride=1, padding=3) self.Mconv6_stage6 = Conv2d( in_channels=128, out_channels=128, kernel_size=1, stride=1, padding=0) self.Mconv7_stage6 = Conv2d( in_channels=128, out_channels=71, kernel_size=1, stride=1, padding=0) for m in self.modules(): if isinstance(m, Conv2d): init.constant_(m.bias, 0) def forward(self, x): """Return a list of heatmaps.""" heatmaps = [] h = self.relu(self.conv1_1(x)) h = self.relu(self.conv1_2(h)) h = self.max_pooling_2d(h) h = self.relu(self.conv2_1(h)) h = self.relu(self.conv2_2(h)) h = self.max_pooling_2d(h) h = self.relu(self.conv3_1(h)) h = self.relu(self.conv3_2(h)) h = self.relu(self.conv3_3(h)) h = self.relu(self.conv3_4(h)) h = self.max_pooling_2d(h) h = self.relu(self.conv4_1(h)) h = self.relu(self.conv4_2(h)) h = self.relu(self.conv4_3(h)) h = self.relu(self.conv4_4(h)) h = self.relu(self.conv5_1(h)) h = self.relu(self.conv5_2(h)) h = self.relu(self.conv5_3_CPM(h)) feature_map = h # stage1 h = self.relu(self.conv6_1_CPM(h)) h = self.conv6_2_CPM(h) heatmaps.append(h) # stage2 h = torch.cat([h, feature_map], dim=1) # channel concat h = self.relu(self.Mconv1_stage2(h)) h = self.relu(self.Mconv2_stage2(h)) h = self.relu(self.Mconv3_stage2(h)) h = self.relu(self.Mconv4_stage2(h)) h = self.relu(self.Mconv5_stage2(h)) h = self.relu(self.Mconv6_stage2(h)) h = self.Mconv7_stage2(h) heatmaps.append(h) # stage3 h = torch.cat([h, feature_map], dim=1) # channel concat h = self.relu(self.Mconv1_stage3(h)) h = self.relu(self.Mconv2_stage3(h)) h = self.relu(self.Mconv3_stage3(h)) h = self.relu(self.Mconv4_stage3(h)) h = self.relu(self.Mconv5_stage3(h)) h = self.relu(self.Mconv6_stage3(h)) h = self.Mconv7_stage3(h) heatmaps.append(h) # stage4 h = torch.cat([h, feature_map], dim=1) # channel concat h = self.relu(self.Mconv1_stage4(h)) h = self.relu(self.Mconv2_stage4(h)) h = self.relu(self.Mconv3_stage4(h)) h = self.relu(self.Mconv4_stage4(h)) h = self.relu(self.Mconv5_stage4(h)) h = self.relu(self.Mconv6_stage4(h)) h = self.Mconv7_stage4(h) heatmaps.append(h) # stage5 h = torch.cat([h, feature_map], dim=1) # channel concat h = self.relu(self.Mconv1_stage5(h)) h = self.relu(self.Mconv2_stage5(h)) h = self.relu(self.Mconv3_stage5(h)) h = self.relu(self.Mconv4_stage5(h)) h = self.relu(self.Mconv5_stage5(h)) h = self.relu(self.Mconv6_stage5(h)) h = self.Mconv7_stage5(h) heatmaps.append(h) # stage6 h = torch.cat([h, feature_map], dim=1) # channel concat h = self.relu(self.Mconv1_stage6(h)) h = self.relu(self.Mconv2_stage6(h)) h = self.relu(self.Mconv3_stage6(h)) h = self.relu(self.Mconv4_stage6(h)) h = self.relu(self.Mconv5_stage6(h)) h = self.relu(self.Mconv6_stage6(h)) h = self.Mconv7_stage6(h) heatmaps.append(h) return heatmaps LOG = logging.getLogger(__name__) TOTEN = ToTensor() TOPIL = ToPILImage() params = { 'gaussian_sigma': 2.5, 'inference_img_size': 736, # 368, 736, 1312 'heatmap_peak_thresh': 0.1, 'crop_scale': 1.5, 'line_indices': [ [0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13], [13, 14], [14, 15], [15, 16], [17, 18], [18, 19], [19, 20], [20, 21], [22, 23], [23, 24], [24, 25], [25, 26], [27, 28], [28, 29], [29, 30], [31, 32], [32, 33], [33, 34], [34, 35], [36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36], [42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42], [48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54], [54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48], [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66], [66, 67], [67, 60] ], } class Face(object): """ The OpenPose face landmark detector model. Args: inference_size: set the size of the inference image size, suggested: 368, 736, 1312, default 736 gaussian_sigma: blur the heatmaps, default 2.5 heatmap_peak_thresh: return landmark if over threshold, default 0.1 """ def __init__(self, face_model_path, inference_size=None, gaussian_sigma=None, heatmap_peak_thresh=None): self.inference_size = inference_size or params["inference_img_size"] self.sigma = gaussian_sigma or params['gaussian_sigma'] self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"] self.model = FaceNet() self.model.load_state_dict(torch.load(face_model_path)) # if torch.cuda.is_available(): # self.model = self.model.cuda() # print('cuda') self.model.eval() def __call__(self, face_img): H, W, C = face_img.shape w_size = 384 x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5 x_data = x_data.to(self.cn_device) with torch.no_grad(): hs = self.model(x_data[None, ...]) heatmaps = F.interpolate( hs[-1], (H, W), mode='bilinear', align_corners=True).cpu().numpy()[0] return heatmaps def compute_peaks_from_heatmaps(self, heatmaps): all_peaks = [] for part in range(heatmaps.shape[0]): map_ori = heatmaps[part].copy() binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8) if np.sum(binary) == 0: continue positions = np.where(binary > 0.5) intensities = map_ori[positions] mi = np.argmax(intensities) y, x = positions[0][mi], positions[1][mi] all_peaks.append([x, y]) return np.array(all_peaks) ================================================ FILE: annotator/openpose/hand.py ================================================ import cv2 import json import numpy as np import math import time from scipy.ndimage import gaussian_filter import matplotlib.pyplot as plt import matplotlib import torch from skimage.measure import label from .model import handpose_model from . import util class Hand(object): def __init__(self, model_path): self.model = handpose_model() # if torch.cuda.is_available(): # self.model = self.model.cuda() # print('cuda') model_dict = util.transfer(self.model, torch.load(model_path)) self.model.load_state_dict(model_dict) self.model.eval() def __call__(self, oriImgRaw): scale_search = [0.5, 1.0, 1.5, 2.0] # scale_search = [0.5] boxsize = 368 stride = 8 padValue = 128 thre = 0.05 multiplier = [x * boxsize for x in scale_search] wsize = 128 heatmap_avg = np.zeros((wsize, wsize, 22)) Hr, Wr, Cr = oriImgRaw.shape oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8) for m in range(len(multiplier)): scale = multiplier[m] imageToTest = util.smart_resize(oriImg, (scale, scale)) imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue) im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5 im = np.ascontiguousarray(im) data = torch.from_numpy(im).float() if torch.cuda.is_available(): data = data.cuda() with torch.no_grad(): data = data.to(self.cn_device) output = self.model(data).cpu().numpy() # extract outputs, resize, and remove padding heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride) heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :] heatmap = util.smart_resize(heatmap, (wsize, wsize)) heatmap_avg += heatmap / len(multiplier) all_peaks = [] for part in range(21): map_ori = heatmap_avg[:, :, part] one_heatmap = gaussian_filter(map_ori, sigma=3) binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8) if np.sum(binary) == 0: all_peaks.append([0, 0]) continue label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim) max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1 label_img[label_img != max_index] = 0 map_ori[label_img == 0] = 0 y, x = util.npmax(map_ori) y = int(float(y) * float(Hr) / float(wsize)) x = int(float(x) * float(Wr) / float(wsize)) all_peaks.append([x, y]) return np.array(all_peaks) if __name__ == "__main__": hand_estimation = Hand('../model/hand_pose_model.pth') # test_image = '../images/hand.jpg' test_image = '../images/hand.jpg' oriImg = cv2.imread(test_image) # B,G,R order peaks = hand_estimation(oriImg) canvas = util.draw_handpose(oriImg, peaks, True) cv2.imshow('', canvas) cv2.waitKey(0) ================================================ FILE: annotator/openpose/model.py ================================================ import torch from collections import OrderedDict import torch import torch.nn as nn def make_layers(block, no_relu_layers): layers = [] for layer_name, v in block.items(): if 'pool' in layer_name: layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2]) layers.append((layer_name, layer)) else: conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], kernel_size=v[2], stride=v[3], padding=v[4]) layers.append((layer_name, conv2d)) if layer_name not in no_relu_layers: layers.append(('relu_'+layer_name, nn.ReLU(inplace=True))) return nn.Sequential(OrderedDict(layers)) class bodypose_model(nn.Module): def __init__(self): super(bodypose_model, self).__init__() # these layers have no relu layer no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1'] blocks = {} block0 = OrderedDict([ ('conv1_1', [3, 64, 3, 1, 1]), ('conv1_2', [64, 64, 3, 1, 1]), ('pool1_stage1', [2, 2, 0]), ('conv2_1', [64, 128, 3, 1, 1]), ('conv2_2', [128, 128, 3, 1, 1]), ('pool2_stage1', [2, 2, 0]), ('conv3_1', [128, 256, 3, 1, 1]), ('conv3_2', [256, 256, 3, 1, 1]), ('conv3_3', [256, 256, 3, 1, 1]), ('conv3_4', [256, 256, 3, 1, 1]), ('pool3_stage1', [2, 2, 0]), ('conv4_1', [256, 512, 3, 1, 1]), ('conv4_2', [512, 512, 3, 1, 1]), ('conv4_3_CPM', [512, 256, 3, 1, 1]), ('conv4_4_CPM', [256, 128, 3, 1, 1]) ]) # Stage 1 block1_1 = OrderedDict([ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]), ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]), ('conv5_5_CPM_L1', [512, 38, 1, 1, 0]) ]) block1_2 = OrderedDict([ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]), ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]), ('conv5_5_CPM_L2', [512, 19, 1, 1, 0]) ]) blocks['block1_1'] = block1_1 blocks['block1_2'] = block1_2 self.model0 = make_layers(block0, no_relu_layers) # Stages 2 - 6 for i in range(2, 7): blocks['block%d_1' % i] = OrderedDict([ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]), ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]), ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]), ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]), ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]), ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]), ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0]) ]) blocks['block%d_2' % i] = OrderedDict([ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]), ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]), ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]), ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]), ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]), ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]), ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0]) ]) for k in blocks.keys(): blocks[k] = make_layers(blocks[k], no_relu_layers) self.model1_1 = blocks['block1_1'] self.model2_1 = blocks['block2_1'] self.model3_1 = blocks['block3_1'] self.model4_1 = blocks['block4_1'] self.model5_1 = blocks['block5_1'] self.model6_1 = blocks['block6_1'] self.model1_2 = blocks['block1_2'] self.model2_2 = blocks['block2_2'] self.model3_2 = blocks['block3_2'] self.model4_2 = blocks['block4_2'] self.model5_2 = blocks['block5_2'] self.model6_2 = blocks['block6_2'] def forward(self, x): out1 = self.model0(x) out1_1 = self.model1_1(out1) out1_2 = self.model1_2(out1) out2 = torch.cat([out1_1, out1_2, out1], 1) out2_1 = self.model2_1(out2) out2_2 = self.model2_2(out2) out3 = torch.cat([out2_1, out2_2, out1], 1) out3_1 = self.model3_1(out3) out3_2 = self.model3_2(out3) out4 = torch.cat([out3_1, out3_2, out1], 1) out4_1 = self.model4_1(out4) out4_2 = self.model4_2(out4) out5 = torch.cat([out4_1, out4_2, out1], 1) out5_1 = self.model5_1(out5) out5_2 = self.model5_2(out5) out6 = torch.cat([out5_1, out5_2, out1], 1) out6_1 = self.model6_1(out6) out6_2 = self.model6_2(out6) return out6_1, out6_2 class handpose_model(nn.Module): def __init__(self): super(handpose_model, self).__init__() # these layers have no relu layer no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6'] # stage 1 block1_0 = OrderedDict([ ('conv1_1', [3, 64, 3, 1, 1]), ('conv1_2', [64, 64, 3, 1, 1]), ('pool1_stage1', [2, 2, 0]), ('conv2_1', [64, 128, 3, 1, 1]), ('conv2_2', [128, 128, 3, 1, 1]), ('pool2_stage1', [2, 2, 0]), ('conv3_1', [128, 256, 3, 1, 1]), ('conv3_2', [256, 256, 3, 1, 1]), ('conv3_3', [256, 256, 3, 1, 1]), ('conv3_4', [256, 256, 3, 1, 1]), ('pool3_stage1', [2, 2, 0]), ('conv4_1', [256, 512, 3, 1, 1]), ('conv4_2', [512, 512, 3, 1, 1]), ('conv4_3', [512, 512, 3, 1, 1]), ('conv4_4', [512, 512, 3, 1, 1]), ('conv5_1', [512, 512, 3, 1, 1]), ('conv5_2', [512, 512, 3, 1, 1]), ('conv5_3_CPM', [512, 128, 3, 1, 1]) ]) block1_1 = OrderedDict([ ('conv6_1_CPM', [128, 512, 1, 1, 0]), ('conv6_2_CPM', [512, 22, 1, 1, 0]) ]) blocks = {} blocks['block1_0'] = block1_0 blocks['block1_1'] = block1_1 # stage 2-6 for i in range(2, 7): blocks['block%d' % i] = OrderedDict([ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]), ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]), ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]), ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]), ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]), ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]), ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0]) ]) for k in blocks.keys(): blocks[k] = make_layers(blocks[k], no_relu_layers) self.model1_0 = blocks['block1_0'] self.model1_1 = blocks['block1_1'] self.model2 = blocks['block2'] self.model3 = blocks['block3'] self.model4 = blocks['block4'] self.model5 = blocks['block5'] self.model6 = blocks['block6'] def forward(self, x): out1_0 = self.model1_0(x) out1_1 = self.model1_1(out1_0) concat_stage2 = torch.cat([out1_1, out1_0], 1) out_stage2 = self.model2(concat_stage2) concat_stage3 = torch.cat([out_stage2, out1_0], 1) out_stage3 = self.model3(concat_stage3) concat_stage4 = torch.cat([out_stage3, out1_0], 1) out_stage4 = self.model4(concat_stage4) concat_stage5 = torch.cat([out_stage4, out1_0], 1) out_stage5 = self.model5(concat_stage5) concat_stage6 = torch.cat([out_stage5, out1_0], 1) out_stage6 = self.model6(concat_stage6) return out_stage6 ================================================ FILE: annotator/openpose/types.py ================================================ from typing import NamedTuple, List, Optional, Union class Keypoint(NamedTuple): x: float y: float score: float = 1.0 id: int = -1 class BodyResult(NamedTuple): # Note: Using `Optional` instead of `|` operator as the ladder is a Python # 3.10 feature. # Annotator code should be Python 3.8 Compatible, as controlnet repo uses # Python 3.8 environment. # https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6 keypoints: List[Optional[Keypoint]] total_score: float = 0.0 total_parts: int = 0 HandResult = List[Keypoint] FaceResult = List[Keypoint] AnimalPoseResult = List[Keypoint] class HumanPoseResult(NamedTuple): body: BodyResult left_hand: Optional[HandResult] right_hand: Optional[HandResult] face: Optional[FaceResult] ================================================ FILE: annotator/openpose/util.py ================================================ import math import numpy as np import matplotlib import cv2 from typing import List, Tuple, Union, Optional from .body import BodyResult, Keypoint eps = 0.01 def smart_resize(x, s): Ht, Wt = s if x.ndim == 2: Ho, Wo = x.shape Co = 1 else: Ho, Wo, Co = x.shape if Co == 3 or Co == 1: k = float(Ht + Wt) / float(Ho + Wo) return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4) else: return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2) def smart_resize_k(x, fx, fy): if x.ndim == 2: Ho, Wo = x.shape Co = 1 else: Ho, Wo, Co = x.shape Ht, Wt = Ho * fy, Wo * fx if Co == 3 or Co == 1: k = float(Ht + Wt) / float(Ho + Wo) return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4) else: return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2) def padRightDownCorner(img, stride, padValue): h = img.shape[0] w = img.shape[1] pad = 4 * [None] pad[0] = 0 # up pad[1] = 0 # left pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right img_padded = img pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1)) img_padded = np.concatenate((pad_up, img_padded), axis=0) pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1)) img_padded = np.concatenate((pad_left, img_padded), axis=1) pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1)) img_padded = np.concatenate((img_padded, pad_down), axis=0) pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1)) img_padded = np.concatenate((img_padded, pad_right), axis=1) return img_padded, pad def transfer(model, model_weights): transfered_model_weights = {} for weights_name in model.state_dict().keys(): transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])] return transfered_model_weights def is_normalized(keypoints: List[Optional[Keypoint]]) -> bool: point_normalized = [ 0 <= abs(k.x) <= 1 and 0 <= abs(k.y) <= 1 for k in keypoints if k is not None ] if not point_normalized: return False return all(point_normalized) def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint]) -> np.ndarray: """ Draw keypoints and limbs representing body pose on a given canvas. Args: canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose. keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn. Returns: np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose. Note: The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1. """ if not is_normalized(keypoints): H, W = 1.0, 1.0 else: H, W, _ = canvas.shape stickwidth = 4 limbSeq = [ [2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], [1, 16], [16, 18], ] colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] for (k1_index, k2_index), color in zip(limbSeq, colors): keypoint1 = keypoints[k1_index - 1] keypoint2 = keypoints[k2_index - 1] if keypoint1 is None or keypoint2 is None: continue Y = np.array([keypoint1.x, keypoint2.x]) * float(W) X = np.array([keypoint1.y, keypoint2.y]) * float(H) mX = np.mean(X) mY = np.mean(Y) length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color]) for keypoint, color in zip(keypoints, colors): if keypoint is None: continue x, y = keypoint.x, keypoint.y x = int(x * W) y = int(y * H) cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1) return canvas def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray: """ Draw keypoints and connections representing hand pose on a given canvas. Args: canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose. keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn or None if no keypoints are present. Returns: np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose. Note: The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1. """ if not keypoints: return canvas if not is_normalized(keypoints): H, W = 1.0, 1.0 else: H, W, _ = canvas.shape edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]] for ie, (e1, e2) in enumerate(edges): k1 = keypoints[e1] k2 = keypoints[e2] if k1 is None or k2 is None: continue x1 = int(k1.x * W) y1 = int(k1.y * H) x2 = int(k2.x * W) y2 = int(k2.y * H) if x1 > eps and y1 > eps and x2 > eps and y2 > eps: cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2) for keypoint in keypoints: if keypoint is None: continue x, y = keypoint.x, keypoint.y x = int(x * W) y = int(y * H) if x > eps and y > eps: cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1) return canvas def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray: """ Draw keypoints representing face pose on a given canvas. Args: canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose. keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn or None if no keypoints are present. Returns: np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose. Note: The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1. """ if not keypoints: return canvas if not is_normalized(keypoints): H, W = 1.0, 1.0 else: H, W, _ = canvas.shape for keypoint in keypoints: if keypoint is None: continue x, y = keypoint.x, keypoint.y x = int(x * W) y = int(y * H) if x > eps and y > eps: cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1) return canvas # detect hand according to body pose keypoints # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]: """ Detect hands in the input body pose keypoints and calculate the bounding box for each hand. Args: body (BodyResult): A BodyResult object containing the detected body pose keypoints. oriImg (numpy.ndarray): A 3D numpy array representing the original input image. Returns: List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left corner of the bounding box, the width (height) of the bounding box, and a boolean flag indicating whether the hand is a left hand (True) or a right hand (False). Notes: - The width and height of the bounding boxes are equal since the network requires squared input. - The minimum bounding box size is 20 pixels. """ ratioWristElbow = 0.33 detect_result = [] image_height, image_width = oriImg.shape[0:2] keypoints = body.keypoints # right hand: wrist 4, elbow 3, shoulder 2 # left hand: wrist 7, elbow 6, shoulder 5 left_shoulder = keypoints[5] left_elbow = keypoints[6] left_wrist = keypoints[7] right_shoulder = keypoints[2] right_elbow = keypoints[3] right_wrist = keypoints[4] # if any of three not detected has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist)) has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist)) if not (has_left or has_right): return [] hands = [] #left hand if has_left: hands.append([ left_shoulder.x, left_shoulder.y, left_elbow.x, left_elbow.y, left_wrist.x, left_wrist.y, True ]) # right hand if has_right: hands.append([ right_shoulder.x, right_shoulder.y, right_elbow.x, right_elbow.y, right_wrist.x, right_wrist.y, False ]) for x1, y1, x2, y2, x3, y3, is_left in hands: # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]); # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]); # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow); # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder); # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); x = x3 + ratioWristElbow * (x3 - x2) y = y3 + ratioWristElbow * (y3 - y2) distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) # x-y refers to the center --> offset to topLeft point # handRectangle.x -= handRectangle.width / 2.f; # handRectangle.y -= handRectangle.height / 2.f; x -= width / 2 y -= width / 2 # width = height # overflow the image if x < 0: x = 0 if y < 0: y = 0 width1 = width width2 = width if x + width > image_width: width1 = image_width - x if y + width > image_height: width2 = image_height - y width = min(width1, width2) # the max hand box value is 20 pixels if width >= 20: detect_result.append((int(x), int(y), int(width), is_left)) ''' return value: [[x, y, w, True if left hand else False]]. width=height since the network require squared input. x, y is the coordinate of top left ''' return detect_result # Written by Lvmin def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]: """ Detect the face in the input body pose keypoints and calculate the bounding box for the face. Args: body (BodyResult): A BodyResult object containing the detected body pose keypoints. oriImg (numpy.ndarray): A 3D numpy array representing the original input image. Returns: Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the bounding box and the width (height) of the bounding box, or None if the face is not detected or the bounding box width is less than 20 pixels. Notes: - The width and height of the bounding box are equal. - The minimum bounding box size is 20 pixels. """ # left right eye ear 14 15 16 17 image_height, image_width = oriImg.shape[0:2] keypoints = body.keypoints head = keypoints[0] left_eye = keypoints[14] right_eye = keypoints[15] left_ear = keypoints[16] right_ear = keypoints[17] if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)): return None width = 0.0 x0, y0 = head.x, head.y if left_eye is not None: x1, y1 = left_eye.x, left_eye.y d = max(abs(x0 - x1), abs(y0 - y1)) width = max(width, d * 3.0) if right_eye is not None: x1, y1 = right_eye.x, right_eye.y d = max(abs(x0 - x1), abs(y0 - y1)) width = max(width, d * 3.0) if left_ear is not None: x1, y1 = left_ear.x, left_ear.y d = max(abs(x0 - x1), abs(y0 - y1)) width = max(width, d * 1.5) if right_ear is not None: x1, y1 = right_ear.x, right_ear.y d = max(abs(x0 - x1), abs(y0 - y1)) width = max(width, d * 1.5) x, y = x0, y0 x -= width y -= width if x < 0: x = 0 if y < 0: y = 0 width1 = width * 2 width2 = width * 2 if x + width > image_width: width1 = image_width - x if y + width > image_height: width2 = image_height - y width = min(width1, width2) if width >= 20: return int(x), int(y), int(width) else: return None # get max index of 2d array def npmax(array): arrayindex = array.argmax(1) arrayvalue = array.max(1) i = arrayvalue.argmax() j = arrayindex[i] return i, j ================================================ FILE: annotator/openpose/wholebody.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import cv2 import numpy as np from .cv_ox_det import inference_detector from .cv_ox_pose import inference_pose from typing import List, Optional from .types import HumanPoseResult, BodyResult, Keypoint class Wholebody: def __init__(self, onnx_det: str, onnx_pose: str): # Always loads to CPU to avoid building OpenCV. device = 'cpu' backend = cv2.dnn.DNN_BACKEND_OPENCV if device == 'cpu' else cv2.dnn.DNN_BACKEND_CUDA # You need to manually build OpenCV through cmake to work with your GPU. providers = cv2.dnn.DNN_TARGET_CPU if device == 'cpu' else cv2.dnn.DNN_TARGET_CUDA self.session_det = cv2.dnn.readNetFromONNX(onnx_det) self.session_det.setPreferableBackend(backend) self.session_det.setPreferableTarget(providers) self.session_pose = cv2.dnn.readNetFromONNX(onnx_pose) self.session_pose.setPreferableBackend(backend) self.session_pose.setPreferableTarget(providers) def __call__(self, oriImg) -> Optional[np.ndarray]: det_result = inference_detector(self.session_det, oriImg) if det_result is None: return None keypoints, scores = inference_pose(self.session_pose, det_result, oriImg) keypoints_info = np.concatenate( (keypoints, scores[..., None]), axis=-1) # compute neck joint neck = np.mean(keypoints_info[:, [5, 6]], axis=1) # neck score when visualizing pred neck[:, 2:4] = np.logical_and( keypoints_info[:, 5, 2:4] > 0.3, keypoints_info[:, 6, 2:4] > 0.3).astype(int) new_keypoints_info = np.insert( keypoints_info, 17, neck, axis=1) mmpose_idx = [ 17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3 ] openpose_idx = [ 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17 ] new_keypoints_info[:, openpose_idx] = \ new_keypoints_info[:, mmpose_idx] keypoints_info = new_keypoints_info return keypoints_info @staticmethod def format_result(keypoints_info: Optional[np.ndarray]) -> List[HumanPoseResult]: def format_keypoint_part( part: np.ndarray, ) -> Optional[List[Optional[Keypoint]]]: keypoints = [ Keypoint(x, y, score, i) if score >= 0.3 else None for i, (x, y, score) in enumerate(part) ] return ( None if all(keypoint is None for keypoint in keypoints) else keypoints ) def total_score(keypoints: Optional[List[Optional[Keypoint]]]) -> float: return ( sum(keypoint.score for keypoint in keypoints if keypoint is not None) if keypoints is not None else 0.0 ) pose_results = [] if keypoints_info is None: return pose_results for instance in keypoints_info: body_keypoints = format_keypoint_part(instance[:18]) or ([None] * 18) left_hand = format_keypoint_part(instance[92:113]) right_hand = format_keypoint_part(instance[113:134]) face = format_keypoint_part(instance[24:92]) # Openpose face consists of 70 points in total, while DWPose only # provides 68 points. Padding the last 2 points. if face is not None: # left eye face.append(body_keypoints[14]) # right eye face.append(body_keypoints[15]) body = BodyResult( body_keypoints, total_score(body_keypoints), len(body_keypoints) ) pose_results.append(HumanPoseResult(body, left_hand, right_hand, face)) return pose_results ================================================ FILE: annotator/pidinet/LICENSE ================================================ It is just for research purpose, and commercial use should be contacted with authors first. Copyright (c) 2021 Zhuo Su Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/pidinet/__init__.py ================================================ import os import torch import numpy as np from einops import rearrange from annotator.pidinet.model import pidinet from annotator.util import safe_step from modules import devices from annotator.annotator_path import models_path from scripts.utils import load_state_dict netNetwork = None remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/table5_pidinet.pth" modeldir = os.path.join(models_path, "pidinet") old_modeldir = os.path.dirname(os.path.realpath(__file__)) def apply_pidinet(input_image, is_safe=False, apply_fliter=False): global netNetwork if netNetwork is None: modelpath = os.path.join(modeldir, "table5_pidinet.pth") old_modelpath = os.path.join(old_modeldir, "table5_pidinet.pth") if os.path.exists(old_modelpath): modelpath = old_modelpath elif not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=modeldir) netNetwork = pidinet() ckp = load_state_dict(modelpath) netNetwork.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()}) netNetwork = netNetwork.to(devices.get_device_for("controlnet")) netNetwork.eval() assert input_image.ndim == 3 input_image = input_image[:, :, ::-1].copy() with torch.no_grad(): image_pidi = torch.from_numpy(input_image).float().to(devices.get_device_for("controlnet")) image_pidi = image_pidi / 255.0 image_pidi = rearrange(image_pidi, 'h w c -> 1 c h w') edge = netNetwork(image_pidi)[-1] edge = edge.cpu().numpy() if apply_fliter: edge = edge > 0.5 if is_safe: edge = safe_step(edge) edge = (edge * 255.0).clip(0, 255).astype(np.uint8) return edge[0][0] def unload_pid_model(): global netNetwork if netNetwork is not None: netNetwork.cpu() ================================================ FILE: annotator/pidinet/model.py ================================================ """ Author: Zhuo Su, Wenzhe Liu Date: Feb 18, 2021 """ import math import torch import torch.nn as nn import torch.nn.functional as F from modules import devices nets = { 'baseline': { 'layer0': 'cv', 'layer1': 'cv', 'layer2': 'cv', 'layer3': 'cv', 'layer4': 'cv', 'layer5': 'cv', 'layer6': 'cv', 'layer7': 'cv', 'layer8': 'cv', 'layer9': 'cv', 'layer10': 'cv', 'layer11': 'cv', 'layer12': 'cv', 'layer13': 'cv', 'layer14': 'cv', 'layer15': 'cv', }, 'c-v15': { 'layer0': 'cd', 'layer1': 'cv', 'layer2': 'cv', 'layer3': 'cv', 'layer4': 'cv', 'layer5': 'cv', 'layer6': 'cv', 'layer7': 'cv', 'layer8': 'cv', 'layer9': 'cv', 'layer10': 'cv', 'layer11': 'cv', 'layer12': 'cv', 'layer13': 'cv', 'layer14': 'cv', 'layer15': 'cv', }, 'a-v15': { 'layer0': 'ad', 'layer1': 'cv', 'layer2': 'cv', 'layer3': 'cv', 'layer4': 'cv', 'layer5': 'cv', 'layer6': 'cv', 'layer7': 'cv', 'layer8': 'cv', 'layer9': 'cv', 'layer10': 'cv', 'layer11': 'cv', 'layer12': 'cv', 'layer13': 'cv', 'layer14': 'cv', 'layer15': 'cv', }, 'r-v15': { 'layer0': 'rd', 'layer1': 'cv', 'layer2': 'cv', 'layer3': 'cv', 'layer4': 'cv', 'layer5': 'cv', 'layer6': 'cv', 'layer7': 'cv', 'layer8': 'cv', 'layer9': 'cv', 'layer10': 'cv', 'layer11': 'cv', 'layer12': 'cv', 'layer13': 'cv', 'layer14': 'cv', 'layer15': 'cv', }, 'cvvv4': { 'layer0': 'cd', 'layer1': 'cv', 'layer2': 'cv', 'layer3': 'cv', 'layer4': 'cd', 'layer5': 'cv', 'layer6': 'cv', 'layer7': 'cv', 'layer8': 'cd', 'layer9': 'cv', 'layer10': 'cv', 'layer11': 'cv', 'layer12': 'cd', 'layer13': 'cv', 'layer14': 'cv', 'layer15': 'cv', }, 'avvv4': { 'layer0': 'ad', 'layer1': 'cv', 'layer2': 'cv', 'layer3': 'cv', 'layer4': 'ad', 'layer5': 'cv', 'layer6': 'cv', 'layer7': 'cv', 'layer8': 'ad', 'layer9': 'cv', 'layer10': 'cv', 'layer11': 'cv', 'layer12': 'ad', 'layer13': 'cv', 'layer14': 'cv', 'layer15': 'cv', }, 'rvvv4': { 'layer0': 'rd', 'layer1': 'cv', 'layer2': 'cv', 'layer3': 'cv', 'layer4': 'rd', 'layer5': 'cv', 'layer6': 'cv', 'layer7': 'cv', 'layer8': 'rd', 'layer9': 'cv', 'layer10': 'cv', 'layer11': 'cv', 'layer12': 'rd', 'layer13': 'cv', 'layer14': 'cv', 'layer15': 'cv', }, 'cccv4': { 'layer0': 'cd', 'layer1': 'cd', 'layer2': 'cd', 'layer3': 'cv', 'layer4': 'cd', 'layer5': 'cd', 'layer6': 'cd', 'layer7': 'cv', 'layer8': 'cd', 'layer9': 'cd', 'layer10': 'cd', 'layer11': 'cv', 'layer12': 'cd', 'layer13': 'cd', 'layer14': 'cd', 'layer15': 'cv', }, 'aaav4': { 'layer0': 'ad', 'layer1': 'ad', 'layer2': 'ad', 'layer3': 'cv', 'layer4': 'ad', 'layer5': 'ad', 'layer6': 'ad', 'layer7': 'cv', 'layer8': 'ad', 'layer9': 'ad', 'layer10': 'ad', 'layer11': 'cv', 'layer12': 'ad', 'layer13': 'ad', 'layer14': 'ad', 'layer15': 'cv', }, 'rrrv4': { 'layer0': 'rd', 'layer1': 'rd', 'layer2': 'rd', 'layer3': 'cv', 'layer4': 'rd', 'layer5': 'rd', 'layer6': 'rd', 'layer7': 'cv', 'layer8': 'rd', 'layer9': 'rd', 'layer10': 'rd', 'layer11': 'cv', 'layer12': 'rd', 'layer13': 'rd', 'layer14': 'rd', 'layer15': 'cv', }, 'c16': { 'layer0': 'cd', 'layer1': 'cd', 'layer2': 'cd', 'layer3': 'cd', 'layer4': 'cd', 'layer5': 'cd', 'layer6': 'cd', 'layer7': 'cd', 'layer8': 'cd', 'layer9': 'cd', 'layer10': 'cd', 'layer11': 'cd', 'layer12': 'cd', 'layer13': 'cd', 'layer14': 'cd', 'layer15': 'cd', }, 'a16': { 'layer0': 'ad', 'layer1': 'ad', 'layer2': 'ad', 'layer3': 'ad', 'layer4': 'ad', 'layer5': 'ad', 'layer6': 'ad', 'layer7': 'ad', 'layer8': 'ad', 'layer9': 'ad', 'layer10': 'ad', 'layer11': 'ad', 'layer12': 'ad', 'layer13': 'ad', 'layer14': 'ad', 'layer15': 'ad', }, 'r16': { 'layer0': 'rd', 'layer1': 'rd', 'layer2': 'rd', 'layer3': 'rd', 'layer4': 'rd', 'layer5': 'rd', 'layer6': 'rd', 'layer7': 'rd', 'layer8': 'rd', 'layer9': 'rd', 'layer10': 'rd', 'layer11': 'rd', 'layer12': 'rd', 'layer13': 'rd', 'layer14': 'rd', 'layer15': 'rd', }, 'carv4': { 'layer0': 'cd', 'layer1': 'ad', 'layer2': 'rd', 'layer3': 'cv', 'layer4': 'cd', 'layer5': 'ad', 'layer6': 'rd', 'layer7': 'cv', 'layer8': 'cd', 'layer9': 'ad', 'layer10': 'rd', 'layer11': 'cv', 'layer12': 'cd', 'layer13': 'ad', 'layer14': 'rd', 'layer15': 'cv', }, } def createConvFunc(op_type): assert op_type in ['cv', 'cd', 'ad', 'rd'], 'unknown op type: %s' % str(op_type) if op_type == 'cv': return F.conv2d if op_type == 'cd': def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1): assert dilation in [1, 2], 'dilation for cd_conv should be in 1 or 2' assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for cd_conv should be 3x3' assert padding == dilation, 'padding for cd_conv set wrong' weights_c = weights.sum(dim=[2, 3], keepdim=True) yc = F.conv2d(x, weights_c, stride=stride, padding=0, groups=groups) y = F.conv2d(x, weights, bias, stride=stride, padding=padding, dilation=dilation, groups=groups) return y - yc return func elif op_type == 'ad': def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1): assert dilation in [1, 2], 'dilation for ad_conv should be in 1 or 2' assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for ad_conv should be 3x3' assert padding == dilation, 'padding for ad_conv set wrong' shape = weights.shape weights = weights.view(shape[0], shape[1], -1) weights_conv = (weights - weights[:, :, [3, 0, 1, 6, 4, 2, 7, 8, 5]]).view(shape) # clock-wise y = F.conv2d(x, weights_conv, bias, stride=stride, padding=padding, dilation=dilation, groups=groups) return y return func elif op_type == 'rd': def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1): assert dilation in [1, 2], 'dilation for rd_conv should be in 1 or 2' assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for rd_conv should be 3x3' padding = 2 * dilation shape = weights.shape if weights.is_cuda: buffer = torch.cuda.FloatTensor(shape[0], shape[1], 5 * 5).fill_(0).to(devices.get_device_for("controlnet")) else: buffer = torch.zeros(shape[0], shape[1], 5 * 5).to(devices.get_device_for("controlnet")) weights = weights.view(shape[0], shape[1], -1) buffer[:, :, [0, 2, 4, 10, 14, 20, 22, 24]] = weights[:, :, 1:] buffer[:, :, [6, 7, 8, 11, 13, 16, 17, 18]] = -weights[:, :, 1:] buffer[:, :, 12] = 0 buffer = buffer.view(shape[0], shape[1], 5, 5) y = F.conv2d(x, buffer, bias, stride=stride, padding=padding, dilation=dilation, groups=groups) return y return func else: print('impossible to be here unless you force that') return None class Conv2d(nn.Module): def __init__(self, pdc, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False): super(Conv2d, self).__init__() if in_channels % groups != 0: raise ValueError('in_channels must be divisible by groups') if out_channels % groups != 0: raise ValueError('out_channels must be divisible by groups') self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride self.padding = padding self.dilation = dilation self.groups = groups self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size)) if bias: self.bias = nn.Parameter(torch.Tensor(out_channels)) else: self.register_parameter('bias', None) self.reset_parameters() self.pdc = pdc def reset_parameters(self): nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) if self.bias is not None: fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight) bound = 1 / math.sqrt(fan_in) nn.init.uniform_(self.bias, -bound, bound) def forward(self, input): return self.pdc(input, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) class CSAM(nn.Module): """ Compact Spatial Attention Module """ def __init__(self, channels): super(CSAM, self).__init__() mid_channels = 4 self.relu1 = nn.ReLU() self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, padding=0) self.conv2 = nn.Conv2d(mid_channels, 1, kernel_size=3, padding=1, bias=False) self.sigmoid = nn.Sigmoid() nn.init.constant_(self.conv1.bias, 0) def forward(self, x): y = self.relu1(x) y = self.conv1(y) y = self.conv2(y) y = self.sigmoid(y) return x * y class CDCM(nn.Module): """ Compact Dilation Convolution based Module """ def __init__(self, in_channels, out_channels): super(CDCM, self).__init__() self.relu1 = nn.ReLU() self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0) self.conv2_1 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=5, padding=5, bias=False) self.conv2_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=7, padding=7, bias=False) self.conv2_3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=9, padding=9, bias=False) self.conv2_4 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=11, padding=11, bias=False) nn.init.constant_(self.conv1.bias, 0) def forward(self, x): x = self.relu1(x) x = self.conv1(x) x1 = self.conv2_1(x) x2 = self.conv2_2(x) x3 = self.conv2_3(x) x4 = self.conv2_4(x) return x1 + x2 + x3 + x4 class MapReduce(nn.Module): """ Reduce feature maps into a single edge map """ def __init__(self, channels): super(MapReduce, self).__init__() self.conv = nn.Conv2d(channels, 1, kernel_size=1, padding=0) nn.init.constant_(self.conv.bias, 0) def forward(self, x): return self.conv(x) class PDCBlock(nn.Module): def __init__(self, pdc, inplane, ouplane, stride=1): super(PDCBlock, self).__init__() self.stride=stride self.stride=stride if self.stride > 1: self.pool = nn.MaxPool2d(kernel_size=2, stride=2) self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0) self.conv1 = Conv2d(pdc, inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False) self.relu2 = nn.ReLU() self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False) def forward(self, x): if self.stride > 1: x = self.pool(x) y = self.conv1(x) y = self.relu2(y) y = self.conv2(y) if self.stride > 1: x = self.shortcut(x) y = y + x return y class PDCBlock_converted(nn.Module): """ CPDC, APDC can be converted to vanilla 3x3 convolution RPDC can be converted to vanilla 5x5 convolution """ def __init__(self, pdc, inplane, ouplane, stride=1): super(PDCBlock_converted, self).__init__() self.stride=stride if self.stride > 1: self.pool = nn.MaxPool2d(kernel_size=2, stride=2) self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0) if pdc == 'rd': self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=5, padding=2, groups=inplane, bias=False) else: self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False) self.relu2 = nn.ReLU() self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False) def forward(self, x): if self.stride > 1: x = self.pool(x) y = self.conv1(x) y = self.relu2(y) y = self.conv2(y) if self.stride > 1: x = self.shortcut(x) y = y + x return y class PiDiNet(nn.Module): def __init__(self, inplane, pdcs, dil=None, sa=False, convert=False): super(PiDiNet, self).__init__() self.sa = sa if dil is not None: assert isinstance(dil, int), 'dil should be an int' self.dil = dil self.fuseplanes = [] self.inplane = inplane if convert: if pdcs[0] == 'rd': init_kernel_size = 5 init_padding = 2 else: init_kernel_size = 3 init_padding = 1 self.init_block = nn.Conv2d(3, self.inplane, kernel_size=init_kernel_size, padding=init_padding, bias=False) block_class = PDCBlock_converted else: self.init_block = Conv2d(pdcs[0], 3, self.inplane, kernel_size=3, padding=1) block_class = PDCBlock self.block1_1 = block_class(pdcs[1], self.inplane, self.inplane) self.block1_2 = block_class(pdcs[2], self.inplane, self.inplane) self.block1_3 = block_class(pdcs[3], self.inplane, self.inplane) self.fuseplanes.append(self.inplane) # C inplane = self.inplane self.inplane = self.inplane * 2 self.block2_1 = block_class(pdcs[4], inplane, self.inplane, stride=2) self.block2_2 = block_class(pdcs[5], self.inplane, self.inplane) self.block2_3 = block_class(pdcs[6], self.inplane, self.inplane) self.block2_4 = block_class(pdcs[7], self.inplane, self.inplane) self.fuseplanes.append(self.inplane) # 2C inplane = self.inplane self.inplane = self.inplane * 2 self.block3_1 = block_class(pdcs[8], inplane, self.inplane, stride=2) self.block3_2 = block_class(pdcs[9], self.inplane, self.inplane) self.block3_3 = block_class(pdcs[10], self.inplane, self.inplane) self.block3_4 = block_class(pdcs[11], self.inplane, self.inplane) self.fuseplanes.append(self.inplane) # 4C self.block4_1 = block_class(pdcs[12], self.inplane, self.inplane, stride=2) self.block4_2 = block_class(pdcs[13], self.inplane, self.inplane) self.block4_3 = block_class(pdcs[14], self.inplane, self.inplane) self.block4_4 = block_class(pdcs[15], self.inplane, self.inplane) self.fuseplanes.append(self.inplane) # 4C self.conv_reduces = nn.ModuleList() if self.sa and self.dil is not None: self.attentions = nn.ModuleList() self.dilations = nn.ModuleList() for i in range(4): self.dilations.append(CDCM(self.fuseplanes[i], self.dil)) self.attentions.append(CSAM(self.dil)) self.conv_reduces.append(MapReduce(self.dil)) elif self.sa: self.attentions = nn.ModuleList() for i in range(4): self.attentions.append(CSAM(self.fuseplanes[i])) self.conv_reduces.append(MapReduce(self.fuseplanes[i])) elif self.dil is not None: self.dilations = nn.ModuleList() for i in range(4): self.dilations.append(CDCM(self.fuseplanes[i], self.dil)) self.conv_reduces.append(MapReduce(self.dil)) else: for i in range(4): self.conv_reduces.append(MapReduce(self.fuseplanes[i])) self.classifier = nn.Conv2d(4, 1, kernel_size=1) # has bias nn.init.constant_(self.classifier.weight, 0.25) nn.init.constant_(self.classifier.bias, 0) # print('initialization done') def get_weights(self): conv_weights = [] bn_weights = [] relu_weights = [] for pname, p in self.named_parameters(): if 'bn' in pname: bn_weights.append(p) elif 'relu' in pname: relu_weights.append(p) else: conv_weights.append(p) return conv_weights, bn_weights, relu_weights def forward(self, x): H, W = x.size()[2:] x = self.init_block(x) x1 = self.block1_1(x) x1 = self.block1_2(x1) x1 = self.block1_3(x1) x2 = self.block2_1(x1) x2 = self.block2_2(x2) x2 = self.block2_3(x2) x2 = self.block2_4(x2) x3 = self.block3_1(x2) x3 = self.block3_2(x3) x3 = self.block3_3(x3) x3 = self.block3_4(x3) x4 = self.block4_1(x3) x4 = self.block4_2(x4) x4 = self.block4_3(x4) x4 = self.block4_4(x4) x_fuses = [] if self.sa and self.dil is not None: for i, xi in enumerate([x1, x2, x3, x4]): x_fuses.append(self.attentions[i](self.dilations[i](xi))) elif self.sa: for i, xi in enumerate([x1, x2, x3, x4]): x_fuses.append(self.attentions[i](xi)) elif self.dil is not None: for i, xi in enumerate([x1, x2, x3, x4]): x_fuses.append(self.dilations[i](xi)) else: x_fuses = [x1, x2, x3, x4] e1 = self.conv_reduces[0](x_fuses[0]) e1 = F.interpolate(e1, (H, W), mode="bilinear", align_corners=False) e2 = self.conv_reduces[1](x_fuses[1]) e2 = F.interpolate(e2, (H, W), mode="bilinear", align_corners=False) e3 = self.conv_reduces[2](x_fuses[2]) e3 = F.interpolate(e3, (H, W), mode="bilinear", align_corners=False) e4 = self.conv_reduces[3](x_fuses[3]) e4 = F.interpolate(e4, (H, W), mode="bilinear", align_corners=False) outputs = [e1, e2, e3, e4] output = self.classifier(torch.cat(outputs, dim=1)) #if not self.training: # return torch.sigmoid(output) outputs.append(output) outputs = [torch.sigmoid(r) for r in outputs] return outputs def config_model(model): model_options = list(nets.keys()) assert model in model_options, \ 'unrecognized model, please choose from %s' % str(model_options) # print(str(nets[model])) pdcs = [] for i in range(16): layer_name = 'layer%d' % i op = nets[model][layer_name] pdcs.append(createConvFunc(op)) return pdcs def pidinet(): pdcs = config_model('carv4') dil = 24 #if args.dil else None return PiDiNet(60, pdcs, dil=dil, sa=True) ================================================ FILE: annotator/shuffle/__init__.py ================================================ import cv2 import numpy as np from annotator.util import make_noise_disk class ContentShuffleDetector: def __call__(self, img, h=None, w=None, f=None): H, W, C = img.shape if h is None: h = H if w is None: w = W if f is None: f = 256 x = make_noise_disk(h, w, 1, f) * float(W - 1) y = make_noise_disk(h, w, 1, f) * float(H - 1) flow = np.concatenate([x, y], axis=2).astype(np.float32) return cv2.remap(img, flow, None, cv2.INTER_LINEAR) ================================================ FILE: annotator/teed/Fmish.py ================================================ """ Script provides functional interface for Mish activation function. """ # import pytorch import torch import torch.nn.functional as F @torch.jit.script def mish(input): """ Applies the mish function element-wise: mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x))) See additional documentation for mish class. """ return input * torch.tanh(F.softplus(input)) ================================================ FILE: annotator/teed/Fsmish.py ================================================ """ Script based on: Wang, Xueliang, Honge Ren, and Achuan Wang. "Smish: A Novel Activation Function for Deep Learning Methods. " Electronics 11.4 (2022): 540. """ # import pytorch import torch import torch.nn.functional as F @torch.jit.script def smish(input): """ Applies the mish function element-wise: mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(sigmoid(x)))) See additional documentation for mish class. """ return input * torch.tanh(torch.log(1+torch.sigmoid(input))) ================================================ FILE: annotator/teed/LICENSE.txt ================================================ MIT License Copyright (c) 2022 Xavier Soria Poma Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/teed/Xmish.py ================================================ """ Applies the mish function element-wise: mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x))) """ # import pytorch import torch import torch.nn.functional as F from torch import nn # import activation functions from .Fmish import mish class Mish(nn.Module): """ Applies the mish function element-wise: mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x))) Shape: - Input: (N, *) where * means, any number of additional dimensions - Output: (N, *), same shape as the input Examples: >>> m = Mish() >>> input = torch.randn(2) >>> output = m(input) Reference: https://pytorch.org/docs/stable/generated/torch.nn.Mish.html """ def __init__(self): """ Init method. """ super().__init__() def forward(self, input): """ Forward pass of the function. """ if torch.__version__ >= "1.9": return F.mish(input) else: return mish(input) ================================================ FILE: annotator/teed/Xsmish.py ================================================ """ Script based on: Wang, Xueliang, Honge Ren, and Achuan Wang. "Smish: A Novel Activation Function for Deep Learning Methods. " Electronics 11.4 (2022): 540. smish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + sigmoid(x))) """ # import pytorch import torch import torch.nn.functional as F from torch import nn # import activation functions from .Fsmish import smish class Smish(nn.Module): """ Applies the mish function element-wise: mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x))) Shape: - Input: (N, *) where * means, any number of additional dimensions - Output: (N, *), same shape as the input Examples: >>> m = Mish() >>> input = torch.randn(2) >>> output = m(input) Reference: https://pytorch.org/docs/stable/generated/torch.nn.Mish.html """ def __init__(self): """ Init method. """ super().__init__() def forward(self, input): """ Forward pass of the function. """ return smish(input) ================================================ FILE: annotator/teed/__init__.py ================================================ from __future__ import print_function import os import cv2 import numpy as np import torch from einops import rearrange from modules import devices from modules.safe import unsafe_torch_load from annotator.teed.ted import TED # TEED architecture from annotator.util import load_model, safe_step from annotator.annotator_path import models_path class TEEDDetector: """https://github.com/xavysp/TEED""" model_dir = os.path.join(models_path, "TEED") def __init__(self, mteed: bool = False): self.device = devices.get_device_for("controlnet") self.model = TED().to(self.device).eval() if mteed: self.load_mteed_model() else: self.load_teed_model() def load_teed_model(self): """Load vanilla TEED model""" remote_url = os.environ.get( "CONTROLNET_TEED_MODEL_URL", "https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/resolve/main/Annotators/7_model.pth", ) model_path = load_model( "7_model.pth", remote_url=remote_url, model_dir=self.model_dir ) self.model.load_state_dict(unsafe_torch_load(model_path)) def load_mteed_model(self): """Load MTEED model for Anyline""" remote_url = ( "https://huggingface.co/TheMistoAI/MistoLine/resolve/main/Anyline/MTEED.pth" ) model_path = load_model( "MTEED.pth", remote_url=remote_url, model_dir=self.model_dir ) self.model.load_state_dict(unsafe_torch_load(model_path)) def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, image: np.ndarray, safe_steps: int = 2) -> np.ndarray: self.model.to(self.device) H, W, _ = image.shape with torch.no_grad(): image_teed = torch.from_numpy(image.copy()).float().to(self.device) image_teed = rearrange(image_teed, "h w c -> 1 c h w") edges = self.model(image_teed) edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges] edges = [ cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges ] edges = np.stack(edges, axis=2) edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64))) if safe_steps != 0: edge = safe_step(edge, safe_steps) edge = (edge * 255.0).clip(0, 255).astype(np.uint8) return edge ================================================ FILE: annotator/teed/ted.py ================================================ # TEED: is a Tiny but Efficient Edge Detection, it comes from the LDC-B3 # with a Slightly modification # LDC parameters: # 155665 # TED > 58K import torch import torch.nn as nn import torch.nn.functional as F from .Fsmish import smish as Fsmish from .Xsmish import Smish def weight_init(m): if isinstance(m, (nn.Conv2d,)): torch.nn.init.xavier_normal_(m.weight, gain=1.0) if m.bias is not None: torch.nn.init.zeros_(m.bias) # for fusion layer if isinstance(m, (nn.ConvTranspose2d,)): torch.nn.init.xavier_normal_(m.weight, gain=1.0) if m.bias is not None: torch.nn.init.zeros_(m.bias) class CoFusion(nn.Module): # from LDC def __init__(self, in_ch, out_ch): super(CoFusion, self).__init__() self.conv1 = nn.Conv2d(in_ch, 32, kernel_size=3, stride=1, padding=1) # before 64 self.conv3= nn.Conv2d(32, out_ch, kernel_size=3, stride=1, padding=1)# before 64 instead of 32 self.relu = nn.ReLU() self.norm_layer1 = nn.GroupNorm(4, 32) # before 64 def forward(self, x): # fusecat = torch.cat(x, dim=1) attn = self.relu(self.norm_layer1(self.conv1(x))) attn = F.softmax(self.conv3(attn), dim=1) return ((x * attn).sum(1)).unsqueeze(1) class CoFusion2(nn.Module): # TEDv14-3 def __init__(self, in_ch, out_ch): super(CoFusion2, self).__init__() self.conv1 = nn.Conv2d(in_ch, 32, kernel_size=3, stride=1, padding=1) # before 64 # self.conv2 = nn.Conv2d(32, 32, kernel_size=3, # stride=1, padding=1)# before 64 self.conv3 = nn.Conv2d(32, out_ch, kernel_size=3, stride=1, padding=1)# before 64 instead of 32 self.smish= Smish()#nn.ReLU(inplace=True) def forward(self, x): # fusecat = torch.cat(x, dim=1) attn = self.conv1(self.smish(x)) attn = self.conv3(self.smish(attn)) # before , )dim=1) # return ((fusecat * attn).sum(1)).unsqueeze(1) return ((x * attn).sum(1)).unsqueeze(1) class DoubleFusion(nn.Module): # TED fusion before the final edge map prediction def __init__(self, in_ch, out_ch): super(DoubleFusion, self).__init__() self.DWconv1 = nn.Conv2d(in_ch, in_ch*8, kernel_size=3, stride=1, padding=1, groups=in_ch) # before 64 self.PSconv1 = nn.PixelShuffle(1) self.DWconv2 = nn.Conv2d(24, 24*1, kernel_size=3, stride=1, padding=1,groups=24)# before 64 instead of 32 self.AF= Smish()#XAF() #nn.Tanh()# XAF() # # Smish()# def forward(self, x): # fusecat = torch.cat(x, dim=1) attn = self.PSconv1(self.DWconv1(self.AF(x))) # #TEED best res TEDv14 [8, 32, 352, 352] attn2 = self.PSconv1(self.DWconv2(self.AF(attn))) # #TEED best res TEDv14[8, 3, 352, 352] return Fsmish(((attn2 +attn).sum(1)).unsqueeze(1)) #TED best res class _DenseLayer(nn.Sequential): def __init__(self, input_features, out_features): super(_DenseLayer, self).__init__() self.add_module('conv1', nn.Conv2d(input_features, out_features, kernel_size=3, stride=1, padding=2, bias=True)), self.add_module('smish1', Smish()), self.add_module('conv2', nn.Conv2d(out_features, out_features, kernel_size=3, stride=1, bias=True)) def forward(self, x): x1, x2 = x new_features = super(_DenseLayer, self).forward(Fsmish(x1)) # F.relu() return 0.5 * (new_features + x2), x2 class _DenseBlock(nn.Sequential): def __init__(self, num_layers, input_features, out_features): super(_DenseBlock, self).__init__() for i in range(num_layers): layer = _DenseLayer(input_features, out_features) self.add_module('denselayer%d' % (i + 1), layer) input_features = out_features class UpConvBlock(nn.Module): def __init__(self, in_features, up_scale): super(UpConvBlock, self).__init__() self.up_factor = 2 self.constant_features = 16 layers = self.make_deconv_layers(in_features, up_scale) assert layers is not None, layers self.features = nn.Sequential(*layers) def make_deconv_layers(self, in_features, up_scale): layers = [] all_pads=[0,0,1,3,7] for i in range(up_scale): kernel_size = 2 ** up_scale pad = all_pads[up_scale] # kernel_size-1 out_features = self.compute_out_features(i, up_scale) layers.append(nn.Conv2d(in_features, out_features, 1)) layers.append(Smish()) layers.append(nn.ConvTranspose2d( out_features, out_features, kernel_size, stride=2, padding=pad)) in_features = out_features return layers def compute_out_features(self, idx, up_scale): return 1 if idx == up_scale - 1 else self.constant_features def forward(self, x): return self.features(x) class SingleConvBlock(nn.Module): def __init__(self, in_features, out_features, stride, use_ac=False): super(SingleConvBlock, self).__init__() # self.use_bn = use_bs self.use_ac=use_ac self.conv = nn.Conv2d(in_features, out_features, 1, stride=stride, bias=True) if self.use_ac: self.smish = Smish() def forward(self, x): x = self.conv(x) if self.use_ac: return self.smish(x) else: return x class DoubleConvBlock(nn.Module): def __init__(self, in_features, mid_features, out_features=None, stride=1, use_act=True): super(DoubleConvBlock, self).__init__() self.use_act = use_act if out_features is None: out_features = mid_features self.conv1 = nn.Conv2d(in_features, mid_features, 3, padding=1, stride=stride) self.conv2 = nn.Conv2d(mid_features, out_features, 3, padding=1) self.smish= Smish()#nn.ReLU(inplace=True) def forward(self, x): x = self.conv1(x) x = self.smish(x) x = self.conv2(x) if self.use_act: x = self.smish(x) return x class TED(nn.Module): """ Definition of Tiny and Efficient Edge Detector model """ def __init__(self): super(TED, self).__init__() self.block_1 = DoubleConvBlock(3, 16, 16, stride=2,) self.block_2 = DoubleConvBlock(16, 32, use_act=False) self.dblock_3 = _DenseBlock(1, 32, 48) # [32,48,100,100] before (2, 32, 64) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) # skip1 connection, see fig. 2 self.side_1 = SingleConvBlock(16, 32, 2) # skip2 connection, see fig. 2 self.pre_dense_3 = SingleConvBlock(32, 48, 1) # before (32, 64, 1) # USNet self.up_block_1 = UpConvBlock(16, 1) self.up_block_2 = UpConvBlock(32, 1) self.up_block_3 = UpConvBlock(48, 2) # (32, 64, 1) self.block_cat = DoubleFusion(3,3) # TEED: DoubleFusion self.apply(weight_init) def slice(self, tensor, slice_shape): t_shape = tensor.shape img_h, img_w = slice_shape if img_w!=t_shape[-1] or img_h!=t_shape[2]: new_tensor = F.interpolate( tensor, size=(img_h, img_w), mode='bicubic',align_corners=False) else: new_tensor=tensor # tensor[..., :height, :width] return new_tensor def resize_input(self,tensor): t_shape = tensor.shape if t_shape[2] % 8 != 0 or t_shape[3] % 8 != 0: img_w= ((t_shape[3]// 8) + 1) * 8 img_h = ((t_shape[2] // 8) + 1) * 8 new_tensor = F.interpolate( tensor, size=(img_h, img_w), mode='bicubic', align_corners=False) else: new_tensor = tensor return new_tensor def crop_bdcn(data1, h, w, crop_h, crop_w): # Based on BDCN Implementation @ https://github.com/pkuCactus/BDCN _, _, h1, w1 = data1.size() assert (h <= h1 and w <= w1) data = data1[:, :, crop_h:crop_h + h, crop_w:crop_w + w] return data def forward(self, x, single_test=False): assert x.ndim == 4, x.shape # supose the image size is 352x352 # Block 1 block_1 = self.block_1(x) # [8,16,176,176] block_1_side = self.side_1(block_1) # 16 [8,32,88,88] # Block 2 block_2 = self.block_2(block_1) # 32 # [8,32,176,176] block_2_down = self.maxpool(block_2) # [8,32,88,88] block_2_add = block_2_down + block_1_side # [8,32,88,88] # Block 3 block_3_pre_dense = self.pre_dense_3(block_2_down) # [8,64,88,88] block 3 L connection block_3, _ = self.dblock_3([block_2_add, block_3_pre_dense]) # [8,64,88,88] # upsampling blocks out_1 = self.up_block_1(block_1) out_2 = self.up_block_2(block_2) out_3 = self.up_block_3(block_3) results = [out_1, out_2, out_3] # concatenate multiscale outputs block_cat = torch.cat(results, dim=1) # Bx6xHxW block_cat = self.block_cat(block_cat) # Bx1xHxW DoubleFusion results.append(block_cat) return results if __name__ == '__main__': batch_size = 8 img_height = 352 img_width = 352 # device = "cuda" if torch.cuda.is_available() else "cpu" device = "cpu" input = torch.rand(batch_size, 3, img_height, img_width).to(device) # target = torch.rand(batch_size, 1, img_height, img_width).to(device) print(f"input shape: {input.shape}") model = TED().to(device) output = model(input) print(f"output shapes: {[t.shape for t in output]}") # for i in range(20000): # print(i) # output = model(input) # loss = nn.MSELoss()(output[-1], target) # loss.backward() ================================================ FILE: annotator/uniformer/LICENSE ================================================ Copyright 2022 SenseTime X-Lab. All rights reserved. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2022 SenseTime X-Lab. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: annotator/uniformer/__init__.py ================================================ import os from annotator.annotator_path import models_path from modules import devices from annotator.uniformer.inference import init_segmentor, inference_segmentor, show_result_pyplot try: from mmseg.core.evaluation import get_palette except ImportError: from annotator.mmpkg.mmseg.core.evaluation import get_palette modeldir = os.path.join(models_path, "uniformer") checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth" config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "upernet_global_small.py") old_modeldir = os.path.dirname(os.path.realpath(__file__)) model = None def unload_uniformer_model(): global model if model is not None: model = model.cpu() def apply_uniformer(img): global model if model is None: modelpath = os.path.join(modeldir, "upernet_global_small.pth") old_modelpath = os.path.join(old_modeldir, "upernet_global_small.pth") if os.path.exists(old_modelpath): modelpath = old_modelpath elif not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(checkpoint_file, model_dir=modeldir) model = init_segmentor(config_file, modelpath, device=devices.get_device_for("controlnet")) model = model.to(devices.get_device_for("controlnet")) if devices.get_device_for("controlnet").type == 'mps': # adaptive_avg_pool2d can fail on MPS, workaround with CPU import torch.nn.functional orig_adaptive_avg_pool2d = torch.nn.functional.adaptive_avg_pool2d def cpu_if_exception(input, *args, **kwargs): try: return orig_adaptive_avg_pool2d(input, *args, **kwargs) except: return orig_adaptive_avg_pool2d(input.cpu(), *args, **kwargs).to(input.device) try: torch.nn.functional.adaptive_avg_pool2d = cpu_if_exception result = inference_segmentor(model, img) finally: torch.nn.functional.adaptive_avg_pool2d = orig_adaptive_avg_pool2d else: result = inference_segmentor(model, img) res_img = show_result_pyplot(model, img, result, get_palette('ade'), opacity=1) return res_img ================================================ FILE: annotator/uniformer/configs/_base_/datasets/ade20k.py ================================================ # dataset settings dataset_type = 'ADE20KDataset' data_root = 'data/ade/ADEChallengeData2016' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (512, 512) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 512), # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, img_dir='images/training', ann_dir='annotations/training', pipeline=train_pipeline), val=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/chase_db1.py ================================================ # dataset settings dataset_type = 'ChaseDB1Dataset' data_root = 'data/CHASE_DB1' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) img_scale = (960, 999) crop_size = (128, 128) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=img_scale, # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type='RepeatDataset', times=40000, dataset=dict( type=dataset_type, data_root=data_root, img_dir='images/training', ann_dir='annotations/training', pipeline=train_pipeline)), val=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/cityscapes.py ================================================ # dataset settings dataset_type = 'CityscapesDataset' data_root = 'data/cityscapes/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (512, 1024) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 1024), # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, img_dir='leftImg8bit/train', ann_dir='gtFine/train', pipeline=train_pipeline), val=dict( type=dataset_type, data_root=data_root, img_dir='leftImg8bit/val', ann_dir='gtFine/val', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='leftImg8bit/val', ann_dir='gtFine/val', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/cityscapes_769x769.py ================================================ _base_ = './cityscapes.py' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (769, 769) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2049, 1025), # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( train=dict(pipeline=train_pipeline), val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/drive.py ================================================ # dataset settings dataset_type = 'DRIVEDataset' data_root = 'data/DRIVE' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) img_scale = (584, 565) crop_size = (64, 64) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=img_scale, # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type='RepeatDataset', times=40000, dataset=dict( type=dataset_type, data_root=data_root, img_dir='images/training', ann_dir='annotations/training', pipeline=train_pipeline)), val=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/hrf.py ================================================ # dataset settings dataset_type = 'HRFDataset' data_root = 'data/HRF' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) img_scale = (2336, 3504) crop_size = (256, 256) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=img_scale, # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type='RepeatDataset', times=40000, dataset=dict( type=dataset_type, data_root=data_root, img_dir='images/training', ann_dir='annotations/training', pipeline=train_pipeline)), val=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/pascal_context.py ================================================ # dataset settings dataset_type = 'PascalContextDataset' data_root = 'data/VOCdevkit/VOC2010/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) img_scale = (520, 520) crop_size = (480, 480) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=img_scale, # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClassContext', split='ImageSets/SegmentationContext/train.txt', pipeline=train_pipeline), val=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClassContext', split='ImageSets/SegmentationContext/val.txt', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClassContext', split='ImageSets/SegmentationContext/val.txt', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/pascal_context_59.py ================================================ # dataset settings dataset_type = 'PascalContextDataset59' data_root = 'data/VOCdevkit/VOC2010/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) img_scale = (520, 520) crop_size = (480, 480) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=img_scale, # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClassContext', split='ImageSets/SegmentationContext/train.txt', pipeline=train_pipeline), val=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClassContext', split='ImageSets/SegmentationContext/val.txt', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClassContext', split='ImageSets/SegmentationContext/val.txt', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/pascal_voc12.py ================================================ # dataset settings dataset_type = 'PascalVOCDataset' data_root = 'data/VOCdevkit/VOC2012' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (512, 512) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 512), # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClass', split='ImageSets/Segmentation/train.txt', pipeline=train_pipeline), val=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClass', split='ImageSets/Segmentation/val.txt', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='JPEGImages', ann_dir='SegmentationClass', split='ImageSets/Segmentation/val.txt', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py ================================================ _base_ = './pascal_voc12.py' # dataset settings data = dict( train=dict( ann_dir=['SegmentationClass', 'SegmentationClassAug'], split=[ 'ImageSets/Segmentation/train.txt', 'ImageSets/Segmentation/aug.txt' ])) ================================================ FILE: annotator/uniformer/configs/_base_/datasets/stare.py ================================================ # dataset settings dataset_type = 'STAREDataset' data_root = 'data/STARE' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) img_scale = (605, 700) crop_size = (128, 128) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=img_scale, # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0], flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type='RepeatDataset', times=40000, dataset=dict( type=dataset_type, data_root=data_root, img_dir='images/training', ann_dir='annotations/training', pipeline=train_pipeline)), val=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline), test=dict( type=dataset_type, data_root=data_root, img_dir='images/validation', ann_dir='annotations/validation', pipeline=test_pipeline)) ================================================ FILE: annotator/uniformer/configs/_base_/default_runtime.py ================================================ # yapf:disable log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook', by_epoch=False), # dict(type='TensorboardLoggerHook') ]) # yapf:enable dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] cudnn_benchmark = True ================================================ FILE: annotator/uniformer/configs/_base_/models/ann_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='ANNHead', in_channels=[1024, 2048], in_index=[2, 3], channels=512, project_channels=256, query_scales=(1, ), key_pool_scales=(1, 3, 6, 8), dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='APCHead', in_channels=2048, in_index=3, channels=512, pool_scales=(1, 2, 3, 6), dropout_ratio=0.1, num_classes=19, norm_cfg=dict(type='SyncBN', requires_grad=True), align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='CCHead', in_channels=2048, in_index=3, channels=512, recurrence=2, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/cgnet.py ================================================ # model settings norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) model = dict( type='EncoderDecoder', backbone=dict( type='CGNet', norm_cfg=norm_cfg, in_channels=3, num_channels=(32, 64, 128), num_blocks=(3, 21), dilations=(2, 4), reductions=(8, 16)), decode_head=dict( type='FCNHead', in_channels=256, in_index=2, channels=256, num_convs=0, concat_input=False, dropout_ratio=0, num_classes=19, norm_cfg=norm_cfg, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, class_weight=[ 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, 10.396974, 10.055647 ])), # model training and testing settings train_cfg=dict(sampler=None), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/danet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='DAHead', in_channels=2048, in_index=3, channels=512, pam_channels=64, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='ASPPHead', in_channels=2048, in_index=3, channels=512, dilations=(1, 12, 24, 36), dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained=None, backbone=dict( type='UNet', in_channels=3, base_channels=64, num_stages=5, strides=(1, 1, 1, 1, 1), enc_num_convs=(2, 2, 2, 2, 2), dec_num_convs=(2, 2, 2, 2), downsamples=(True, True, True, True), enc_dilations=(1, 1, 1, 1, 1), dec_dilations=(1, 1, 1, 1), with_cp=False, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=dict(type='ReLU'), upsample_cfg=dict(type='InterpConv'), norm_eval=False), decode_head=dict( type='ASPPHead', in_channels=64, in_index=4, channels=16, dilations=(1, 12, 24, 36), dropout_ratio=0.1, num_classes=2, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=128, in_index=3, channels=64, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=2, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide', crop_size=256, stride=170)) ================================================ FILE: annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='DepthwiseSeparableASPPHead', in_channels=2048, in_index=3, channels=512, dilations=(1, 12, 24, 36), c1_in_channels=256, c1_channels=48, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='DMHead', in_channels=2048, in_index=3, channels=512, filter_sizes=(1, 3, 5, 7), dropout_ratio=0.1, num_classes=19, norm_cfg=dict(type='SyncBN', requires_grad=True), align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/dnl_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='DNLHead', in_channels=2048, in_index=3, channels=512, dropout_ratio=0.1, reduction=2, use_scale=True, mode='embedded_gaussian', num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/emanet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='EMAHead', in_channels=2048, in_index=3, channels=256, ema_channels=512, num_bases=64, num_stages=3, momentum=0.1, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/encnet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='EncHead', in_channels=[512, 1024, 2048], in_index=(1, 2, 3), channels=512, num_codes=32, use_se_loss=True, add_lateral=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_se_decode=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/fast_scnn.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01) model = dict( type='EncoderDecoder', backbone=dict( type='FastSCNN', downsample_dw_channels=(32, 48), global_in_channels=64, global_block_channels=(64, 96, 128), global_block_strides=(2, 2, 1), global_out_channels=128, higher_in_channels=64, lower_in_channels=128, fusion_out_channels=128, out_indices=(0, 1, 2), norm_cfg=norm_cfg, align_corners=False), decode_head=dict( type='DepthwiseSeparableFCNHead', in_channels=128, channels=128, concat_input=False, num_classes=19, in_index=-1, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), auxiliary_head=[ dict( type='FCNHead', in_channels=128, channels=32, num_convs=1, num_classes=19, in_index=-2, norm_cfg=norm_cfg, concat_input=False, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), dict( type='FCNHead', in_channels=64, channels=32, num_convs=1, num_classes=19, in_index=-3, norm_cfg=norm_cfg, concat_input=False, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), ], # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/fcn_hr18.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://msra/hrnetv2_w18', backbone=dict( type='HRNet', norm_cfg=norm_cfg, norm_eval=False, extra=dict( stage1=dict( num_modules=1, num_branches=1, block='BOTTLENECK', num_blocks=(4, ), num_channels=(64, )), stage2=dict( num_modules=1, num_branches=2, block='BASIC', num_blocks=(4, 4), num_channels=(18, 36)), stage3=dict( num_modules=4, num_branches=3, block='BASIC', num_blocks=(4, 4, 4), num_channels=(18, 36, 72)), stage4=dict( num_modules=3, num_branches=4, block='BASIC', num_blocks=(4, 4, 4, 4), num_channels=(18, 36, 72, 144)))), decode_head=dict( type='FCNHead', in_channels=[18, 36, 72, 144], in_index=(0, 1, 2, 3), channels=sum([18, 36, 72, 144]), input_transform='resize_concat', kernel_size=1, num_convs=1, concat_input=False, dropout_ratio=-1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/fcn_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='FCNHead', in_channels=2048, in_index=3, channels=512, num_convs=2, concat_input=True, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained=None, backbone=dict( type='UNet', in_channels=3, base_channels=64, num_stages=5, strides=(1, 1, 1, 1, 1), enc_num_convs=(2, 2, 2, 2, 2), dec_num_convs=(2, 2, 2, 2), downsamples=(True, True, True, True), enc_dilations=(1, 1, 1, 1, 1), dec_dilations=(1, 1, 1, 1), with_cp=False, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=dict(type='ReLU'), upsample_cfg=dict(type='InterpConv'), norm_eval=False), decode_head=dict( type='FCNHead', in_channels=64, in_index=4, channels=64, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=2, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=128, in_index=3, channels=64, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=2, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide', crop_size=256, stride=170)) ================================================ FILE: annotator/uniformer/configs/_base_/models/fpn_r50.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 1, 1), strides=(1, 2, 2, 2), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=4), decode_head=dict( type='FPNHead', in_channels=[256, 256, 256, 256], in_index=[0, 1, 2, 3], feature_strides=[4, 8, 16, 32], channels=128, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/fpn_uniformer.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', backbone=dict( type='UniFormer', embed_dim=[64, 128, 320, 512], layers=[3, 4, 8, 3], head_dim=64, mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1), neck=dict( type='FPN', in_channels=[64, 128, 320, 512], out_channels=256, num_outs=4), decode_head=dict( type='FPNHead', in_channels=[256, 256, 256, 256], in_index=[0, 1, 2, 3], feature_strides=[4, 8, 16, 32], channels=128, dropout_ratio=0.1, num_classes=150, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole') ) ================================================ FILE: annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='GCHead', in_channels=2048, in_index=3, channels=512, ratio=1 / 4., pooling_type='att', fusion_types=('channel_add', ), dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) model = dict( type='EncoderDecoder', backbone=dict( type='MobileNetV3', arch='large', out_indices=(1, 3, 16), norm_cfg=norm_cfg), decode_head=dict( type='LRASPPHead', in_channels=(16, 24, 960), in_index=(0, 1, 2), channels=128, input_transform='multiple_select', dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, act_cfg=dict(type='ReLU'), align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='NLHead', in_channels=2048, in_index=3, channels=512, dropout_ratio=0.1, reduction=2, use_scale=True, mode='embedded_gaussian', num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/ocrnet_hr18.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='CascadeEncoderDecoder', num_stages=2, pretrained='open-mmlab://msra/hrnetv2_w18', backbone=dict( type='HRNet', norm_cfg=norm_cfg, norm_eval=False, extra=dict( stage1=dict( num_modules=1, num_branches=1, block='BOTTLENECK', num_blocks=(4, ), num_channels=(64, )), stage2=dict( num_modules=1, num_branches=2, block='BASIC', num_blocks=(4, 4), num_channels=(18, 36)), stage3=dict( num_modules=4, num_branches=3, block='BASIC', num_blocks=(4, 4, 4), num_channels=(18, 36, 72)), stage4=dict( num_modules=3, num_branches=4, block='BASIC', num_blocks=(4, 4, 4, 4), num_channels=(18, 36, 72, 144)))), decode_head=[ dict( type='FCNHead', in_channels=[18, 36, 72, 144], channels=sum([18, 36, 72, 144]), in_index=(0, 1, 2, 3), input_transform='resize_concat', kernel_size=1, num_convs=1, concat_input=False, dropout_ratio=-1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), dict( type='OCRHead', in_channels=[18, 36, 72, 144], in_index=(0, 1, 2, 3), input_transform='resize_concat', channels=512, ocr_channels=256, dropout_ratio=-1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), ], # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='CascadeEncoderDecoder', num_stages=2, pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=[ dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), dict( type='OCRHead', in_channels=2048, in_index=3, channels=512, ocr_channels=256, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) ], # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/pointrend_r50.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='CascadeEncoderDecoder', num_stages=2, pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 1, 1), strides=(1, 2, 2, 2), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=4), decode_head=[ dict( type='FPNHead', in_channels=[256, 256, 256, 256], in_index=[0, 1, 2, 3], feature_strides=[4, 8, 16, 32], channels=128, dropout_ratio=-1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), dict( type='PointHead', in_channels=[256], in_index=[0], channels=256, num_fcs=3, coarse_pred_each_layer=True, dropout_ratio=-1, num_classes=19, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) ], # model training and testing settings train_cfg=dict( num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75), test_cfg=dict( mode='whole', subdivision_steps=2, subdivision_num_points=8196, scale_factor=2)) ================================================ FILE: annotator/uniformer/configs/_base_/models/psanet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='PSAHead', in_channels=2048, in_index=3, channels=512, mask_size=(97, 97), psa_type='bi-direction', compact=False, shrink_factor=2, normalization_factor=1.0, psa_softmax=True, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 2, 4), strides=(1, 2, 1, 1), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='PSPHead', in_channels=2048, in_index=3, channels=512, pool_scales=(1, 2, 3, 6), dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained=None, backbone=dict( type='UNet', in_channels=3, base_channels=64, num_stages=5, strides=(1, 1, 1, 1, 1), enc_num_convs=(2, 2, 2, 2, 2), dec_num_convs=(2, 2, 2, 2), downsamples=(True, True, True, True), enc_dilations=(1, 1, 1, 1, 1), dec_dilations=(1, 1, 1, 1), with_cp=False, conv_cfg=None, norm_cfg=norm_cfg, act_cfg=dict(type='ReLU'), upsample_cfg=dict(type='InterpConv'), norm_eval=False), decode_head=dict( type='PSPHead', in_channels=64, in_index=4, channels=16, pool_scales=(1, 2, 3, 6), dropout_ratio=0.1, num_classes=2, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=128, in_index=3, channels=64, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=2, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide', crop_size=256, stride=170)) ================================================ FILE: annotator/uniformer/configs/_base_/models/upernet_r50.py ================================================ # model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='open-mmlab://resnet50_v1c', backbone=dict( type='ResNetV1c', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), dilations=(1, 1, 1, 1), strides=(1, 2, 2, 2), norm_cfg=norm_cfg, norm_eval=False, style='pytorch', contract_dilation=True), decode_head=dict( type='UPerHead', in_channels=[256, 512, 1024, 2048], in_index=[0, 1, 2, 3], pool_scales=(1, 2, 3, 6), channels=512, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=1024, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/models/upernet_uniformer.py ================================================ # model settings norm_cfg = dict(type='BN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained=None, backbone=dict( type='UniFormer', embed_dim=[64, 128, 320, 512], layers=[3, 4, 8, 3], head_dim=64, mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1), decode_head=dict( type='UPerHead', in_channels=[64, 128, 320, 512], in_index=[0, 1, 2, 3], pool_scales=(1, 2, 3, 6), channels=512, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=320, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) ================================================ FILE: annotator/uniformer/configs/_base_/schedules/schedule_160k.py ================================================ # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) optimizer_config = dict() # learning policy lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) # runtime settings runner = dict(type='IterBasedRunner', max_iters=160000) checkpoint_config = dict(by_epoch=False, interval=16000) evaluation = dict(interval=16000, metric='mIoU') ================================================ FILE: annotator/uniformer/configs/_base_/schedules/schedule_20k.py ================================================ # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) optimizer_config = dict() # learning policy lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) # runtime settings runner = dict(type='IterBasedRunner', max_iters=20000) checkpoint_config = dict(by_epoch=False, interval=2000) evaluation = dict(interval=2000, metric='mIoU') ================================================ FILE: annotator/uniformer/configs/_base_/schedules/schedule_40k.py ================================================ # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) optimizer_config = dict() # learning policy lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) # runtime settings runner = dict(type='IterBasedRunner', max_iters=40000) checkpoint_config = dict(by_epoch=False, interval=4000) evaluation = dict(interval=4000, metric='mIoU') ================================================ FILE: annotator/uniformer/configs/_base_/schedules/schedule_80k.py ================================================ # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) optimizer_config = dict() # learning policy lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) # runtime settings runner = dict(type='IterBasedRunner', max_iters=80000) checkpoint_config = dict(by_epoch=False, interval=8000) evaluation = dict(interval=8000, metric='mIoU') ================================================ FILE: annotator/uniformer/inference.py ================================================ import torch try: import mmcv as mmcv from mmcv.parallel import collate, scatter from mmcv.runner import load_checkpoint from mmseg.datasets.pipelines import Compose from mmseg.models import build_segmentor except ImportError: import annotator.mmpkg.mmcv as mmcv from annotator.mmpkg.mmcv.parallel import collate, scatter from annotator.mmpkg.mmcv.runner import load_checkpoint from annotator.mmpkg.mmseg.datasets.pipelines import Compose from annotator.mmpkg.mmseg.models import build_segmentor def init_segmentor(config, checkpoint=None, device='cuda:0'): """Initialize a segmentor from config file. Args: config (str or :obj:`mmcv.Config`): Config file path or the config object. checkpoint (str, optional): Checkpoint path. If left as None, the model will not load any weights. device (str, optional) CPU/CUDA device option. Default 'cuda:0'. Use 'cpu' for loading model on CPU. Returns: nn.Module: The constructed segmentor. """ if isinstance(config, str): config = mmcv.Config.fromfile(config) elif not isinstance(config, mmcv.Config): raise TypeError('config must be a filename or Config object, ' 'but got {}'.format(type(config))) config.model.pretrained = None config.model.train_cfg = None model = build_segmentor(config.model, test_cfg=config.get('test_cfg')) if checkpoint is not None: checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') model.CLASSES = checkpoint['meta']['CLASSES'] model.PALETTE = checkpoint['meta']['PALETTE'] model.cfg = config # save the config in the model for convenience model.to(device) model.eval() return model class LoadImage: """A simple pipeline to load image.""" def __call__(self, results): """Call function to load images into results. Args: results (dict): A result dict contains the file name of the image to be read. Returns: dict: ``results`` will be returned containing loaded image. """ if isinstance(results['img'], str): results['filename'] = results['img'] results['ori_filename'] = results['img'] else: results['filename'] = None results['ori_filename'] = None img = mmcv.imread(results['img']) results['img'] = img results['img_shape'] = img.shape results['ori_shape'] = img.shape return results def inference_segmentor(model, img): """Inference image(s) with the segmentor. Args: model (nn.Module): The loaded segmentor. imgs (str/ndarray or list[str/ndarray]): Either image files or loaded images. Returns: (list[Tensor]): The segmentation result. """ cfg = model.cfg device = next(model.parameters()).device # model device # build the data pipeline test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:] test_pipeline = Compose(test_pipeline) # prepare data data = dict(img=img) data = test_pipeline(data) data = collate([data], samples_per_gpu=1) if next(model.parameters()).is_cuda: # scatter to specified GPU data = scatter(data, [device])[0] else: data['img_metas'] = [i.data[0] for i in data['img_metas']] data['img'] = [x.to(device) for x in data['img']] # forward the model with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) return result def show_result_pyplot(model, img, result, palette=None, fig_size=(15, 10), opacity=0.5, title='', block=True): """Visualize the segmentation results on the image. Args: model (nn.Module): The loaded segmentor. img (str or np.ndarray): Image filename or loaded image. result (list): The segmentation result. palette (list[list[int]]] | None): The palette of segmentation map. If None is given, random palette will be generated. Default: None fig_size (tuple): Figure size of the pyplot figure. opacity(float): Opacity of painted segmentation map. Default 0.5. Must be in (0, 1] range. title (str): The title of pyplot figure. Default is ''. block (bool): Whether to block the pyplot figure. Default is True. """ if hasattr(model, 'module'): model = model.module img = model.show_result( img, result, palette=palette, show=False, opacity=opacity) # plt.figure(figsize=fig_size) # plt.imshow(mmcv.bgr2rgb(img)) # plt.title(title) # plt.tight_layout() # plt.show(block=block) return mmcv.bgr2rgb(img) ================================================ FILE: annotator/uniformer/mmcv_custom/__init__.py ================================================ # -*- coding: utf-8 -*- from .checkpoint import load_checkpoint __all__ = ['load_checkpoint'] ================================================ FILE: annotator/uniformer/mmcv_custom/checkpoint.py ================================================ # Copyright (c) Open-MMLab. All rights reserved. import io import os import os.path as osp import pkgutil import time import warnings from collections import OrderedDict from importlib import import_module from tempfile import TemporaryDirectory import torch import torchvision from torch.optim import Optimizer from torch.utils import model_zoo from torch.nn import functional as F try: import mmcv as mmcv from mmcv.fileio import FileClient from mmcv.fileio import load as load_file from mmcv.parallel import is_module_wrapper from mmcv.utils import mkdir_or_exist from mmcv.runner import get_dist_info except ImportError: import annotator.mmpkg.mmcv as mmcv from annotator.mmpkg.mmcv.fileio import FileClient from annotator.mmpkg.mmcv.fileio import load as load_file from annotator.mmpkg.mmcv.parallel import is_module_wrapper from annotator.mmpkg.mmcv.utils import mkdir_or_exist from annotator.mmpkg.mmcv.runner import get_dist_info ENV_MMCV_HOME = 'MMCV_HOME' ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' DEFAULT_CACHE_DIR = '~/.cache' def _get_mmcv_home(): mmcv_home = os.path.expanduser( os.getenv( ENV_MMCV_HOME, os.path.join( os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) mkdir_or_exist(mmcv_home) return mmcv_home def load_state_dict(module, state_dict, strict=False, logger=None): """Load state_dict to a module. This method is modified from :meth:`torch.nn.Module.load_state_dict`. Default value for ``strict`` is set to ``False`` and the message for param mismatch will be shown even if strict is False. Args: module (Module): Module that receives the state_dict. state_dict (OrderedDict): Weights. strict (bool): whether to strictly enforce that the keys in :attr:`state_dict` match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. logger (:obj:`logging.Logger`, optional): Logger to log the error message. If not specified, print function will be used. """ unexpected_keys = [] all_missing_keys = [] err_msg = [] metadata = getattr(state_dict, '_metadata', None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata # use _load_from_state_dict to enable checkpoint version control def load(module, prefix=''): # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) if is_module_wrapper(module): module = module.module local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, all_missing_keys, unexpected_keys, err_msg) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') load(module) load = None # break load->load reference cycle # ignore "num_batches_tracked" of BN layers missing_keys = [ key for key in all_missing_keys if 'num_batches_tracked' not in key ] if unexpected_keys: err_msg.append('unexpected key in source ' f'state_dict: {", ".join(unexpected_keys)}\n') if missing_keys: err_msg.append( f'missing keys in source state_dict: {", ".join(missing_keys)}\n') rank, _ = get_dist_info() if len(err_msg) > 0 and rank == 0: err_msg.insert( 0, 'The model and loaded state dict do not match exactly\n') err_msg = '\n'.join(err_msg) if strict: raise RuntimeError(err_msg) elif logger is not None: logger.warning(err_msg) else: print(err_msg) def load_url_dist(url, model_dir=None): """In distributed setting, this function only download checkpoint at local rank 0.""" rank, world_size = get_dist_info() rank = int(os.environ.get('LOCAL_RANK', rank)) if rank == 0: checkpoint = model_zoo.load_url(url, model_dir=model_dir) if world_size > 1: torch.distributed.barrier() if rank > 0: checkpoint = model_zoo.load_url(url, model_dir=model_dir) return checkpoint def load_pavimodel_dist(model_path, map_location=None): """In distributed setting, this function only download checkpoint at local rank 0.""" try: from pavi import modelcloud except ImportError: raise ImportError( 'Please install pavi to load checkpoint from modelcloud.') rank, world_size = get_dist_info() rank = int(os.environ.get('LOCAL_RANK', rank)) if rank == 0: model = modelcloud.get(model_path) with TemporaryDirectory() as tmp_dir: downloaded_file = osp.join(tmp_dir, model.name) model.download(downloaded_file) checkpoint = torch.load(downloaded_file, map_location=map_location) if world_size > 1: torch.distributed.barrier() if rank > 0: model = modelcloud.get(model_path) with TemporaryDirectory() as tmp_dir: downloaded_file = osp.join(tmp_dir, model.name) model.download(downloaded_file) checkpoint = torch.load( downloaded_file, map_location=map_location) return checkpoint def load_fileclient_dist(filename, backend, map_location): """In distributed setting, this function only download checkpoint at local rank 0.""" rank, world_size = get_dist_info() rank = int(os.environ.get('LOCAL_RANK', rank)) allowed_backends = ['ceph'] if backend not in allowed_backends: raise ValueError(f'Load from Backend {backend} is not supported.') if rank == 0: fileclient = FileClient(backend=backend) buffer = io.BytesIO(fileclient.get(filename)) checkpoint = torch.load(buffer, map_location=map_location) if world_size > 1: torch.distributed.barrier() if rank > 0: fileclient = FileClient(backend=backend) buffer = io.BytesIO(fileclient.get(filename)) checkpoint = torch.load(buffer, map_location=map_location) return checkpoint def get_torchvision_models(): model_urls = dict() for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): if ispkg: continue _zoo = import_module(f'torchvision.models.{name}') if hasattr(_zoo, 'model_urls'): _urls = getattr(_zoo, 'model_urls') model_urls.update(_urls) return model_urls def get_external_models(): mmcv_home = _get_mmcv_home() default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') default_urls = load_file(default_json_path) assert isinstance(default_urls, dict) external_json_path = osp.join(mmcv_home, 'open_mmlab.json') if osp.exists(external_json_path): external_urls = load_file(external_json_path) assert isinstance(external_urls, dict) default_urls.update(external_urls) return default_urls def get_mmcls_models(): mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') mmcls_urls = load_file(mmcls_json_path) return mmcls_urls def get_deprecated_model_names(): deprecate_json_path = osp.join(mmcv.__path__[0], 'model_zoo/deprecated.json') deprecate_urls = load_file(deprecate_json_path) assert isinstance(deprecate_urls, dict) return deprecate_urls def _process_mmcls_checkpoint(checkpoint): state_dict = checkpoint['state_dict'] new_state_dict = OrderedDict() for k, v in state_dict.items(): if k.startswith('backbone.'): new_state_dict[k[9:]] = v new_checkpoint = dict(state_dict=new_state_dict) return new_checkpoint def _load_checkpoint(filename, map_location=None): """Load checkpoint from somewhere (modelzoo, file, url). Args: filename (str): Accept local filepath, URL, ``torchvision://xxx``, ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for details. map_location (str | None): Same as :func:`torch.load`. Default: None. Returns: dict | OrderedDict: The loaded checkpoint. It can be either an OrderedDict storing model weights or a dict containing other information, which depends on the checkpoint. """ if filename.startswith('modelzoo://'): warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' 'use "torchvision://" instead') model_urls = get_torchvision_models() model_name = filename[11:] checkpoint = load_url_dist(model_urls[model_name]) elif filename.startswith('torchvision://'): model_urls = get_torchvision_models() model_name = filename[14:] checkpoint = load_url_dist(model_urls[model_name]) elif filename.startswith('open-mmlab://'): model_urls = get_external_models() model_name = filename[13:] deprecated_urls = get_deprecated_model_names() if model_name in deprecated_urls: warnings.warn(f'open-mmlab://{model_name} is deprecated in favor ' f'of open-mmlab://{deprecated_urls[model_name]}') model_name = deprecated_urls[model_name] model_url = model_urls[model_name] # check if is url if model_url.startswith(('http://', 'https://')): checkpoint = load_url_dist(model_url) else: filename = osp.join(_get_mmcv_home(), model_url) if not osp.isfile(filename): raise IOError(f'{filename} is not a checkpoint file') checkpoint = torch.load(filename, map_location=map_location) elif filename.startswith('mmcls://'): model_urls = get_mmcls_models() model_name = filename[8:] checkpoint = load_url_dist(model_urls[model_name]) checkpoint = _process_mmcls_checkpoint(checkpoint) elif filename.startswith(('http://', 'https://')): checkpoint = load_url_dist(filename) elif filename.startswith('pavi://'): model_path = filename[7:] checkpoint = load_pavimodel_dist(model_path, map_location=map_location) elif filename.startswith('s3://'): checkpoint = load_fileclient_dist( filename, backend='ceph', map_location=map_location) else: if not osp.isfile(filename): raise IOError(f'{filename} is not a checkpoint file') checkpoint = torch.load(filename, map_location=map_location) return checkpoint def load_checkpoint(model, filename, map_location='cpu', strict=False, logger=None): """Load checkpoint from a file or URI. Args: model (Module): Module to load checkpoint. filename (str): Accept local filepath, URL, ``torchvision://xxx``, ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for details. map_location (str): Same as :func:`torch.load`. strict (bool): Whether to allow different params for the model and checkpoint. logger (:mod:`logging.Logger` or None): The logger for error message. Returns: dict or OrderedDict: The loaded checkpoint. """ checkpoint = _load_checkpoint(filename, map_location) # OrderedDict is a subclass of dict if not isinstance(checkpoint, dict): raise RuntimeError( f'No state_dict found in checkpoint file {filename}') # get state_dict from checkpoint if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] elif 'model' in checkpoint: state_dict = checkpoint['model'] else: state_dict = checkpoint # strip prefix of state_dict if list(state_dict.keys())[0].startswith('module.'): state_dict = {k[7:]: v for k, v in state_dict.items()} # for MoBY, load model of online branch if sorted(list(state_dict.keys()))[0].startswith('encoder'): state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')} # reshape absolute position embedding if state_dict.get('absolute_pos_embed') is not None: absolute_pos_embed = state_dict['absolute_pos_embed'] N1, L, C1 = absolute_pos_embed.size() N2, C2, H, W = model.absolute_pos_embed.size() if N1 != N2 or C1 != C2 or L != H*W: logger.warning("Error in loading absolute_pos_embed, pass") else: state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2) # interpolate position bias table if needed relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k] for table_key in relative_position_bias_table_keys: table_pretrained = state_dict[table_key] table_current = model.state_dict()[table_key] L1, nH1 = table_pretrained.size() L2, nH2 = table_current.size() if nH1 != nH2: logger.warning(f"Error in loading {table_key}, pass") else: if L1 != L2: S1 = int(L1 ** 0.5) S2 = int(L2 ** 0.5) table_pretrained_resized = F.interpolate( table_pretrained.permute(1, 0).view(1, nH1, S1, S1), size=(S2, S2), mode='bicubic') state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0) # load state_dict load_state_dict(model, state_dict, strict, logger) return checkpoint def weights_to_cpu(state_dict): """Copy a model state_dict to cpu. Args: state_dict (OrderedDict): Model weights on GPU. Returns: OrderedDict: Model weights on GPU. """ state_dict_cpu = OrderedDict() for key, val in state_dict.items(): state_dict_cpu[key] = val.cpu() return state_dict_cpu def _save_to_state_dict(module, destination, prefix, keep_vars): """Saves module state to `destination` dictionary. This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. Args: module (nn.Module): The module to generate state_dict. destination (dict): A dict where state will be stored. prefix (str): The prefix for parameters and buffers used in this module. """ for name, param in module._parameters.items(): if param is not None: destination[prefix + name] = param if keep_vars else param.detach() for name, buf in module._buffers.items(): # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d if buf is not None: destination[prefix + name] = buf if keep_vars else buf.detach() def get_state_dict(module, destination=None, prefix='', keep_vars=False): """Returns a dictionary containing a whole state of the module. Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names. This method is modified from :meth:`torch.nn.Module.state_dict` to recursively check parallel module in case that the model has a complicated structure, e.g., nn.Module(nn.Module(DDP)). Args: module (nn.Module): The module to generate state_dict. destination (OrderedDict): Returned dict for the state of the module. prefix (str): Prefix of the key. keep_vars (bool): Whether to keep the variable property of the parameters. Default: False. Returns: dict: A dictionary containing a whole state of the module. """ # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) if is_module_wrapper(module): module = module.module # below is the same as torch.nn.Module.state_dict() if destination is None: destination = OrderedDict() destination._metadata = OrderedDict() destination._metadata[prefix[:-1]] = local_metadata = dict( version=module._version) _save_to_state_dict(module, destination, prefix, keep_vars) for name, child in module._modules.items(): if child is not None: get_state_dict( child, destination, prefix + name + '.', keep_vars=keep_vars) for hook in module._state_dict_hooks.values(): hook_result = hook(module, destination, prefix, local_metadata) if hook_result is not None: destination = hook_result return destination def save_checkpoint(model, filename, optimizer=None, meta=None): """Save checkpoint to file. The checkpoint will have 3 fields: ``meta``, ``state_dict`` and ``optimizer``. By default ``meta`` will contain version and time info. Args: model (Module): Module whose params are to be saved. filename (str): Checkpoint filename. optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. meta (dict, optional): Metadata to be saved in checkpoint. """ if meta is None: meta = {} elif not isinstance(meta, dict): raise TypeError(f'meta must be a dict or None, but got {type(meta)}') meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) if is_module_wrapper(model): model = model.module if hasattr(model, 'CLASSES') and model.CLASSES is not None: # save class name to the meta meta.update(CLASSES=model.CLASSES) checkpoint = { 'meta': meta, 'state_dict': weights_to_cpu(get_state_dict(model)) } # save optimizer state dict in the checkpoint if isinstance(optimizer, Optimizer): checkpoint['optimizer'] = optimizer.state_dict() elif isinstance(optimizer, dict): checkpoint['optimizer'] = {} for name, optim in optimizer.items(): checkpoint['optimizer'][name] = optim.state_dict() if filename.startswith('pavi://'): try: from pavi import modelcloud from pavi.exception import NodeNotFoundError except ImportError: raise ImportError( 'Please install pavi to load checkpoint from modelcloud.') model_path = filename[7:] root = modelcloud.Folder() model_dir, model_name = osp.split(model_path) try: model = modelcloud.get(model_dir) except NodeNotFoundError: model = root.create_training_model(model_dir) with TemporaryDirectory() as tmp_dir: checkpoint_file = osp.join(tmp_dir, model_name) with open(checkpoint_file, 'wb') as f: torch.save(checkpoint, f) f.flush() model.create_file(checkpoint_file, name=model_name) else: mmcv.mkdir_or_exist(osp.dirname(filename)) # immediately flush buffer with open(filename, 'wb') as f: torch.save(checkpoint, f) f.flush() ================================================ FILE: annotator/uniformer/uniformer.py ================================================ # -------------------------------------------------------- # UniFormer # Copyright (c) 2022 SenseTime X-Lab # Licensed under The MIT License [see LICENSE for details] # Written by Kunchang Li # -------------------------------------------------------- import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint from functools import partial from collections import OrderedDict from timm.models.layers import DropPath, to_2tuple, trunc_normal_ try: from mmseg.utils import get_root_logger from mmseg.models.builder import BACKBONES except ImportError: from annotator.mmpkg.mmseg.utils import get_root_logger from annotator.mmpkg.mmseg.models.builder import BACKBONES from annotator.uniformer.mmcv_custom import load_checkpoint class Mlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x class CMlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Conv2d(in_features, hidden_features, 1) self.act = act_layer() self.fc2 = nn.Conv2d(hidden_features, out_features, 1) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x class CBlock(nn.Module): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) self.norm1 = nn.BatchNorm2d(dim) self.conv1 = nn.Conv2d(dim, dim, 1) self.conv2 = nn.Conv2d(dim, dim, 1) self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() self.norm2 = nn.BatchNorm2d(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) def forward(self, x): x = x + self.pos_embed(x) x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x))))) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class Attention(nn.Module): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights self.scale = qk_scale or head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x): B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) attn = (q @ k.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, C) x = self.proj(x) x = self.proj_drop(x) return x class SABlock(nn.Module): def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) self.norm1 = norm_layer(dim) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) def forward(self, x): x = x + self.pos_embed(x) B, N, H, W = x.shape x = x.flatten(2).transpose(1, 2) x = x + self.drop_path(self.attn(self.norm1(x))) x = x + self.drop_path(self.mlp(self.norm2(x))) x = x.transpose(1, 2).reshape(B, N, H, W) return x def window_partition(x, window_size): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows def window_reverse(windows, window_size, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x class SABlock_Windows(nn.Module): def __init__(self, dim, num_heads, window_size=14, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): super().__init__() self.window_size=window_size self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim) self.norm1 = norm_layer(dim) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) def forward(self, x): x = x + self.pos_embed(x) x = x.permute(0, 2, 3, 1) B, H, W, C = x.shape shortcut = x x = self.norm1(x) pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) _, Hp, Wp, _ = x.shape x_windows = window_partition(x, self.window_size) # nW*B, window_size, window_size, C x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :].contiguous() x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) x = x.permute(0, 3, 1, 2).reshape(B, C, H, W) return x class PatchEmbed(nn.Module): """ Image to Patch Embedding """ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): super().__init__() img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches self.norm = nn.LayerNorm(embed_dim) self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): B, _, H, W = x.shape x = self.proj(x) B, _, H, W = x.shape x = x.flatten(2).transpose(1, 2) x = self.norm(x) x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() return x @BACKBONES.register_module() class UniFormer(nn.Module): """ Vision Transformer A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 """ def __init__(self, layers=[3, 4, 8, 3], img_size=224, in_chans=3, num_classes=80, embed_dim=[64, 128, 320, 512], head_dim=64, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6), pretrained_path=None, use_checkpoint=False, checkpoint_num=[0, 0, 0, 0], windows=False, hybrid=False, window_size=14): """ Args: layer (list): number of block in each layer img_size (int, tuple): input image size in_chans (int): number of input channels num_classes (int): number of classes for classification head embed_dim (int): embedding dimension head_dim (int): dimension of attention heads mlp_ratio (int): ratio of mlp hidden dim to embedding dim qkv_bias (bool): enable bias for qkv if True qk_scale (float): override default qk scale of head_dim ** -0.5 if set representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set drop_rate (float): dropout rate attn_drop_rate (float): attention dropout rate drop_path_rate (float): stochastic depth rate norm_layer (nn.Module): normalization layer pretrained_path (str): path of pretrained model use_checkpoint (bool): whether use checkpoint checkpoint_num (list): index for using checkpoint in every stage windows (bool): whether use window MHRA hybrid (bool): whether use hybrid MHRA window_size (int): size of window (>14) """ super().__init__() self.num_classes = num_classes self.use_checkpoint = use_checkpoint self.checkpoint_num = checkpoint_num self.windows = windows print(f'Use Checkpoint: {self.use_checkpoint}') print(f'Checkpoint Number: {self.checkpoint_num}') self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) self.patch_embed1 = PatchEmbed( img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0]) self.patch_embed2 = PatchEmbed( img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1]) self.patch_embed3 = PatchEmbed( img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2]) self.patch_embed4 = PatchEmbed( img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3]) self.pos_drop = nn.Dropout(p=drop_rate) dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(layers))] # stochastic depth decay rule num_heads = [dim // head_dim for dim in embed_dim] self.blocks1 = nn.ModuleList([ CBlock( dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) for i in range(layers[0])]) self.norm1=norm_layer(embed_dim[0]) self.blocks2 = nn.ModuleList([ CBlock( dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]], norm_layer=norm_layer) for i in range(layers[1])]) self.norm2 = norm_layer(embed_dim[1]) if self.windows: print('Use local window for all blocks in stage3') self.blocks3 = nn.ModuleList([ SABlock_Windows( dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer) for i in range(layers[2])]) elif hybrid: print('Use hybrid window for blocks in stage3') block3 = [] for i in range(layers[2]): if (i + 1) % 4 == 0: block3.append(SABlock( dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)) else: block3.append(SABlock_Windows( dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)) self.blocks3 = nn.ModuleList(block3) else: print('Use global window for all blocks in stage3') self.blocks3 = nn.ModuleList([ SABlock( dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer) for i in range(layers[2])]) self.norm3 = norm_layer(embed_dim[2]) self.blocks4 = nn.ModuleList([ SABlock( dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]+layers[2]], norm_layer=norm_layer) for i in range(layers[3])]) self.norm4 = norm_layer(embed_dim[3]) # Representation layer if representation_size: self.num_features = representation_size self.pre_logits = nn.Sequential(OrderedDict([ ('fc', nn.Linear(embed_dim, representation_size)), ('act', nn.Tanh()) ])) else: self.pre_logits = nn.Identity() self.apply(self._init_weights) self.init_weights(pretrained=pretrained_path) def init_weights(self, pretrained): if isinstance(pretrained, str): logger = get_root_logger() load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger) print(f'Load pretrained model from {pretrained}') def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) @torch.jit.ignore def no_weight_decay(self): return {'pos_embed', 'cls_token'} def get_classifier(self): return self.head def reset_classifier(self, num_classes, global_pool=''): self.num_classes = num_classes self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() def forward_features(self, x): out = [] x = self.patch_embed1(x) x = self.pos_drop(x) for i, blk in enumerate(self.blocks1): if self.use_checkpoint and i < self.checkpoint_num[0]: x = checkpoint.checkpoint(blk, x) else: x = blk(x) x_out = self.norm1(x.permute(0, 2, 3, 1)) out.append(x_out.permute(0, 3, 1, 2).contiguous()) x = self.patch_embed2(x) for i, blk in enumerate(self.blocks2): if self.use_checkpoint and i < self.checkpoint_num[1]: x = checkpoint.checkpoint(blk, x) else: x = blk(x) x_out = self.norm2(x.permute(0, 2, 3, 1)) out.append(x_out.permute(0, 3, 1, 2).contiguous()) x = self.patch_embed3(x) for i, blk in enumerate(self.blocks3): if self.use_checkpoint and i < self.checkpoint_num[2]: x = checkpoint.checkpoint(blk, x) else: x = blk(x) x_out = self.norm3(x.permute(0, 2, 3, 1)) out.append(x_out.permute(0, 3, 1, 2).contiguous()) x = self.patch_embed4(x) for i, blk in enumerate(self.blocks4): if self.use_checkpoint and i < self.checkpoint_num[3]: x = checkpoint.checkpoint(blk, x) else: x = blk(x) x_out = self.norm4(x.permute(0, 2, 3, 1)) out.append(x_out.permute(0, 3, 1, 2).contiguous()) return tuple(out) def forward(self, x): x = self.forward_features(x) return x ================================================ FILE: annotator/uniformer/upernet_global_small.py ================================================ _base_ = [ 'configs/_base_/models/upernet_uniformer.py', 'configs/_base_/datasets/ade20k.py', 'configs/_base_/default_runtime.py', 'configs/_base_/schedules/schedule_160k.py' ] custom_imports = dict( imports=['annotator.uniformer.uniformer'], allow_failed_imports=False ) model = dict( backbone=dict( type='UniFormer', embed_dim=[64, 128, 320, 512], layers=[3, 4, 8, 3], head_dim=64, drop_path_rate=0.25, windows=False, hybrid=False ), decode_head=dict( in_channels=[64, 128, 320, 512], num_classes=150 ), auxiliary_head=dict( in_channels=320, num_classes=150 )) # AdamW optimizer, no weight decay for position embedding & layer norm in backbone optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01, paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.), 'relative_position_bias_table': dict(decay_mult=0.), 'norm': dict(decay_mult=0.)})) lr_config = dict(_delete_=True, policy='poly', warmup='linear', warmup_iters=1500, warmup_ratio=1e-6, power=1.0, min_lr=0.0, by_epoch=False) data=dict(samples_per_gpu=2) ================================================ FILE: annotator/util.py ================================================ import numpy as np import cv2 import os def load_model(filename: str, remote_url: str, model_dir: str) -> str: """ Load the model from the specified filename and remote URL if it doesn't exist locally. Args: filename (str): The filename of the model. remote_url (str): The remote URL of the model. """ local_path = os.path.join(model_dir, filename) if not os.path.exists(local_path): from scripts.utils import load_file_from_url load_file_from_url(remote_url, model_dir=model_dir) return local_path def HWC3(x): assert x.dtype == np.uint8 if x.ndim == 2: x = x[:, :, None] assert x.ndim == 3 H, W, C = x.shape assert C == 1 or C == 3 or C == 4 if C == 3: return x if C == 1: return np.concatenate([x, x, x], axis=2) if C == 4: color = x[:, :, 0:3].astype(np.float32) alpha = x[:, :, 3:4].astype(np.float32) / 255.0 y = color * alpha + 255.0 * (1.0 - alpha) y = y.clip(0, 255).astype(np.uint8) return y def make_noise_disk(H, W, C, F): noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C)) noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC) noise = noise[F: F + H, F: F + W] noise -= np.min(noise) noise /= np.max(noise) if C == 1: noise = noise[:, :, None] return noise def nms(x, t, s): x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s) f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8) f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8) f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8) f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8) y = np.zeros_like(x) for f in [f1, f2, f3, f4]: np.putmask(y, cv2.dilate(x, kernel=f) == x, x) z = np.zeros_like(y, dtype=np.uint8) z[y > t] = 255 return z def min_max_norm(x): x -= np.min(x) x /= np.maximum(np.max(x), 1e-5) return x def safe_step(x, step=2): y = x.astype(np.float32) * float(step + 1) y = y.astype(np.int32).astype(np.float32) / float(step) return y ================================================ FILE: annotator/zoe/LICENSE ================================================ MIT License Copyright (c) 2022 Intelligent Systems Lab Org Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/zoe/__init__.py ================================================ import os import cv2 import numpy as np import torch from einops import rearrange from .zoedepth.models.zoedepth.zoedepth_v1 import ZoeDepth from .zoedepth.utils.config import get_config from modules import devices from annotator.annotator_path import models_path class ZoeDetector: model_dir = os.path.join(models_path, "zoedepth") def __init__(self): self.model = None self.device = devices.get_device_for("controlnet") def load_model(self): remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ZoeD_M12_N.pt" modelpath = os.path.join(self.model_dir, "ZoeD_M12_N.pt") if not os.path.exists(modelpath): from scripts.utils import load_file_from_url load_file_from_url(remote_model_path, model_dir=self.model_dir) conf = get_config("zoedepth", "infer") model = ZoeDepth.build_from_config(conf) model.load_state_dict(torch.load(modelpath, map_location=model.device)['model']) model.eval() self.model = model.to(self.device) def unload_model(self): if self.model is not None: self.model.cpu() def __call__(self, input_image): if self.model is None: self.load_model() self.model.to(self.device) assert input_image.ndim == 3 image_depth = input_image with torch.no_grad(): image_depth = torch.from_numpy(image_depth).float().to(self.device) image_depth = image_depth / 255.0 image_depth = rearrange(image_depth, 'h w c -> 1 c h w') depth = self.model.infer(image_depth) depth = depth[0, 0].cpu().numpy() vmin = np.percentile(depth, 2) vmax = np.percentile(depth, 85) depth -= vmin depth /= vmax - vmin depth = 1.0 - depth depth_image = (depth * 255.0).clip(0, 255).astype(np.uint8) return depth_image ================================================ FILE: annotator/zoe/zoedepth/models/__init__.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat ================================================ FILE: annotator/zoe/zoedepth/models/base_models/__init__.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas.py ================================================ # MIT License import os # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import torch import torch.nn as nn import numpy as np from torchvision.transforms import Normalize def denormalize(x): """Reverses the imagenet normalization applied to the input. Args: x (torch.Tensor - shape(N,3,H,W)): input tensor Returns: torch.Tensor - shape(N,3,H,W): Denormalized input """ mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device) std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device) return x * std + mean def get_activation(name, bank): def hook(model, input, output): bank[name] = output return hook class Resize(object): """Resize sample to given size (width, height). """ def __init__( self, width, height, resize_target=True, keep_aspect_ratio=False, ensure_multiple_of=1, resize_method="lower_bound", ): """Init. Args: width (int): desired output width height (int): desired output height resize_target (bool, optional): True: Resize the full sample (image, mask, target). False: Resize image only. Defaults to True. keep_aspect_ratio (bool, optional): True: Keep the aspect ratio of the input sample. Output sample might not have the given width and height, and resize behaviour depends on the parameter 'resize_method'. Defaults to False. ensure_multiple_of (int, optional): Output width and height is constrained to be multiple of this parameter. Defaults to 1. resize_method (str, optional): "lower_bound": Output will be at least as large as the given size. "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) "minimal": Scale as least as possible. (Output size might be smaller than given size.) Defaults to "lower_bound". """ print("Params passed to Resize transform:") print("\twidth: ", width) print("\theight: ", height) print("\tresize_target: ", resize_target) print("\tkeep_aspect_ratio: ", keep_aspect_ratio) print("\tensure_multiple_of: ", ensure_multiple_of) print("\tresize_method: ", resize_method) self.__width = width self.__height = height self.__keep_aspect_ratio = keep_aspect_ratio self.__multiple_of = ensure_multiple_of self.__resize_method = resize_method def constrain_to_multiple_of(self, x, min_val=0, max_val=None): y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) if max_val is not None and y > max_val: y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) if y < min_val: y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) return y def get_size(self, width, height): # determine new height and width scale_height = self.__height / height scale_width = self.__width / width if self.__keep_aspect_ratio: if self.__resize_method == "lower_bound": # scale such that output size is lower bound if scale_width > scale_height: # fit width scale_height = scale_width else: # fit height scale_width = scale_height elif self.__resize_method == "upper_bound": # scale such that output size is upper bound if scale_width < scale_height: # fit width scale_height = scale_width else: # fit height scale_width = scale_height elif self.__resize_method == "minimal": # scale as least as possbile if abs(1 - scale_width) < abs(1 - scale_height): # fit width scale_height = scale_width else: # fit height scale_width = scale_height else: raise ValueError( f"resize_method {self.__resize_method} not implemented" ) if self.__resize_method == "lower_bound": new_height = self.constrain_to_multiple_of( scale_height * height, min_val=self.__height ) new_width = self.constrain_to_multiple_of( scale_width * width, min_val=self.__width ) elif self.__resize_method == "upper_bound": new_height = self.constrain_to_multiple_of( scale_height * height, max_val=self.__height ) new_width = self.constrain_to_multiple_of( scale_width * width, max_val=self.__width ) elif self.__resize_method == "minimal": new_height = self.constrain_to_multiple_of(scale_height * height) new_width = self.constrain_to_multiple_of(scale_width * width) else: raise ValueError( f"resize_method {self.__resize_method} not implemented") return (new_width, new_height) def __call__(self, x): width, height = self.get_size(*x.shape[-2:][::-1]) return nn.functional.interpolate(x, (int(height), int(width)), mode='bilinear', align_corners=True) class PrepForMidas(object): def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True): if isinstance(img_size, int): img_size = (img_size, img_size) net_h, net_w = img_size self.normalization = Normalize( mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=32, resize_method=resize_mode) \ if do_resize else nn.Identity() def __call__(self, x): return self.normalization(self.resizer(x)) class MidasCore(nn.Module): def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True, img_size=384, **kwargs): """Midas Base model used for multi-scale feature extraction. Args: midas (torch.nn.Module): Midas model. trainable (bool, optional): Train midas model. Defaults to False. fetch_features (bool, optional): Extract multi-scale features. Defaults to True. layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'). freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False. keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True. img_size (int, tuple, optional): Input resolution. Defaults to 384. """ super().__init__() self.core = midas self.output_channels = None self.core_out = {} self.trainable = trainable self.fetch_features = fetch_features # midas.scratch.output_conv = nn.Identity() self.handles = [] # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1'] self.layer_names = layer_names self.set_trainable(trainable) self.set_fetch_features(fetch_features) self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio, img_size=img_size, do_resize=kwargs.get('do_resize', True)) if freeze_bn: self.freeze_bn() def set_trainable(self, trainable): self.trainable = trainable if trainable: self.unfreeze() else: self.freeze() return self def set_fetch_features(self, fetch_features): self.fetch_features = fetch_features if fetch_features: if len(self.handles) == 0: self.attach_hooks(self.core) else: self.remove_hooks() return self def freeze(self): for p in self.parameters(): p.requires_grad = False self.trainable = False return self def unfreeze(self): for p in self.parameters(): p.requires_grad = True self.trainable = True return self def freeze_bn(self): for m in self.modules(): if isinstance(m, nn.BatchNorm2d): m.eval() return self def forward(self, x, denorm=False, return_rel_depth=False): with torch.no_grad(): if denorm: x = denormalize(x) x = self.prep(x) # print("Shape after prep: ", x.shape) with torch.set_grad_enabled(self.trainable): # print("Input size to Midascore", x.shape) rel_depth = self.core(x) # print("Output from midas shape", rel_depth.shape) if not self.fetch_features: return rel_depth out = [self.core_out[k] for k in self.layer_names] if return_rel_depth: return rel_depth, out return out def get_rel_pos_params(self): for name, p in self.core.pretrained.named_parameters(): if "relative_position" in name: yield p def get_enc_params_except_rel_pos(self): for name, p in self.core.pretrained.named_parameters(): if "relative_position" not in name: yield p def freeze_encoder(self, freeze_rel_pos=False): if freeze_rel_pos: for p in self.core.pretrained.parameters(): p.requires_grad = False else: for p in self.get_enc_params_except_rel_pos(): p.requires_grad = False return self def attach_hooks(self, midas): if len(self.handles) > 0: self.remove_hooks() if "out_conv" in self.layer_names: self.handles.append(list(midas.scratch.output_conv.children())[ 3].register_forward_hook(get_activation("out_conv", self.core_out))) if "r4" in self.layer_names: self.handles.append(midas.scratch.refinenet4.register_forward_hook( get_activation("r4", self.core_out))) if "r3" in self.layer_names: self.handles.append(midas.scratch.refinenet3.register_forward_hook( get_activation("r3", self.core_out))) if "r2" in self.layer_names: self.handles.append(midas.scratch.refinenet2.register_forward_hook( get_activation("r2", self.core_out))) if "r1" in self.layer_names: self.handles.append(midas.scratch.refinenet1.register_forward_hook( get_activation("r1", self.core_out))) if "l4_rn" in self.layer_names: self.handles.append(midas.scratch.layer4_rn.register_forward_hook( get_activation("l4_rn", self.core_out))) return self def remove_hooks(self): for h in self.handles: h.remove() return self def __del__(self): self.remove_hooks() def set_output_channels(self, model_type): self.output_channels = MIDAS_SETTINGS[model_type] @staticmethod def build(midas_model_type="DPT_BEiT_L_384", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs): if midas_model_type not in MIDAS_SETTINGS: raise ValueError( f"Invalid model type: {midas_model_type}. Must be one of {list(MIDAS_SETTINGS.keys())}") if "img_size" in kwargs: kwargs = MidasCore.parse_img_size(kwargs) img_size = kwargs.pop("img_size", [384, 384]) print("img_size", img_size) midas_path = os.path.join(os.path.dirname(__file__), 'midas_repo') midas = torch.hub.load(midas_path, midas_model_type, pretrained=use_pretrained_midas, force_reload=force_reload, source='local') kwargs.update({'keep_aspect_ratio': force_keep_ar}) midas_core = MidasCore(midas, trainable=train_midas, fetch_features=fetch_features, freeze_bn=freeze_bn, img_size=img_size, **kwargs) midas_core.set_output_channels(midas_model_type) return midas_core @staticmethod def build_from_config(config): return MidasCore.build(**config) @staticmethod def parse_img_size(config): assert 'img_size' in config if isinstance(config['img_size'], str): assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W" config['img_size'] = list(map(int, config['img_size'].split(","))) assert len( config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W" elif isinstance(config['img_size'], int): config['img_size'] = [config['img_size'], config['img_size']] else: assert isinstance(config['img_size'], list) and len( config['img_size']) == 2, "img_size should be a list of H,W" return config nchannels2models = { tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"], (512, 256, 128, 64, 64): ["MiDaS_small"] } # Model name to number of output channels MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items() for m in v } ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ *.png *.pfm *.jpg *.jpeg *.pt ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/Dockerfile ================================================ # enables cuda support in docker FROM nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04 # install python 3.6, pip and requirements for opencv-python # (see https://github.com/NVIDIA/nvidia-docker/issues/864) RUN apt-get update && apt-get -y install \ python3 \ python3-pip \ libsm6 \ libxext6 \ libxrender-dev \ curl \ && rm -rf /var/lib/apt/lists/* # install python dependencies RUN pip3 install --upgrade pip RUN pip3 install torch~=1.8 torchvision opencv-python-headless~=3.4 timm # copy inference code WORKDIR /opt/MiDaS COPY ./midas ./midas COPY ./*.py ./ # download model weights so the docker image can be used offline RUN cd weights && {curl -OL https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt; cd -; } RUN python3 run.py --model_type dpt_hybrid; exit 0 # entrypoint (dont forget to mount input and output directories) CMD python3 run.py --model_type dpt_hybrid ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/LICENSE ================================================ MIT License Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/README.md ================================================ ## Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer This repository contains code to compute depth from a single image. It accompanies our [paper](https://arxiv.org/abs/1907.01341v3): >Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer René Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, Vladlen Koltun and our [preprint](https://arxiv.org/abs/2103.13413): > Vision Transformers for Dense Prediction > René Ranftl, Alexey Bochkovskiy, Vladlen Koltun MiDaS was trained on up to 12 datasets (ReDWeb, DIML, Movies, MegaDepth, WSVD, TartanAir, HRWSI, ApolloScape, BlendedMVS, IRS, KITTI, NYU Depth V2) with multi-objective optimization. The original model that was trained on 5 datasets (`MIX 5` in the paper) can be found [here](https://github.com/isl-org/MiDaS/releases/tag/v2). The figure below shows an overview of the different MiDaS models; the bubble size scales with number of parameters. ![](figures/Improvement_vs_FPS.png) ### Setup 1) Pick one or more models and download the corresponding weights to the `weights` folder: MiDaS 3.1 - For highest quality: [dpt_beit_large_512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) - For moderately less quality, but better speed-performance trade-off: [dpt_swin2_large_384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt) - For embedded devices: [dpt_swin2_tiny_256](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt), [dpt_levit_224](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt) - For inference on Intel CPUs, OpenVINO may be used for the small legacy model: openvino_midas_v21_small [.xml](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.xml), [.bin](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.bin) MiDaS 3.0: Legacy transformer models [dpt_large_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) and [dpt_hybrid_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt) MiDaS 2.1: Legacy convolutional models [midas_v21_384](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) and [midas_v21_small_256](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt) 1) Set up dependencies: ```shell conda env create -f environment.yaml conda activate midas-py310 ``` #### optional For the Next-ViT model, execute ```shell git submodule add https://github.com/isl-org/Next-ViT midas/external/next_vit ``` For the OpenVINO model, install ```shell pip install openvino ``` ### Usage 1) Place one or more input images in the folder `input`. 2) Run the model with ```shell python run.py --model_type --input_path input --output_path output ``` where `````` is chosen from [dpt_beit_large_512](#model_type), [dpt_beit_large_384](#model_type), [dpt_beit_base_384](#model_type), [dpt_swin2_large_384](#model_type), [dpt_swin2_base_384](#model_type), [dpt_swin2_tiny_256](#model_type), [dpt_swin_large_384](#model_type), [dpt_next_vit_large_384](#model_type), [dpt_levit_224](#model_type), [dpt_large_384](#model_type), [dpt_hybrid_384](#model_type), [midas_v21_384](#model_type), [midas_v21_small_256](#model_type), [openvino_midas_v21_small_256](#model_type). 3) The resulting depth maps are written to the `output` folder. #### optional 1) By default, the inference resizes the height of input images to the size of a model to fit into the encoder. This size is given by the numbers in the model names of the [accuracy table](#accuracy). Some models do not only support a single inference height but a range of different heights. Feel free to explore different heights by appending the extra command line argument `--height`. Unsupported height values will throw an error. Note that using this argument may decrease the model accuracy. 2) By default, the inference keeps the aspect ratio of input images when feeding them into the encoder if this is supported by a model (all models except for Swin, Swin2, LeViT). In order to resize to a square resolution, disregarding the aspect ratio while preserving the height, use the command line argument `--square`. #### via Camera If you want the input images to be grabbed from the camera and shown in a window, leave the input and output paths away and choose a model type as shown above: ```shell python run.py --model_type --side ``` The argument `--side` is optional and causes both the input RGB image and the output depth map to be shown side-by-side for comparison. #### via Docker 1) Make sure you have installed Docker and the [NVIDIA Docker runtime](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-\(Native-GPU-Support\)). 2) Build the Docker image: ```shell docker build -t midas . ``` 3) Run inference: ```shell docker run --rm --gpus all -v $PWD/input:/opt/MiDaS/input -v $PWD/output:/opt/MiDaS/output -v $PWD/weights:/opt/MiDaS/weights midas ``` This command passes through all of your NVIDIA GPUs to the container, mounts the `input` and `output` directories and then runs the inference. #### via PyTorch Hub The pretrained model is also available on [PyTorch Hub](https://pytorch.org/hub/intelisl_midas_v2/) #### via TensorFlow or ONNX See [README](https://github.com/isl-org/MiDaS/tree/master/tf) in the `tf` subdirectory. Currently only supports MiDaS v2.1. #### via Mobile (iOS / Android) See [README](https://github.com/isl-org/MiDaS/tree/master/mobile) in the `mobile` subdirectory. #### via ROS1 (Robot Operating System) See [README](https://github.com/isl-org/MiDaS/tree/master/ros) in the `ros` subdirectory. Currently only supports MiDaS v2.1. DPT-based models to be added. ### Accuracy We provide a **zero-shot error** $\epsilon_d$ which is evaluated for 6 different datasets (see [paper](https://arxiv.org/abs/1907.01341v3)). **Lower error values are better**. $\color{green}{\textsf{Overall model quality is represented by the improvement}}$ ([Imp.](#improvement)) with respect to MiDaS 3.0 DPTL-384. The models are grouped by the height used for inference, whereas the square training resolution is given by the numbers in the model names. The table also shows the **number of parameters** (in millions) and the **frames per second** for inference at the training resolution (for GPU RTX 3090): | MiDaS Model | DIW
WHDR | Eth3d
AbsRel | Sintel
AbsRel | TUM
δ1 | KITTI
δ1 | NYUv2
δ1 | $\color{green}{\textsf{Imp.}}$
% | Par.
M | FPS
  | |-----------------------------------------------------------------------------------------------------------------------|-------------------------:|-----------------------------:|------------------------------:|-------------------------:|-------------------------:|-------------------------:|-------------------------------------------------:|----------------------:|--------------------------:| | **Inference height 512** | | | | | | | | | | | [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1137 | 0.0659 | 0.2366 | **6.13** | 11.56* | **1.86*** | $\color{green}{\textsf{19}}$ | **345** | **5.7** | | [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt)$\tiny{\square}$ | **0.1121** | **0.0614** | **0.2090** | 6.46 | **5.00*** | 1.90* | $\color{green}{\textsf{34}}$ | **345** | **5.7** | | | | | | | | | | | | | **Inference height 384** | | | | | | | | | | | [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1245 | 0.0681 | **0.2176** | **6.13** | 6.28* | **2.16*** | $\color{green}{\textsf{28}}$ | 345 | 12 | | [v3.1 Swin2L-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt)$\tiny{\square}$ | 0.1106 | 0.0732 | 0.2442 | 8.87 | **5.84*** | 2.92* | $\color{green}{\textsf{22}}$ | 213 | 41 | | [v3.1 Swin2B-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt)$\tiny{\square}$ | 0.1095 | 0.0790 | 0.2404 | 8.93 | 5.97* | 3.28* | $\color{green}{\textsf{22}}$ | 102 | 39 | | [v3.1 SwinL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt)$\tiny{\square}$ | 0.1126 | 0.0853 | 0.2428 | 8.74 | 6.60* | 3.34* | $\color{green}{\textsf{17}}$ | 213 | 49 | | [v3.1 BEiTL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt) | 0.1239 | **0.0667** | 0.2545 | 7.17 | 9.84* | 2.21* | $\color{green}{\textsf{17}}$ | 344 | 13 | | [v3.1 Next-ViTL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt) | **0.1031** | 0.0954 | 0.2295 | 9.21 | 6.89* | 3.47* | $\color{green}{\textsf{16}}$ | **72** | 30 | | [v3.1 BEiTB-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt) | 0.1159 | 0.0967 | 0.2901 | 9.88 | 26.60* | 3.91* | $\color{green}{\textsf{-31}}$ | 112 | 31 | | [v3.0 DPTL-384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) | 0.1082 | 0.0888 | 0.2697 | 9.97 | 8.46 | 8.32 | $\color{green}{\textsf{0}}$ | 344 | **61** | | [v3.0 DPTH-384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt) | 0.1106 | 0.0934 | 0.2741 | 10.89 | 11.56 | 8.69 | $\color{green}{\textsf{-10}}$ | 123 | 50 | | [v2.1 Large384](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) | 0.1295 | 0.1155 | 0.3285 | 12.51 | 16.08 | 8.71 | $\color{green}{\textsf{-32}}$ | 105 | 47 | | | | | | | | | | | | | **Inference height 256** | | | | | | | | | | | [v3.1 Swin2T-256](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt)$\tiny{\square}$ | **0.1211** | **0.1106** | **0.2868** | **13.43** | **10.13*** | **5.55*** | $\color{green}{\textsf{-11}}$ | 42 | 64 | | [v2.1 Small256](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt) | 0.1344 | 0.1344 | 0.3370 | 14.53 | 29.27 | 13.43 | $\color{green}{\textsf{-76}}$ | **21** | **90** | | | | | | | | | | | | | **Inference height 224** | | | | | | | | | | | [v3.1 LeViT224](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt)$\tiny{\square}$ | **0.1314** | **0.1206** | **0.3148** | **18.21** | **15.27*** | **8.64*** | $\color{green}{\textsf{-40}}$ | **51** | **73** | * No zero-shot error, because models are also trained on KITTI and NYU Depth V2\ $\square$ Validation performed at **square resolution**, either because the transformer encoder backbone of a model does not support non-square resolutions (Swin, Swin2, LeViT) or for comparison with these models. All other validations keep the aspect ratio. A difference in resolution limits the comparability of the zero-shot error and the improvement, because these quantities are averages over the pixels of an image and do not take into account the advantage of more details due to a higher resolution.\ Best values per column and same validation height in bold #### Improvement The improvement in the above table is defined as the relative zero-shot error with respect to MiDaS v3.0 DPTL-384 and averaging over the datasets. So, if $\epsilon_d$ is the zero-shot error for dataset $d$, then the $\color{green}{\textsf{improvement}}$ is given by $100(1-(1/6)\sum_d\epsilon_d/\epsilon_{d,\rm{DPT_{L-384}}})$%. Note that the improvements of 10% for MiDaS v2.0 → v2.1 and 21% for MiDaS v2.1 → v3.0 are not visible from the improvement column (Imp.) in the table but would require an evaluation with respect to MiDaS v2.1 Large384 and v2.0 Large384 respectively instead of v3.0 DPTL-384. ### Depth map comparison Zoom in for better visibility ![](figures/Comparison.png) ### Speed on Camera Feed Test configuration - Windows 10 - 11th Gen Intel Core i7-1185G7 3.00GHz - 16GB RAM - Camera resolution 640x480 - openvino_midas_v21_small_256 Speed: 22 FPS ### Changelog * [Dec 2022] Released MiDaS v3.1: - New models based on 5 different types of transformers ([BEiT](https://arxiv.org/pdf/2106.08254.pdf), [Swin2](https://arxiv.org/pdf/2111.09883.pdf), [Swin](https://arxiv.org/pdf/2103.14030.pdf), [Next-ViT](https://arxiv.org/pdf/2207.05501.pdf), [LeViT](https://arxiv.org/pdf/2104.01136.pdf)) - Training datasets extended from 10 to 12, including also KITTI and NYU Depth V2 using [BTS](https://github.com/cleinc/bts) split - Best model, BEiTLarge 512, with resolution 512x512, is on average about [28% more accurate](#Accuracy) than MiDaS v3.0 - Integrated live depth estimation from camera feed * [Sep 2021] Integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See [Gradio Web Demo](https://huggingface.co/spaces/akhaliq/DPT-Large). * [Apr 2021] Released MiDaS v3.0: - New models based on [Dense Prediction Transformers](https://arxiv.org/abs/2103.13413) are on average [21% more accurate](#Accuracy) than MiDaS v2.1 - Additional models can be found [here](https://github.com/isl-org/DPT) * [Nov 2020] Released MiDaS v2.1: - New model that was trained on 10 datasets and is on average about [10% more accurate](#Accuracy) than [MiDaS v2.0](https://github.com/isl-org/MiDaS/releases/tag/v2) - New light-weight model that achieves [real-time performance](https://github.com/isl-org/MiDaS/tree/master/mobile) on mobile platforms. - Sample applications for [iOS](https://github.com/isl-org/MiDaS/tree/master/mobile/ios) and [Android](https://github.com/isl-org/MiDaS/tree/master/mobile/android) - [ROS package](https://github.com/isl-org/MiDaS/tree/master/ros) for easy deployment on robots * [Jul 2020] Added TensorFlow and ONNX code. Added [online demo](http://35.202.76.57/). * [Dec 2019] Released new version of MiDaS - the new model is significantly more accurate and robust * [Jul 2019] Initial release of MiDaS ([Link](https://github.com/isl-org/MiDaS/releases/tag/v1)) ### Citation Please cite our paper if you use this code or any of the models: ``` @ARTICLE {Ranftl2022, author = "Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun", title = "Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-Shot Cross-Dataset Transfer", journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence", year = "2022", volume = "44", number = "3" } ``` If you use a DPT-based model, please also cite: ``` @article{Ranftl2021, author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun}, title = {Vision Transformers for Dense Prediction}, journal = {ICCV}, year = {2021}, } ``` ### Acknowledgements Our work builds on and uses code from [timm](https://github.com/rwightman/pytorch-image-models) and [Next-ViT](https://github.com/bytedance/Next-ViT). We'd like to thank the authors for making these libraries available. ### License MIT License ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/environment.yaml ================================================ name: midas-py310 channels: - pytorch - defaults dependencies: - nvidia::cudatoolkit=11.7 - python=3.10.8 - pytorch::pytorch=1.13.0 - torchvision=0.14.0 - pip=22.3.1 - numpy=1.23.4 - pip: - opencv-python==4.6.0.66 - imutils==0.5.4 - timm==0.6.12 - einops==0.6.0 ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/hubconf.py ================================================ dependencies = ["torch"] import torch from midas.dpt_depth import DPTDepthModel from midas.midas_net import MidasNet from midas.midas_net_custom import MidasNet_small def DPT_BEiT_L_512(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_BEiT_L_512 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="beitl16_512", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_BEiT_L_384(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_BEiT_L_384 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="beitl16_384", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_BEiT_B_384(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_BEiT_B_384 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="beitb16_384", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_SwinV2_L_384(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_SwinV2_L_384 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="swin2l24_384", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_SwinV2_B_384(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_SwinV2_B_384 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="swin2b24_384", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_SwinV2_T_256(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_SwinV2_T_256 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="swin2t16_256", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_Swin_L_384(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_Swin_L_384 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="swinl12_384", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_Next_ViT_L_384(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_Next_ViT_L_384 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="next_vit_large_6m", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_LeViT_224(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT_LeViT_224 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="levit_384", non_negative=True, head_features_1=64, head_features_2=8, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_Large(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT-Large model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="vitl16_384", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def DPT_Hybrid(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS DPT-Hybrid model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = DPTDepthModel( path=None, backbone="vitb_rn50_384", non_negative=True, ) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def MiDaS(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS v2.1 model for monocular depth estimation pretrained (bool): load pretrained weights into model """ model = MidasNet() if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def MiDaS_small(pretrained=True, **kwargs): """ # This docstring shows up in hub.help() MiDaS v2.1 small model for monocular depth estimation on resource-constrained devices pretrained (bool): load pretrained weights into model """ model = MidasNet_small(None, features=64, backbone="efficientnet_lite3", exportable=True, non_negative=True, blocks={'expand': True}) if pretrained: checkpoint = ( "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt" ) state_dict = torch.hub.load_state_dict_from_url( checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True ) model.load_state_dict(state_dict) return model def transforms(): import cv2 from torchvision.transforms import Compose from midas.transforms import Resize, NormalizeImage, PrepareForNet from midas import transforms transforms.default_transform = Compose( [ lambda img: {"image": img / 255.0}, Resize( 384, 384, resize_target=None, keep_aspect_ratio=True, ensure_multiple_of=32, resize_method="upper_bound", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), PrepareForNet(), lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), ] ) transforms.small_transform = Compose( [ lambda img: {"image": img / 255.0}, Resize( 256, 256, resize_target=None, keep_aspect_ratio=True, ensure_multiple_of=32, resize_method="upper_bound", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), PrepareForNet(), lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), ] ) transforms.dpt_transform = Compose( [ lambda img: {"image": img / 255.0}, Resize( 384, 384, resize_target=None, keep_aspect_ratio=True, ensure_multiple_of=32, resize_method="minimal", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), PrepareForNet(), lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), ] ) transforms.beit512_transform = Compose( [ lambda img: {"image": img / 255.0}, Resize( 512, 512, resize_target=None, keep_aspect_ratio=True, ensure_multiple_of=32, resize_method="minimal", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), PrepareForNet(), lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), ] ) transforms.swin384_transform = Compose( [ lambda img: {"image": img / 255.0}, Resize( 384, 384, resize_target=None, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), PrepareForNet(), lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), ] ) transforms.swin256_transform = Compose( [ lambda img: {"image": img / 255.0}, Resize( 256, 256, resize_target=None, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), PrepareForNet(), lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), ] ) transforms.levit_transform = Compose( [ lambda img: {"image": img / 255.0}, Resize( 224, 224, resize_target=None, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="minimal", image_interpolation_method=cv2.INTER_CUBIC, ), NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), PrepareForNet(), lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), ] ) return transforms ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/input/.placeholder ================================================ ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/beit.py ================================================ import timm import torch import types import numpy as np import torch.nn.functional as F from .utils import forward_adapted_unflatten, make_backbone_default from timm.models.beit import gen_relative_position_index from torch.utils.checkpoint import checkpoint from typing import Optional def forward_beit(pretrained, x): return forward_adapted_unflatten(pretrained, x, "forward_features") def patch_embed_forward(self, x): """ Modification of timm.models.layers.patch_embed.py: PatchEmbed.forward to support arbitrary window sizes. """ x = self.proj(x) if self.flatten: x = x.flatten(2).transpose(1, 2) x = self.norm(x) return x def _get_rel_pos_bias(self, window_size): """ Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes. """ old_height = 2 * self.window_size[0] - 1 old_width = 2 * self.window_size[1] - 1 new_height = 2 * window_size[0] - 1 new_width = 2 * window_size[1] - 1 old_relative_position_bias_table = self.relative_position_bias_table old_num_relative_distance = self.num_relative_distance new_num_relative_distance = new_height * new_width + 3 old_sub_table = old_relative_position_bias_table[:old_num_relative_distance - 3] old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2) new_sub_table = F.interpolate(old_sub_table, size=(int(new_height), int(new_width)), mode="bilinear") new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1) new_relative_position_bias_table = torch.cat( [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3:]]) key = str(window_size[1]) + "," + str(window_size[0]) if key not in self.relative_position_indices.keys(): self.relative_position_indices[key] = gen_relative_position_index(window_size) relative_position_bias = new_relative_position_bias_table[ self.relative_position_indices[key].view(-1)].view( window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww return relative_position_bias.unsqueeze(0) def attention_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None): """ Modification of timm.models.beit.py: Attention.forward to support arbitrary window sizes. """ B, N, C = x.shape qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) if self.q_bias is not None else None qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = (q @ k.transpose(-2, -1)) if self.relative_position_bias_table is not None: window_size = tuple(np.array(resolution) // 16) attn = attn + self._get_rel_pos_bias(window_size) if shared_rel_pos_bias is not None: attn = attn + shared_rel_pos_bias attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, -1) x = self.proj(x) x = self.proj_drop(x) return x def block_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None): """ Modification of timm.models.beit.py: Block.forward to support arbitrary window sizes. """ if hasattr(self, 'drop_path1') and not hasattr(self, 'drop_path'): self.drop_path = self.drop_path1 if self.gamma_1 is None: x = x + self.drop_path(self.attn(self.norm1(x), resolution, shared_rel_pos_bias=shared_rel_pos_bias)) x = x + self.drop_path(self.mlp(self.norm2(x))) else: x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), resolution, shared_rel_pos_bias=shared_rel_pos_bias)) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x def beit_forward_features(self, x): """ Modification of timm.models.beit.py: Beit.forward_features to support arbitrary window sizes. """ resolution = x.shape[2:] x = self.patch_embed(x) x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) if self.pos_embed is not None: x = x + self.pos_embed x = self.pos_drop(x) rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None for blk in self.blocks: if self.grad_checkpointing and not torch.jit.is_scripting(): x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias) else: x = blk(x, resolution, shared_rel_pos_bias=rel_pos_bias) x = self.norm(x) return x def _make_beit_backbone( model, features=[96, 192, 384, 768], size=[384, 384], hooks=[0, 4, 8, 11], vit_features=768, use_readout="ignore", start_index=1, start_index_readout=1, ): backbone = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index, start_index_readout) backbone.model.patch_embed.forward = types.MethodType(patch_embed_forward, backbone.model.patch_embed) backbone.model.forward_features = types.MethodType(beit_forward_features, backbone.model) for block in backbone.model.blocks: attn = block.attn attn._get_rel_pos_bias = types.MethodType(_get_rel_pos_bias, attn) attn.forward = types.MethodType(attention_forward, attn) attn.relative_position_indices = {} block.forward = types.MethodType(block_forward, block) return backbone def _make_pretrained_beitl16_512(pretrained, use_readout="ignore", hooks=None): model = timm.create_model("beit_large_patch16_512", pretrained=pretrained) hooks = [5, 11, 17, 23] if hooks is None else hooks features = [256, 512, 1024, 1024] return _make_beit_backbone( model, features=features, size=[512, 512], hooks=hooks, vit_features=1024, use_readout=use_readout, ) def _make_pretrained_beitl16_384(pretrained, use_readout="ignore", hooks=None): model = timm.create_model("beit_large_patch16_384", pretrained=pretrained) hooks = [5, 11, 17, 23] if hooks is None else hooks return _make_beit_backbone( model, features=[256, 512, 1024, 1024], hooks=hooks, vit_features=1024, use_readout=use_readout, ) def _make_pretrained_beitb16_384(pretrained, use_readout="ignore", hooks=None): model = timm.create_model("beit_base_patch16_384", pretrained=pretrained) hooks = [2, 5, 8, 11] if hooks is None else hooks return _make_beit_backbone( model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout, ) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/levit.py ================================================ import timm import torch import torch.nn as nn import numpy as np from .utils import activations, get_activation, Transpose def forward_levit(pretrained, x): pretrained.model.forward_features(x) layer_1 = pretrained.activations["1"] layer_2 = pretrained.activations["2"] layer_3 = pretrained.activations["3"] layer_1 = pretrained.act_postprocess1(layer_1) layer_2 = pretrained.act_postprocess2(layer_2) layer_3 = pretrained.act_postprocess3(layer_3) return layer_1, layer_2, layer_3 def _make_levit_backbone( model, hooks=[3, 11, 21], patch_grid=[14, 14] ): pretrained = nn.Module() pretrained.model = model pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) pretrained.activations = activations patch_grid_size = np.array(patch_grid, dtype=int) pretrained.act_postprocess1 = nn.Sequential( Transpose(1, 2), nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) ) pretrained.act_postprocess2 = nn.Sequential( Transpose(1, 2), nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 2).astype(int)).tolist())) ) pretrained.act_postprocess3 = nn.Sequential( Transpose(1, 2), nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 4).astype(int)).tolist())) ) return pretrained class ConvTransposeNorm(nn.Sequential): """ Modification of https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: ConvNorm such that ConvTranspose2d is used instead of Conv2d. """ def __init__( self, in_chs, out_chs, kernel_size=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1): super().__init__() self.add_module('c', nn.ConvTranspose2d(in_chs, out_chs, kernel_size, stride, pad, dilation, groups, bias=False)) self.add_module('bn', nn.BatchNorm2d(out_chs)) nn.init.constant_(self.bn.weight, bn_weight_init) @torch.no_grad() def fuse(self): c, bn = self._modules.values() w = bn.weight / (bn.running_var + bn.eps) ** 0.5 w = c.weight * w[:, None, None, None] b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 m = nn.ConvTranspose2d( w.size(1), w.size(0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups) m.weight.data.copy_(w) m.bias.data.copy_(b) return m def stem_b4_transpose(in_chs, out_chs, activation): """ Modification of https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: stem_b16 such that ConvTranspose2d is used instead of Conv2d and stem is also reduced to the half. """ return nn.Sequential( ConvTransposeNorm(in_chs, out_chs, 3, 2, 1), activation(), ConvTransposeNorm(out_chs, out_chs // 2, 3, 2, 1), activation()) def _make_pretrained_levit_384(pretrained, hooks=None): model = timm.create_model("levit_384", pretrained=pretrained) hooks = [3, 11, 21] if hooks == None else hooks return _make_levit_backbone( model, hooks=hooks ) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/next_vit.py ================================================ import timm import torch.nn as nn from pathlib import Path from .utils import activations, forward_default, get_activation from ..external.next_vit.classification.nextvit import * def forward_next_vit(pretrained, x): return forward_default(pretrained, x, "forward") def _make_next_vit_backbone( model, hooks=[2, 6, 36, 39], ): pretrained = nn.Module() pretrained.model = model pretrained.model.features[hooks[0]].register_forward_hook(get_activation("1")) pretrained.model.features[hooks[1]].register_forward_hook(get_activation("2")) pretrained.model.features[hooks[2]].register_forward_hook(get_activation("3")) pretrained.model.features[hooks[3]].register_forward_hook(get_activation("4")) pretrained.activations = activations return pretrained def _make_pretrained_next_vit_large_6m(hooks=None): model = timm.create_model("nextvit_large") hooks = [2, 6, 36, 39] if hooks == None else hooks return _make_next_vit_backbone( model, hooks=hooks, ) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin.py ================================================ import timm from .swin_common import _make_swin_backbone def _make_pretrained_swinl12_384(pretrained, hooks=None): model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained) hooks = [1, 1, 17, 1] if hooks == None else hooks return _make_swin_backbone( model, hooks=hooks ) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin2.py ================================================ import timm from .swin_common import _make_swin_backbone def _make_pretrained_swin2l24_384(pretrained, hooks=None): model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained) hooks = [1, 1, 17, 1] if hooks == None else hooks return _make_swin_backbone( model, hooks=hooks ) def _make_pretrained_swin2b24_384(pretrained, hooks=None): model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained) hooks = [1, 1, 17, 1] if hooks == None else hooks return _make_swin_backbone( model, hooks=hooks ) def _make_pretrained_swin2t16_256(pretrained, hooks=None): model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained) hooks = [1, 1, 5, 1] if hooks == None else hooks return _make_swin_backbone( model, hooks=hooks, patch_grid=[64, 64] ) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin_common.py ================================================ import torch import torch.nn as nn import numpy as np from .utils import activations, forward_default, get_activation, Transpose def forward_swin(pretrained, x): return forward_default(pretrained, x) def _make_swin_backbone( model, hooks=[1, 1, 17, 1], patch_grid=[96, 96] ): pretrained = nn.Module() pretrained.model = model pretrained.model.layers[0].blocks[hooks[0]].register_forward_hook(get_activation("1")) pretrained.model.layers[1].blocks[hooks[1]].register_forward_hook(get_activation("2")) pretrained.model.layers[2].blocks[hooks[2]].register_forward_hook(get_activation("3")) pretrained.model.layers[3].blocks[hooks[3]].register_forward_hook(get_activation("4")) pretrained.activations = activations if hasattr(model, "patch_grid"): used_patch_grid = model.patch_grid else: used_patch_grid = patch_grid patch_grid_size = np.array(used_patch_grid, dtype=int) pretrained.act_postprocess1 = nn.Sequential( Transpose(1, 2), nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) ) pretrained.act_postprocess2 = nn.Sequential( Transpose(1, 2), nn.Unflatten(2, torch.Size((patch_grid_size // 2).tolist())) ) pretrained.act_postprocess3 = nn.Sequential( Transpose(1, 2), nn.Unflatten(2, torch.Size((patch_grid_size // 4).tolist())) ) pretrained.act_postprocess4 = nn.Sequential( Transpose(1, 2), nn.Unflatten(2, torch.Size((patch_grid_size // 8).tolist())) ) return pretrained ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/utils.py ================================================ import torch import torch.nn as nn class Slice(nn.Module): def __init__(self, start_index=1): super(Slice, self).__init__() self.start_index = start_index def forward(self, x): return x[:, self.start_index:] class AddReadout(nn.Module): def __init__(self, start_index=1): super(AddReadout, self).__init__() self.start_index = start_index def forward(self, x): if self.start_index == 2: readout = (x[:, 0] + x[:, 1]) / 2 else: readout = x[:, 0] return x[:, self.start_index:] + readout.unsqueeze(1) class ProjectReadout(nn.Module): def __init__(self, in_features, start_index=1): super(ProjectReadout, self).__init__() self.start_index = start_index self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU()) def forward(self, x): readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) features = torch.cat((x[:, self.start_index:], readout), -1) return self.project(features) class Transpose(nn.Module): def __init__(self, dim0, dim1): super(Transpose, self).__init__() self.dim0 = dim0 self.dim1 = dim1 def forward(self, x): x = x.transpose(self.dim0, self.dim1) return x activations = {} def get_activation(name): def hook(model, input, output): activations[name] = output return hook def forward_default(pretrained, x, function_name="forward_features"): exec(f"pretrained.model.{function_name}(x)") layer_1 = pretrained.activations["1"] layer_2 = pretrained.activations["2"] layer_3 = pretrained.activations["3"] layer_4 = pretrained.activations["4"] if hasattr(pretrained, "act_postprocess1"): layer_1 = pretrained.act_postprocess1(layer_1) if hasattr(pretrained, "act_postprocess2"): layer_2 = pretrained.act_postprocess2(layer_2) if hasattr(pretrained, "act_postprocess3"): layer_3 = pretrained.act_postprocess3(layer_3) if hasattr(pretrained, "act_postprocess4"): layer_4 = pretrained.act_postprocess4(layer_4) return layer_1, layer_2, layer_3, layer_4 def forward_adapted_unflatten(pretrained, x, function_name="forward_features"): b, c, h, w = x.shape exec(f"glob = pretrained.model.{function_name}(x)") layer_1 = pretrained.activations["1"] layer_2 = pretrained.activations["2"] layer_3 = pretrained.activations["3"] layer_4 = pretrained.activations["4"] layer_1 = pretrained.act_postprocess1[0:2](layer_1) layer_2 = pretrained.act_postprocess2[0:2](layer_2) layer_3 = pretrained.act_postprocess3[0:2](layer_3) layer_4 = pretrained.act_postprocess4[0:2](layer_4) unflatten = nn.Sequential( nn.Unflatten( 2, torch.Size( [ h // pretrained.model.patch_size[1], w // pretrained.model.patch_size[0], ] ), ) ) if layer_1.ndim == 3: layer_1 = unflatten(layer_1) if layer_2.ndim == 3: layer_2 = unflatten(layer_2) if layer_3.ndim == 3: layer_3 = unflatten(layer_3) if layer_4.ndim == 3: layer_4 = unflatten(layer_4) layer_1 = pretrained.act_postprocess1[3: len(pretrained.act_postprocess1)](layer_1) layer_2 = pretrained.act_postprocess2[3: len(pretrained.act_postprocess2)](layer_2) layer_3 = pretrained.act_postprocess3[3: len(pretrained.act_postprocess3)](layer_3) layer_4 = pretrained.act_postprocess4[3: len(pretrained.act_postprocess4)](layer_4) return layer_1, layer_2, layer_3, layer_4 def get_readout_oper(vit_features, features, use_readout, start_index=1): if use_readout == "ignore": readout_oper = [Slice(start_index)] * len(features) elif use_readout == "add": readout_oper = [AddReadout(start_index)] * len(features) elif use_readout == "project": readout_oper = [ ProjectReadout(vit_features, start_index) for out_feat in features ] else: assert ( False ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" return readout_oper def make_backbone_default( model, features=[96, 192, 384, 768], size=[384, 384], hooks=[2, 5, 8, 11], vit_features=768, use_readout="ignore", start_index=1, start_index_readout=1, ): pretrained = nn.Module() pretrained.model = model pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) pretrained.activations = activations readout_oper = get_readout_oper(vit_features, features, use_readout, start_index_readout) # 32, 48, 136, 384 pretrained.act_postprocess1 = nn.Sequential( readout_oper[0], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[0], kernel_size=1, stride=1, padding=0, ), nn.ConvTranspose2d( in_channels=features[0], out_channels=features[0], kernel_size=4, stride=4, padding=0, bias=True, dilation=1, groups=1, ), ) pretrained.act_postprocess2 = nn.Sequential( readout_oper[1], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[1], kernel_size=1, stride=1, padding=0, ), nn.ConvTranspose2d( in_channels=features[1], out_channels=features[1], kernel_size=2, stride=2, padding=0, bias=True, dilation=1, groups=1, ), ) pretrained.act_postprocess3 = nn.Sequential( readout_oper[2], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[2], kernel_size=1, stride=1, padding=0, ), ) pretrained.act_postprocess4 = nn.Sequential( readout_oper[3], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[3], kernel_size=1, stride=1, padding=0, ), nn.Conv2d( in_channels=features[3], out_channels=features[3], kernel_size=3, stride=2, padding=1, ), ) pretrained.model.start_index = start_index pretrained.model.patch_size = [16, 16] return pretrained ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/vit.py ================================================ import torch import torch.nn as nn import timm import types import math import torch.nn.functional as F from .utils import (activations, forward_adapted_unflatten, get_activation, get_readout_oper, make_backbone_default, Transpose) def forward_vit(pretrained, x): return forward_adapted_unflatten(pretrained, x, "forward_flex") def _resize_pos_embed(self, posemb, gs_h, gs_w): posemb_tok, posemb_grid = ( posemb[:, : self.start_index], posemb[0, self.start_index:], ) gs_old = int(math.sqrt(len(posemb_grid))) posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) posemb = torch.cat([posemb_tok, posemb_grid], dim=1) return posemb def forward_flex(self, x): b, c, h, w = x.shape pos_embed = self._resize_pos_embed( self.pos_embed, h // self.patch_size[1], w // self.patch_size[0] ) B = x.shape[0] if hasattr(self.patch_embed, "backbone"): x = self.patch_embed.backbone(x) if isinstance(x, (list, tuple)): x = x[-1] # last feature if backbone outputs list/tuple of features x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) if getattr(self, "dist_token", None) is not None: cls_tokens = self.cls_token.expand( B, -1, -1 ) # stole cls_tokens impl from Phil Wang, thanks dist_token = self.dist_token.expand(B, -1, -1) x = torch.cat((cls_tokens, dist_token, x), dim=1) else: if self.no_embed_class: x = x + pos_embed cls_tokens = self.cls_token.expand( B, -1, -1 ) # stole cls_tokens impl from Phil Wang, thanks x = torch.cat((cls_tokens, x), dim=1) if not self.no_embed_class: x = x + pos_embed x = self.pos_drop(x) for blk in self.blocks: x = blk(x) x = self.norm(x) return x def _make_vit_b16_backbone( model, features=[96, 192, 384, 768], size=[384, 384], hooks=[2, 5, 8, 11], vit_features=768, use_readout="ignore", start_index=1, start_index_readout=1, ): pretrained = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index, start_index_readout) # We inject this function into the VisionTransformer instances so that # we can use it with interpolated position embeddings without modifying the library source. pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) pretrained.model._resize_pos_embed = types.MethodType( _resize_pos_embed, pretrained.model ) return pretrained def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None): model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) hooks = [5, 11, 17, 23] if hooks == None else hooks return _make_vit_b16_backbone( model, features=[256, 512, 1024, 1024], hooks=hooks, vit_features=1024, use_readout=use_readout, ) def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None): model = timm.create_model("vit_base_patch16_384", pretrained=pretrained) hooks = [2, 5, 8, 11] if hooks == None else hooks return _make_vit_b16_backbone( model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout ) def _make_vit_b_rn50_backbone( model, features=[256, 512, 768, 768], size=[384, 384], hooks=[0, 1, 8, 11], vit_features=768, patch_size=[16, 16], number_stages=2, use_vit_only=False, use_readout="ignore", start_index=1, ): pretrained = nn.Module() pretrained.model = model used_number_stages = 0 if use_vit_only else number_stages for s in range(used_number_stages): pretrained.model.patch_embed.backbone.stages[s].register_forward_hook( get_activation(str(s + 1)) ) for s in range(used_number_stages, 4): pretrained.model.blocks[hooks[s]].register_forward_hook(get_activation(str(s + 1))) pretrained.activations = activations readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) for s in range(used_number_stages): value = nn.Sequential(nn.Identity(), nn.Identity(), nn.Identity()) exec(f"pretrained.act_postprocess{s + 1}=value") for s in range(used_number_stages, 4): if s < number_stages: final_layer = nn.ConvTranspose2d( in_channels=features[s], out_channels=features[s], kernel_size=4 // (2 ** s), stride=4 // (2 ** s), padding=0, bias=True, dilation=1, groups=1, ) elif s > number_stages: final_layer = nn.Conv2d( in_channels=features[3], out_channels=features[3], kernel_size=3, stride=2, padding=1, ) else: final_layer = None layers = [ readout_oper[s], Transpose(1, 2), nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), nn.Conv2d( in_channels=vit_features, out_channels=features[s], kernel_size=1, stride=1, padding=0, ), ] if final_layer is not None: layers.append(final_layer) value = nn.Sequential(*layers) exec(f"pretrained.act_postprocess{s + 1}=value") pretrained.model.start_index = start_index pretrained.model.patch_size = patch_size # We inject this function into the VisionTransformer instances so that # we can use it with interpolated position embeddings without modifying the library source. pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) # We inject this function into the VisionTransformer instances so that # we can use it with interpolated position embeddings without modifying the library source. pretrained.model._resize_pos_embed = types.MethodType( _resize_pos_embed, pretrained.model ) return pretrained def _make_pretrained_vitb_rn50_384( pretrained, use_readout="ignore", hooks=None, use_vit_only=False ): model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained) hooks = [0, 1, 8, 11] if hooks == None else hooks return _make_vit_b_rn50_backbone( model, features=[256, 512, 768, 768], size=[384, 384], hooks=hooks, use_vit_only=use_vit_only, use_readout=use_readout, ) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/base_model.py ================================================ import torch class BaseModel(torch.nn.Module): def load(self, path): """Load model from file. Args: path (str): file path """ parameters = torch.load(path, map_location=torch.device('cpu')) if "optimizer" in parameters: parameters = parameters["model"] self.load_state_dict(parameters) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/blocks.py ================================================ import torch import torch.nn as nn from .backbones.beit import ( _make_pretrained_beitl16_512, _make_pretrained_beitl16_384, _make_pretrained_beitb16_384, forward_beit, ) from .backbones.swin_common import ( forward_swin, ) from .backbones.swin2 import ( _make_pretrained_swin2l24_384, _make_pretrained_swin2b24_384, _make_pretrained_swin2t16_256, ) from .backbones.swin import ( _make_pretrained_swinl12_384, ) from .backbones.levit import ( _make_pretrained_levit_384, forward_levit, ) from .backbones.vit import ( _make_pretrained_vitb_rn50_384, _make_pretrained_vitl16_384, _make_pretrained_vitb16_384, forward_vit, ) def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore", in_features=[96, 256, 512, 1024]): if backbone == "beitl16_512": pretrained = _make_pretrained_beitl16_512( use_pretrained, hooks=hooks, use_readout=use_readout ) scratch = _make_scratch( [256, 512, 1024, 1024], features, groups=groups, expand=expand ) # BEiT_512-L (backbone) elif backbone == "beitl16_384": pretrained = _make_pretrained_beitl16_384( use_pretrained, hooks=hooks, use_readout=use_readout ) scratch = _make_scratch( [256, 512, 1024, 1024], features, groups=groups, expand=expand ) # BEiT_384-L (backbone) elif backbone == "beitb16_384": pretrained = _make_pretrained_beitb16_384( use_pretrained, hooks=hooks, use_readout=use_readout ) scratch = _make_scratch( [96, 192, 384, 768], features, groups=groups, expand=expand ) # BEiT_384-B (backbone) elif backbone == "swin2l24_384": pretrained = _make_pretrained_swin2l24_384( use_pretrained, hooks=hooks ) scratch = _make_scratch( [192, 384, 768, 1536], features, groups=groups, expand=expand ) # Swin2-L/12to24 (backbone) elif backbone == "swin2b24_384": pretrained = _make_pretrained_swin2b24_384( use_pretrained, hooks=hooks ) scratch = _make_scratch( [128, 256, 512, 1024], features, groups=groups, expand=expand ) # Swin2-B/12to24 (backbone) elif backbone == "swin2t16_256": pretrained = _make_pretrained_swin2t16_256( use_pretrained, hooks=hooks ) scratch = _make_scratch( [96, 192, 384, 768], features, groups=groups, expand=expand ) # Swin2-T/16 (backbone) elif backbone == "swinl12_384": pretrained = _make_pretrained_swinl12_384( use_pretrained, hooks=hooks ) scratch = _make_scratch( [192, 384, 768, 1536], features, groups=groups, expand=expand ) # Swin-L/12 (backbone) elif backbone == "next_vit_large_6m": from .backbones.next_vit import _make_pretrained_next_vit_large_6m pretrained = _make_pretrained_next_vit_large_6m(hooks=hooks) scratch = _make_scratch( in_features, features, groups=groups, expand=expand ) # Next-ViT-L on ImageNet-1K-6M (backbone) elif backbone == "levit_384": pretrained = _make_pretrained_levit_384( use_pretrained, hooks=hooks ) scratch = _make_scratch( [384, 512, 768], features, groups=groups, expand=expand ) # LeViT 384 (backbone) elif backbone == "vitl16_384": pretrained = _make_pretrained_vitl16_384( use_pretrained, hooks=hooks, use_readout=use_readout ) scratch = _make_scratch( [256, 512, 1024, 1024], features, groups=groups, expand=expand ) # ViT-L/16 - 85.0% Top1 (backbone) elif backbone == "vitb_rn50_384": pretrained = _make_pretrained_vitb_rn50_384( use_pretrained, hooks=hooks, use_vit_only=use_vit_only, use_readout=use_readout, ) scratch = _make_scratch( [256, 512, 768, 768], features, groups=groups, expand=expand ) # ViT-H/16 - 85.0% Top1 (backbone) elif backbone == "vitb16_384": pretrained = _make_pretrained_vitb16_384( use_pretrained, hooks=hooks, use_readout=use_readout ) scratch = _make_scratch( [96, 192, 384, 768], features, groups=groups, expand=expand ) # ViT-B/16 - 84.6% Top1 (backbone) elif backbone == "resnext101_wsl": pretrained = _make_pretrained_resnext101_wsl(use_pretrained) scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3 elif backbone == "efficientnet_lite3": pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable) scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3 else: print(f"Backbone '{backbone}' not implemented") assert False return pretrained, scratch def _make_scratch(in_shape, out_shape, groups=1, expand=False): scratch = nn.Module() out_shape1 = out_shape out_shape2 = out_shape out_shape3 = out_shape if len(in_shape) >= 4: out_shape4 = out_shape if expand: out_shape1 = out_shape out_shape2 = out_shape*2 out_shape3 = out_shape*4 if len(in_shape) >= 4: out_shape4 = out_shape*8 scratch.layer1_rn = nn.Conv2d( in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups ) scratch.layer2_rn = nn.Conv2d( in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups ) scratch.layer3_rn = nn.Conv2d( in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups ) if len(in_shape) >= 4: scratch.layer4_rn = nn.Conv2d( in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups ) return scratch def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): efficientnet = torch.hub.load( "rwightman/gen-efficientnet-pytorch", "tf_efficientnet_lite3", pretrained=use_pretrained, exportable=exportable ) return _make_efficientnet_backbone(efficientnet) def _make_efficientnet_backbone(effnet): pretrained = nn.Module() pretrained.layer1 = nn.Sequential( effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2] ) pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) return pretrained def _make_resnet_backbone(resnet): pretrained = nn.Module() pretrained.layer1 = nn.Sequential( resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1 ) pretrained.layer2 = resnet.layer2 pretrained.layer3 = resnet.layer3 pretrained.layer4 = resnet.layer4 return pretrained def _make_pretrained_resnext101_wsl(use_pretrained): resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl") return _make_resnet_backbone(resnet) class Interpolate(nn.Module): """Interpolation module. """ def __init__(self, scale_factor, mode, align_corners=False): """Init. Args: scale_factor (float): scaling mode (str): interpolation mode """ super(Interpolate, self).__init__() self.interp = nn.functional.interpolate self.scale_factor = scale_factor self.mode = mode self.align_corners = align_corners def forward(self, x): """Forward pass. Args: x (tensor): input Returns: tensor: interpolated data """ x = self.interp( x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners ) return x class ResidualConvUnit(nn.Module): """Residual convolution module. """ def __init__(self, features): """Init. Args: features (int): number of features """ super().__init__() self.conv1 = nn.Conv2d( features, features, kernel_size=3, stride=1, padding=1, bias=True ) self.conv2 = nn.Conv2d( features, features, kernel_size=3, stride=1, padding=1, bias=True ) self.relu = nn.ReLU(inplace=True) def forward(self, x): """Forward pass. Args: x (tensor): input Returns: tensor: output """ out = self.relu(x) out = self.conv1(out) out = self.relu(out) out = self.conv2(out) return out + x class FeatureFusionBlock(nn.Module): """Feature fusion block. """ def __init__(self, features): """Init. Args: features (int): number of features """ super(FeatureFusionBlock, self).__init__() self.resConfUnit1 = ResidualConvUnit(features) self.resConfUnit2 = ResidualConvUnit(features) def forward(self, *xs): """Forward pass. Returns: tensor: output """ output = xs[0] if len(xs) == 2: output += self.resConfUnit1(xs[1]) output = self.resConfUnit2(output) output = nn.functional.interpolate( output, scale_factor=2, mode="bilinear", align_corners=True ) return output class ResidualConvUnit_custom(nn.Module): """Residual convolution module. """ def __init__(self, features, activation, bn): """Init. Args: features (int): number of features """ super().__init__() self.bn = bn self.groups=1 self.conv1 = nn.Conv2d( features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups ) self.conv2 = nn.Conv2d( features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups ) if self.bn==True: self.bn1 = nn.BatchNorm2d(features) self.bn2 = nn.BatchNorm2d(features) self.activation = activation self.skip_add = nn.quantized.FloatFunctional() def forward(self, x): """Forward pass. Args: x (tensor): input Returns: tensor: output """ out = self.activation(x) out = self.conv1(out) if self.bn==True: out = self.bn1(out) out = self.activation(out) out = self.conv2(out) if self.bn==True: out = self.bn2(out) if self.groups > 1: out = self.conv_merge(out) return self.skip_add.add(out, x) # return out + x class FeatureFusionBlock_custom(nn.Module): """Feature fusion block. """ def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None): """Init. Args: features (int): number of features """ super(FeatureFusionBlock_custom, self).__init__() self.deconv = deconv self.align_corners = align_corners self.groups=1 self.expand = expand out_features = features if self.expand==True: out_features = features//2 self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) self.skip_add = nn.quantized.FloatFunctional() self.size=size def forward(self, *xs, size=None): """Forward pass. Returns: tensor: output """ output = xs[0] if len(xs) == 2: res = self.resConfUnit1(xs[1]) output = self.skip_add.add(output, res) # output += res output = self.resConfUnit2(output) if (size is None) and (self.size is None): modifier = {"scale_factor": 2} elif size is None: modifier = {"size": self.size} else: modifier = {"size": size} output = nn.functional.interpolate( output, **modifier, mode="bilinear", align_corners=self.align_corners ) output = self.out_conv(output) return output ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/dpt_depth.py ================================================ import torch import torch.nn as nn from .base_model import BaseModel from .blocks import ( FeatureFusionBlock_custom, Interpolate, _make_encoder, forward_beit, forward_swin, forward_levit, forward_vit, ) from .backbones.levit import stem_b4_transpose from timm.models.layers import get_act_layer def _make_fusion_block(features, use_bn, size = None): return FeatureFusionBlock_custom( features, nn.ReLU(False), deconv=False, bn=use_bn, expand=False, align_corners=True, size=size, ) class DPT(BaseModel): def __init__( self, head, features=256, backbone="vitb_rn50_384", readout="project", channels_last=False, use_bn=False, **kwargs ): super(DPT, self).__init__() self.channels_last = channels_last # For the Swin, Swin 2, LeViT and Next-ViT Transformers, the hierarchical architectures prevent setting the # hooks freely. Instead, the hooks have to be chosen according to the ranges specified in the comments. hooks = { "beitl16_512": [5, 11, 17, 23], "beitl16_384": [5, 11, 17, 23], "beitb16_384": [2, 5, 8, 11], "swin2l24_384": [1, 1, 17, 1], # Allowed ranges: [0, 1], [0, 1], [ 0, 17], [ 0, 1] "swin2b24_384": [1, 1, 17, 1], # [0, 1], [0, 1], [ 0, 17], [ 0, 1] "swin2t16_256": [1, 1, 5, 1], # [0, 1], [0, 1], [ 0, 5], [ 0, 1] "swinl12_384": [1, 1, 17, 1], # [0, 1], [0, 1], [ 0, 17], [ 0, 1] "next_vit_large_6m": [2, 6, 36, 39], # [0, 2], [3, 6], [ 7, 36], [37, 39] "levit_384": [3, 11, 21], # [0, 3], [6, 11], [14, 21] "vitb_rn50_384": [0, 1, 8, 11], "vitb16_384": [2, 5, 8, 11], "vitl16_384": [5, 11, 17, 23], }[backbone] if "next_vit" in backbone: in_features = { "next_vit_large_6m": [96, 256, 512, 1024], }[backbone] else: in_features = None # Instantiate backbone and reassemble blocks self.pretrained, self.scratch = _make_encoder( backbone, features, False, # Set to true of you want to train from scratch, uses ImageNet weights groups=1, expand=False, exportable=False, hooks=hooks, use_readout=readout, in_features=in_features, ) self.number_layers = len(hooks) if hooks is not None else 4 size_refinenet3 = None self.scratch.stem_transpose = None if "beit" in backbone: self.forward_transformer = forward_beit elif "swin" in backbone: self.forward_transformer = forward_swin elif "next_vit" in backbone: from .backbones.next_vit import forward_next_vit self.forward_transformer = forward_next_vit elif "levit" in backbone: self.forward_transformer = forward_levit size_refinenet3 = 7 self.scratch.stem_transpose = stem_b4_transpose(256, 128, get_act_layer("hard_swish")) else: self.forward_transformer = forward_vit self.scratch.refinenet1 = _make_fusion_block(features, use_bn) self.scratch.refinenet2 = _make_fusion_block(features, use_bn) self.scratch.refinenet3 = _make_fusion_block(features, use_bn, size_refinenet3) if self.number_layers >= 4: self.scratch.refinenet4 = _make_fusion_block(features, use_bn) self.scratch.output_conv = head def forward(self, x): if self.channels_last == True: x.contiguous(memory_format=torch.channels_last) layers = self.forward_transformer(self.pretrained, x) if self.number_layers == 3: layer_1, layer_2, layer_3 = layers else: layer_1, layer_2, layer_3, layer_4 = layers layer_1_rn = self.scratch.layer1_rn(layer_1) layer_2_rn = self.scratch.layer2_rn(layer_2) layer_3_rn = self.scratch.layer3_rn(layer_3) if self.number_layers >= 4: layer_4_rn = self.scratch.layer4_rn(layer_4) if self.number_layers == 3: path_3 = self.scratch.refinenet3(layer_3_rn, size=layer_2_rn.shape[2:]) else: path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:]) path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:]) path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:]) path_1 = self.scratch.refinenet1(path_2, layer_1_rn) if self.scratch.stem_transpose is not None: path_1 = self.scratch.stem_transpose(path_1) out = self.scratch.output_conv(path_1) return out class DPTDepthModel(DPT): def __init__(self, path=None, non_negative=True, **kwargs): features = kwargs["features"] if "features" in kwargs else 256 head_features_1 = kwargs["head_features_1"] if "head_features_1" in kwargs else features head_features_2 = kwargs["head_features_2"] if "head_features_2" in kwargs else 32 kwargs.pop("head_features_1", None) kwargs.pop("head_features_2", None) head = nn.Sequential( nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1), Interpolate(scale_factor=2, mode="bilinear", align_corners=True), nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1), nn.ReLU(True), nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(True) if non_negative else nn.Identity(), nn.Identity(), ) super().__init__(head, **kwargs) if path is not None: self.load(path) def forward(self, x): return super().forward(x).squeeze(dim=1) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net.py ================================================ """MidashNet: Network for monocular depth estimation trained by mixing several datasets. This file contains code that is adapted from https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py """ import torch import torch.nn as nn from .base_model import BaseModel from .blocks import FeatureFusionBlock, Interpolate, _make_encoder class MidasNet(BaseModel): """Network for monocular depth estimation. """ def __init__(self, path=None, features=256, non_negative=True): """Init. Args: path (str, optional): Path to saved model. Defaults to None. features (int, optional): Number of features. Defaults to 256. backbone (str, optional): Backbone network for encoder. Defaults to resnet50 """ print("Loading weights: ", path) super(MidasNet, self).__init__() use_pretrained = False if path is None else True self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) self.scratch.refinenet4 = FeatureFusionBlock(features) self.scratch.refinenet3 = FeatureFusionBlock(features) self.scratch.refinenet2 = FeatureFusionBlock(features) self.scratch.refinenet1 = FeatureFusionBlock(features) self.scratch.output_conv = nn.Sequential( nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), Interpolate(scale_factor=2, mode="bilinear"), nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(True) if non_negative else nn.Identity(), ) if path: self.load(path) def forward(self, x): """Forward pass. Args: x (tensor): input data (image) Returns: tensor: depth """ layer_1 = self.pretrained.layer1(x) layer_2 = self.pretrained.layer2(layer_1) layer_3 = self.pretrained.layer3(layer_2) layer_4 = self.pretrained.layer4(layer_3) layer_1_rn = self.scratch.layer1_rn(layer_1) layer_2_rn = self.scratch.layer2_rn(layer_2) layer_3_rn = self.scratch.layer3_rn(layer_3) layer_4_rn = self.scratch.layer4_rn(layer_4) path_4 = self.scratch.refinenet4(layer_4_rn) path_3 = self.scratch.refinenet3(path_4, layer_3_rn) path_2 = self.scratch.refinenet2(path_3, layer_2_rn) path_1 = self.scratch.refinenet1(path_2, layer_1_rn) out = self.scratch.output_conv(path_1) return torch.squeeze(out, dim=1) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net_custom.py ================================================ """MidashNet: Network for monocular depth estimation trained by mixing several datasets. This file contains code that is adapted from https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py """ import torch import torch.nn as nn from .base_model import BaseModel from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder class MidasNet_small(BaseModel): """Network for monocular depth estimation. """ def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True, blocks={'expand': True}): """Init. Args: path (str, optional): Path to saved model. Defaults to None. features (int, optional): Number of features. Defaults to 256. backbone (str, optional): Backbone network for encoder. Defaults to resnet50 """ print("Loading weights: ", path) super(MidasNet_small, self).__init__() use_pretrained = False if path else True self.channels_last = channels_last self.blocks = blocks self.backbone = backbone self.groups = 1 features1=features features2=features features3=features features4=features self.expand = False if "expand" in self.blocks and self.blocks['expand'] == True: self.expand = True features1=features features2=features*2 features3=features*4 features4=features*8 self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable) self.scratch.activation = nn.ReLU(False) self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners) self.scratch.output_conv = nn.Sequential( nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups), Interpolate(scale_factor=2, mode="bilinear"), nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), self.scratch.activation, nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(True) if non_negative else nn.Identity(), nn.Identity(), ) if path: self.load(path) def forward(self, x): """Forward pass. Args: x (tensor): input data (image) Returns: tensor: depth """ if self.channels_last==True: print("self.channels_last = ", self.channels_last) x.contiguous(memory_format=torch.channels_last) layer_1 = self.pretrained.layer1(x) layer_2 = self.pretrained.layer2(layer_1) layer_3 = self.pretrained.layer3(layer_2) layer_4 = self.pretrained.layer4(layer_3) layer_1_rn = self.scratch.layer1_rn(layer_1) layer_2_rn = self.scratch.layer2_rn(layer_2) layer_3_rn = self.scratch.layer3_rn(layer_3) layer_4_rn = self.scratch.layer4_rn(layer_4) path_4 = self.scratch.refinenet4(layer_4_rn) path_3 = self.scratch.refinenet3(path_4, layer_3_rn) path_2 = self.scratch.refinenet2(path_3, layer_2_rn) path_1 = self.scratch.refinenet1(path_2, layer_1_rn) out = self.scratch.output_conv(path_1) return torch.squeeze(out, dim=1) def fuse_model(m): prev_previous_type = nn.Identity() prev_previous_name = '' previous_type = nn.Identity() previous_name = '' for name, module in m.named_modules(): if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU: # print("FUSED ", prev_previous_name, previous_name, name) torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True) elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d: # print("FUSED ", prev_previous_name, previous_name) torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True) # elif previous_type == nn.Conv2d and type(module) == nn.ReLU: # print("FUSED ", previous_name, name) # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True) prev_previous_type = previous_type prev_previous_name = previous_name previous_type = type(module) previous_name = name ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/model_loader.py ================================================ import cv2 import torch from midas.dpt_depth import DPTDepthModel from midas.midas_net import MidasNet from midas.midas_net_custom import MidasNet_small from midas.transforms import Resize, NormalizeImage, PrepareForNet from torchvision.transforms import Compose default_models = { "dpt_beit_large_512": "weights/dpt_beit_large_512.pt", "dpt_beit_large_384": "weights/dpt_beit_large_384.pt", "dpt_beit_base_384": "weights/dpt_beit_base_384.pt", "dpt_swin2_large_384": "weights/dpt_swin2_large_384.pt", "dpt_swin2_base_384": "weights/dpt_swin2_base_384.pt", "dpt_swin2_tiny_256": "weights/dpt_swin2_tiny_256.pt", "dpt_swin_large_384": "weights/dpt_swin_large_384.pt", "dpt_next_vit_large_384": "weights/dpt_next_vit_large_384.pt", "dpt_levit_224": "weights/dpt_levit_224.pt", "dpt_large_384": "weights/dpt_large_384.pt", "dpt_hybrid_384": "weights/dpt_hybrid_384.pt", "midas_v21_384": "weights/midas_v21_384.pt", "midas_v21_small_256": "weights/midas_v21_small_256.pt", "openvino_midas_v21_small_256": "weights/openvino_midas_v21_small_256.xml", } def load_model(device, model_path, model_type="dpt_large_384", optimize=True, height=None, square=False): """Load the specified network. Args: device (device): the torch device used model_path (str): path to saved model model_type (str): the type of the model to be loaded optimize (bool): optimize the model to half-integer on CUDA? height (int): inference encoder image height square (bool): resize to a square resolution? Returns: The loaded network, the transform which prepares images as input to the network and the dimensions of the network input """ if "openvino" in model_type: from openvino.runtime import Core keep_aspect_ratio = not square if model_type == "dpt_beit_large_512": model = DPTDepthModel( path=model_path, backbone="beitl16_512", non_negative=True, ) net_w, net_h = 512, 512 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_beit_large_384": model = DPTDepthModel( path=model_path, backbone="beitl16_384", non_negative=True, ) net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_beit_base_384": model = DPTDepthModel( path=model_path, backbone="beitb16_384", non_negative=True, ) net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_swin2_large_384": model = DPTDepthModel( path=model_path, backbone="swin2l24_384", non_negative=True, ) net_w, net_h = 384, 384 keep_aspect_ratio = False resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_swin2_base_384": model = DPTDepthModel( path=model_path, backbone="swin2b24_384", non_negative=True, ) net_w, net_h = 384, 384 keep_aspect_ratio = False resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_swin2_tiny_256": model = DPTDepthModel( path=model_path, backbone="swin2t16_256", non_negative=True, ) net_w, net_h = 256, 256 keep_aspect_ratio = False resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_swin_large_384": model = DPTDepthModel( path=model_path, backbone="swinl12_384", non_negative=True, ) net_w, net_h = 384, 384 keep_aspect_ratio = False resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_next_vit_large_384": model = DPTDepthModel( path=model_path, backbone="next_vit_large_6m", non_negative=True, ) net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) # We change the notation from dpt_levit_224 (MiDaS notation) to levit_384 (timm notation) here, where the 224 refers # to the resolution 224x224 used by LeViT and 384 is the first entry of the embed_dim, see _cfg and model_cfgs of # https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/levit.py # (commit id: 927f031293a30afb940fff0bee34b85d9c059b0e) elif model_type == "dpt_levit_224": model = DPTDepthModel( path=model_path, backbone="levit_384", non_negative=True, head_features_1=64, head_features_2=8, ) net_w, net_h = 224, 224 keep_aspect_ratio = False resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_large_384": model = DPTDepthModel( path=model_path, backbone="vitl16_384", non_negative=True, ) net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_hybrid_384": model = DPTDepthModel( path=model_path, backbone="vitb_rn50_384", non_negative=True, ) net_w, net_h = 384, 384 resize_mode = "minimal" normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "midas_v21_384": model = MidasNet(model_path, non_negative=True) net_w, net_h = 384, 384 resize_mode = "upper_bound" normalization = NormalizeImage( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) elif model_type == "midas_v21_small_256": model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, non_negative=True, blocks={'expand': True}) net_w, net_h = 256, 256 resize_mode = "upper_bound" normalization = NormalizeImage( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) elif model_type == "openvino_midas_v21_small_256": ie = Core() uncompiled_model = ie.read_model(model=model_path) model = ie.compile_model(uncompiled_model, "CPU") net_w, net_h = 256, 256 resize_mode = "upper_bound" normalization = NormalizeImage( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) else: print(f"model_type '{model_type}' not implemented, use: --model_type large") assert False if not "openvino" in model_type: print("Model loaded, number of parameters = {:.0f}M".format(sum(p.numel() for p in model.parameters()) / 1e6)) else: print("Model loaded, optimized with OpenVINO") if "openvino" in model_type: keep_aspect_ratio = False if height is not None: net_w, net_h = height, height transform = Compose( [ Resize( net_w, net_h, resize_target=None, keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=32, resize_method=resize_mode, image_interpolation_method=cv2.INTER_CUBIC, ), normalization, PrepareForNet(), ] ) if not "openvino" in model_type: model.eval() if optimize and (device == torch.device("cuda")): if not "openvino" in model_type: model = model.to(memory_format=torch.channels_last) model = model.half() else: print("Error: OpenVINO models are already optimized. No optimization to half-float possible.") exit() if not "openvino" in model_type: model.to(device) return model, transform, net_w, net_h ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/midas/transforms.py ================================================ import numpy as np import cv2 import math def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): """Rezise the sample to ensure the given size. Keeps aspect ratio. Args: sample (dict): sample size (tuple): image size Returns: tuple: new size """ shape = list(sample["disparity"].shape) if shape[0] >= size[0] and shape[1] >= size[1]: return sample scale = [0, 0] scale[0] = size[0] / shape[0] scale[1] = size[1] / shape[1] scale = max(scale) shape[0] = math.ceil(scale * shape[0]) shape[1] = math.ceil(scale * shape[1]) # resize sample["image"] = cv2.resize( sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method ) sample["disparity"] = cv2.resize( sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST ) sample["mask"] = cv2.resize( sample["mask"].astype(np.float32), tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST, ) sample["mask"] = sample["mask"].astype(bool) return tuple(shape) class Resize(object): """Resize sample to given size (width, height). """ def __init__( self, width, height, resize_target=True, keep_aspect_ratio=False, ensure_multiple_of=1, resize_method="lower_bound", image_interpolation_method=cv2.INTER_AREA, ): """Init. Args: width (int): desired output width height (int): desired output height resize_target (bool, optional): True: Resize the full sample (image, mask, target). False: Resize image only. Defaults to True. keep_aspect_ratio (bool, optional): True: Keep the aspect ratio of the input sample. Output sample might not have the given width and height, and resize behaviour depends on the parameter 'resize_method'. Defaults to False. ensure_multiple_of (int, optional): Output width and height is constrained to be multiple of this parameter. Defaults to 1. resize_method (str, optional): "lower_bound": Output will be at least as large as the given size. "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) "minimal": Scale as least as possible. (Output size might be smaller than given size.) Defaults to "lower_bound". """ self.__width = width self.__height = height self.__resize_target = resize_target self.__keep_aspect_ratio = keep_aspect_ratio self.__multiple_of = ensure_multiple_of self.__resize_method = resize_method self.__image_interpolation_method = image_interpolation_method def constrain_to_multiple_of(self, x, min_val=0, max_val=None): y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) if max_val is not None and y > max_val: y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) if y < min_val: y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) return y def get_size(self, width, height): # determine new height and width scale_height = self.__height / height scale_width = self.__width / width if self.__keep_aspect_ratio: if self.__resize_method == "lower_bound": # scale such that output size is lower bound if scale_width > scale_height: # fit width scale_height = scale_width else: # fit height scale_width = scale_height elif self.__resize_method == "upper_bound": # scale such that output size is upper bound if scale_width < scale_height: # fit width scale_height = scale_width else: # fit height scale_width = scale_height elif self.__resize_method == "minimal": # scale as least as possbile if abs(1 - scale_width) < abs(1 - scale_height): # fit width scale_height = scale_width else: # fit height scale_width = scale_height else: raise ValueError( f"resize_method {self.__resize_method} not implemented" ) if self.__resize_method == "lower_bound": new_height = self.constrain_to_multiple_of( scale_height * height, min_val=self.__height ) new_width = self.constrain_to_multiple_of( scale_width * width, min_val=self.__width ) elif self.__resize_method == "upper_bound": new_height = self.constrain_to_multiple_of( scale_height * height, max_val=self.__height ) new_width = self.constrain_to_multiple_of( scale_width * width, max_val=self.__width ) elif self.__resize_method == "minimal": new_height = self.constrain_to_multiple_of(scale_height * height) new_width = self.constrain_to_multiple_of(scale_width * width) else: raise ValueError(f"resize_method {self.__resize_method} not implemented") return (new_width, new_height) def __call__(self, sample): width, height = self.get_size( sample["image"].shape[1], sample["image"].shape[0] ) # resize sample sample["image"] = cv2.resize( sample["image"], (width, height), interpolation=self.__image_interpolation_method, ) if self.__resize_target: if "disparity" in sample: sample["disparity"] = cv2.resize( sample["disparity"], (width, height), interpolation=cv2.INTER_NEAREST, ) if "depth" in sample: sample["depth"] = cv2.resize( sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST ) sample["mask"] = cv2.resize( sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST, ) sample["mask"] = sample["mask"].astype(bool) return sample class NormalizeImage(object): """Normlize image by given mean and std. """ def __init__(self, mean, std): self.__mean = mean self.__std = std def __call__(self, sample): sample["image"] = (sample["image"] - self.__mean) / self.__std return sample class PrepareForNet(object): """Prepare sample for usage as network input. """ def __init__(self): pass def __call__(self, sample): image = np.transpose(sample["image"], (2, 0, 1)) sample["image"] = np.ascontiguousarray(image).astype(np.float32) if "mask" in sample: sample["mask"] = sample["mask"].astype(np.float32) sample["mask"] = np.ascontiguousarray(sample["mask"]) if "disparity" in sample: disparity = sample["disparity"].astype(np.float32) sample["disparity"] = np.ascontiguousarray(disparity) if "depth" in sample: depth = sample["depth"].astype(np.float32) sample["depth"] = np.ascontiguousarray(depth) return sample ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/output/.placeholder ================================================ ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/LICENSE ================================================ MIT License Copyright (c) 2020 Alexey Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/README.md ================================================ # MiDaS for ROS1 by using LibTorch in C++ ### Requirements - Ubuntu 17.10 / 18.04 / 20.04, Debian Stretch - ROS Melodic for Ubuntu (17.10 / 18.04) / Debian Stretch, ROS Noetic for Ubuntu 20.04 - C++11 - LibTorch >= 1.6 ## Quick Start with a MiDaS Example MiDaS is a neural network to compute depth from a single image. * input from `image_topic`: `sensor_msgs/Image` - `RGB8` image with any shape * output to `midas_topic`: `sensor_msgs/Image` - `TYPE_32FC1` inverse relative depth maps in range [0 - 255] with original size and channels=1 ### Install Dependecies * install ROS Melodic for Ubuntu 17.10 / 18.04: ```bash wget https://raw.githubusercontent.com/isl-org/MiDaS/master/ros/additions/install_ros_melodic_ubuntu_17_18.sh ./install_ros_melodic_ubuntu_17_18.sh ``` or Noetic for Ubuntu 20.04: ```bash wget https://raw.githubusercontent.com/isl-org/MiDaS/master/ros/additions/install_ros_noetic_ubuntu_20.sh ./install_ros_noetic_ubuntu_20.sh ``` * install LibTorch 1.7 with CUDA 11.0: On **Jetson (ARM)**: ```bash wget https://nvidia.box.com/shared/static/wa34qwrwtk9njtyarwt5nvo6imenfy26.whl -O torch-1.7.0-cp36-cp36m-linux_aarch64.whl sudo apt-get install python3-pip libopenblas-base libopenmpi-dev pip3 install Cython pip3 install numpy torch-1.7.0-cp36-cp36m-linux_aarch64.whl ``` Or compile LibTorch from source: https://github.com/pytorch/pytorch#from-source On **Linux (x86_64)**: ```bash cd ~/ wget https://download.pytorch.org/libtorch/cu110/libtorch-cxx11-abi-shared-with-deps-1.7.0%2Bcu110.zip unzip libtorch-cxx11-abi-shared-with-deps-1.7.0+cu110.zip ``` * create symlink for OpenCV: ```bash sudo ln -s /usr/include/opencv4 /usr/include/opencv ``` * download and install MiDaS: ```bash source ~/.bashrc cd ~/ mkdir catkin_ws cd catkin_ws git clone https://github.com/isl-org/MiDaS mkdir src cp -r MiDaS/ros/* src chmod +x src/additions/*.sh chmod +x src/*.sh chmod +x src/midas_cpp/scripts/*.py cp src/additions/do_catkin_make.sh ./do_catkin_make.sh ./do_catkin_make.sh ./src/additions/downloads.sh ``` ### Usage * run only `midas` node: `~/catkin_ws/src/launch_midas_cpp.sh` #### Test * Test - capture video and show result in the window: * place any `test.mp4` video file to the directory `~/catkin_ws/src/` * run `midas` node: `~/catkin_ws/src/launch_midas_cpp.sh` * run test nodes in another terminal: `cd ~/catkin_ws/src && ./run_talker_listener_test.sh` and wait 30 seconds (to use Python 2, run command `sed -i 's/python3/python2/' ~/catkin_ws/src/midas_cpp/scripts/*.py` ) ## Mobile version of MiDaS - Monocular Depth Estimation ### Accuracy * MiDaS v2 small - ResNet50 default-decoder 384x384 * MiDaS v2.1 small - EfficientNet-Lite3 small-decoder 256x256 **Zero-shot error** (the lower - the better): | Model | DIW WHDR | Eth3d AbsRel | Sintel AbsRel | Kitti δ>1.25 | NyuDepthV2 δ>1.25 | TUM δ>1.25 | |---|---|---|---|---|---|---| | MiDaS v2 small 384x384 | **0.1248** | 0.1550 | **0.3300** | **21.81** | 15.73 | 17.00 | | MiDaS v2.1 small 256x256 | 0.1344 | **0.1344** | 0.3370 | 29.27 | **13.43** | **14.53** | | Relative improvement, % | -8 % | **+13 %** | -2 % | -34 % | **+15 %** | **+15 %** | None of Train/Valid/Test subsets of datasets (DIW, Eth3d, Sintel, Kitti, NyuDepthV2, TUM) were not involved in Training or Fine Tuning. ### Inference speed (FPS) on nVidia GPU Inference speed excluding pre and post processing, batch=1, **Frames Per Second** (the higher - the better): | Model | Jetson Nano, FPS | RTX 2080Ti, FPS | |---|---|---| | MiDaS v2 small 384x384 | 1.6 | 117 | | MiDaS v2.1 small 256x256 | 8.1 | 232 | | SpeedUp, X times | **5x** | **2x** | ### Citation This repository contains code to compute depth from a single image. It accompanies our [paper](https://arxiv.org/abs/1907.01341v3): >Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer René Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, Vladlen Koltun Please cite our paper if you use this code or any of the models: ``` @article{Ranftl2020, author = {Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun}, title = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)}, year = {2020}, } ``` ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/do_catkin_make.sh ================================================ mkdir src catkin_make source devel/setup.bash echo $ROS_PACKAGE_PATH chmod +x ./devel/setup.bash ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/downloads.sh ================================================ mkdir ~/.ros wget https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small-traced.pt cp ./model-small-traced.pt ~/.ros/model-small-traced.pt ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_melodic_ubuntu_17_18.sh ================================================ #@title { display-mode: "code" } #from http://wiki.ros.org/indigo/Installation/Ubuntu #1.2 Setup sources.list sudo sh -c 'echo "deb http://packages.ros.org/ros/ubuntu $(lsb_release -sc) main" > /etc/apt/sources.list.d/ros-latest.list' # 1.3 Setup keys sudo apt-key adv --keyserver 'hkp://keyserver.ubuntu.com:80' --recv-key C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654 sudo apt-key adv --keyserver 'hkp://ha.pool.sks-keyservers.net:80' --recv-key 421C365BD9FF1F717815A3895523BAEEB01FA116 curl -sSL 'http://keyserver.ubuntu.com/pks/lookup?op=get&search=0xC1CF6E31E6BADE8868B172B4F42ED6FBAB17C654' | sudo apt-key add - # 1.4 Installation sudo apt-get update sudo apt-get upgrade # Desktop-Full Install: sudo apt-get install ros-melodic-desktop-full printf "\nsource /opt/ros/melodic/setup.bash\n" >> ~/.bashrc # 1.5 Initialize rosdep sudo rosdep init rosdep update # 1.7 Getting rosinstall (python) sudo apt-get install python-rosinstall sudo apt-get install python-catkin-tools sudo apt-get install python-rospy sudo apt-get install python-rosdep sudo apt-get install python-roscd sudo apt-get install python-pip ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_noetic_ubuntu_20.sh ================================================ #@title { display-mode: "code" } #from http://wiki.ros.org/indigo/Installation/Ubuntu #1.2 Setup sources.list sudo sh -c 'echo "deb http://packages.ros.org/ros/ubuntu $(lsb_release -sc) main" > /etc/apt/sources.list.d/ros-latest.list' # 1.3 Setup keys sudo apt-key adv --keyserver 'hkp://keyserver.ubuntu.com:80' --recv-key C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654 curl -sSL 'http://keyserver.ubuntu.com/pks/lookup?op=get&search=0xC1CF6E31E6BADE8868B172B4F42ED6FBAB17C654' | sudo apt-key add - # 1.4 Installation sudo apt-get update sudo apt-get upgrade # Desktop-Full Install: sudo apt-get install ros-noetic-desktop-full printf "\nsource /opt/ros/noetic/setup.bash\n" >> ~/.bashrc # 1.5 Initialize rosdep sudo rosdep init rosdep update # 1.7 Getting rosinstall (python) sudo apt-get install python3-rosinstall sudo apt-get install python3-catkin-tools sudo apt-get install python3-rospy sudo apt-get install python3-rosdep sudo apt-get install python3-roscd sudo apt-get install python3-pip ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/make_package_cpp.sh ================================================ cd ~/catkin_ws/src catkin_create_pkg midas_cpp std_msgs roscpp cv_bridge sensor_msgs image_transport cd ~/catkin_ws catkin_make chmod +x ~/catkin_ws/devel/setup.bash printf "\nsource ~/catkin_ws/devel/setup.bash" >> ~/.bashrc source ~/catkin_ws/devel/setup.bash sudo rosdep init rosdep update #rospack depends1 midas_cpp roscd midas_cpp #cat package.xml #rospack depends midas_cpp ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/launch_midas_cpp.sh ================================================ source ~/catkin_ws/devel/setup.bash roslaunch midas_cpp midas_cpp.launch model_name:="model-small-traced.pt" input_topic:="image_topic" output_topic:="midas_topic" out_orig_size:="true" ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.0.2) project(midas_cpp) ## Compile as C++11, supported in ROS Kinetic and newer # add_compile_options(-std=c++11) ## Find catkin macros and libraries ## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz) ## is used, also find other catkin packages find_package(catkin REQUIRED COMPONENTS cv_bridge image_transport roscpp rospy sensor_msgs std_msgs ) ## System dependencies are found with CMake's conventions # find_package(Boost REQUIRED COMPONENTS system) list(APPEND CMAKE_PREFIX_PATH "~/libtorch") list(APPEND CMAKE_PREFIX_PATH "/usr/local/lib/python3.6/dist-packages/torch/lib") list(APPEND CMAKE_PREFIX_PATH "/usr/local/lib/python2.7/dist-packages/torch/lib") if(NOT EXISTS "~/libtorch") if (EXISTS "/usr/local/lib/python3.6/dist-packages/torch") include_directories(/usr/local/include) include_directories(/usr/local/lib/python3.6/dist-packages/torch/include/torch/csrc/api/include) include_directories(/usr/local/lib/python3.6/dist-packages/torch/include) link_directories(/usr/local/lib) link_directories(/usr/local/lib/python3.6/dist-packages/torch/lib) set(CMAKE_PREFIX_PATH /usr/local/lib/python3.6/dist-packages/torch) set(Boost_USE_MULTITHREADED ON) set(Torch_DIR /usr/local/lib/python3.6/dist-packages/torch) elseif (EXISTS "/usr/local/lib/python2.7/dist-packages/torch") include_directories(/usr/local/include) include_directories(/usr/local/lib/python2.7/dist-packages/torch/include/torch/csrc/api/include) include_directories(/usr/local/lib/python2.7/dist-packages/torch/include) link_directories(/usr/local/lib) link_directories(/usr/local/lib/python2.7/dist-packages/torch/lib) set(CMAKE_PREFIX_PATH /usr/local/lib/python2.7/dist-packages/torch) set(Boost_USE_MULTITHREADED ON) set(Torch_DIR /usr/local/lib/python2.7/dist-packages/torch) endif() endif() find_package(Torch REQUIRED) find_package(OpenCV REQUIRED) include_directories( ${OpenCV_INCLUDE_DIRS} ) add_executable(midas_cpp src/main.cpp) target_link_libraries(midas_cpp "${TORCH_LIBRARIES}" "${OpenCV_LIBS} ${catkin_LIBRARIES}") set_property(TARGET midas_cpp PROPERTY CXX_STANDARD 14) ################################### ## catkin specific configuration ## ################################### ## The catkin_package macro generates cmake config files for your package ## Declare things to be passed to dependent projects ## INCLUDE_DIRS: uncomment this if your package contains header files ## LIBRARIES: libraries you create in this project that dependent projects also need ## CATKIN_DEPENDS: catkin_packages dependent projects also need ## DEPENDS: system dependencies of this project that dependent projects also need catkin_package( # INCLUDE_DIRS include # LIBRARIES midas_cpp # CATKIN_DEPENDS cv_bridge image_transport roscpp sensor_msgs std_msgs # DEPENDS system_lib ) ########### ## Build ## ########### ## Specify additional locations of header files ## Your package locations should be listed before other locations include_directories( # include ${catkin_INCLUDE_DIRS} ) ## Declare a C++ library # add_library(${PROJECT_NAME} # src/${PROJECT_NAME}/midas_cpp.cpp # ) ## Add cmake target dependencies of the library ## as an example, code may need to be generated before libraries ## either from message generation or dynamic reconfigure # add_dependencies(${PROJECT_NAME} ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) ## Declare a C++ executable ## With catkin_make all packages are built within a single CMake context ## The recommended prefix ensures that target names across packages don't collide # add_executable(${PROJECT_NAME}_node src/midas_cpp_node.cpp) ## Rename C++ executable without prefix ## The above recommended prefix causes long target names, the following renames the ## target back to the shorter version for ease of user use ## e.g. "rosrun someones_pkg node" instead of "rosrun someones_pkg someones_pkg_node" # set_target_properties(${PROJECT_NAME}_node PROPERTIES OUTPUT_NAME node PREFIX "") ## Add cmake target dependencies of the executable ## same as for the library above # add_dependencies(${PROJECT_NAME}_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) ## Specify libraries to link a library or executable target against # target_link_libraries(${PROJECT_NAME}_node # ${catkin_LIBRARIES} # ) ############# ## Install ## ############# # all install targets should use catkin DESTINATION variables # See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html ## Mark executable scripts (Python etc.) for installation ## in contrast to setup.py, you can choose the destination # catkin_install_python(PROGRAMS # scripts/my_python_script # DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} # ) ## Mark executables for installation ## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html # install(TARGETS ${PROJECT_NAME}_node # RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} # ) ## Mark libraries for installation ## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_libraries.html # install(TARGETS ${PROJECT_NAME} # ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} # LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} # RUNTIME DESTINATION ${CATKIN_GLOBAL_BIN_DESTINATION} # ) ## Mark cpp header files for installation # install(DIRECTORY include/${PROJECT_NAME}/ # DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION} # FILES_MATCHING PATTERN "*.h" # PATTERN ".svn" EXCLUDE # ) ## Mark other files for installation (e.g. launch and bag files, etc.) # install(FILES # # myfile1 # # myfile2 # DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} # ) ############# ## Testing ## ############# ## Add gtest based cpp test target and link libraries # catkin_add_gtest(${PROJECT_NAME}-test test/test_midas_cpp.cpp) # if(TARGET ${PROJECT_NAME}-test) # target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME}) # endif() ## Add folders to be run by python nosetests # catkin_add_nosetests(test) install(TARGETS ${PROJECT_NAME} ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} ) add_custom_command( TARGET midas_cpp POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/midas_cpp ${CMAKE_SOURCE_DIR}/midas_cpp ) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_cpp.launch ================================================ ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_talker_listener.launch ================================================ ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/package.xml ================================================ midas_cpp 0.1.0 The midas_cpp package Alexey Bochkovskiy MIT https://github.com/isl-org/MiDaS/tree/master/ros TODO catkin cv_bridge image_transport roscpp rospy sensor_msgs std_msgs cv_bridge image_transport roscpp rospy sensor_msgs std_msgs cv_bridge image_transport roscpp rospy sensor_msgs std_msgs ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener.py ================================================ #!/usr/bin/env python3 from __future__ import print_function import roslib #roslib.load_manifest('my_package') import sys import rospy import cv2 import numpy as np from std_msgs.msg import String from sensor_msgs.msg import Image from cv_bridge import CvBridge, CvBridgeError class video_show: def __init__(self): self.show_output = rospy.get_param('~show_output', True) self.save_output = rospy.get_param('~save_output', False) self.output_video_file = rospy.get_param('~output_video_file','result.mp4') # rospy.loginfo(f"Listener - params: show_output={self.show_output}, save_output={self.save_output}, output_video_file={self.output_video_file}") self.bridge = CvBridge() self.image_sub = rospy.Subscriber("midas_topic", Image, self.callback) def callback(self, data): try: cv_image = self.bridge.imgmsg_to_cv2(data) except CvBridgeError as e: print(e) return if cv_image.size == 0: return rospy.loginfo("Listener: Received new frame") cv_image = cv_image.astype("uint8") if self.show_output==True: cv2.imshow("video_show", cv_image) cv2.waitKey(10) if self.save_output==True: if self.video_writer_init==False: fourcc = cv2.VideoWriter_fourcc(*'XVID') self.out = cv2.VideoWriter(self.output_video_file, fourcc, 25, (cv_image.shape[1], cv_image.shape[0])) self.out.write(cv_image) def main(args): rospy.init_node('listener', anonymous=True) ic = video_show() try: rospy.spin() except KeyboardInterrupt: print("Shutting down") cv2.destroyAllWindows() if __name__ == '__main__': main(sys.argv) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener_original.py ================================================ #!/usr/bin/env python3 from __future__ import print_function import roslib #roslib.load_manifest('my_package') import sys import rospy import cv2 import numpy as np from std_msgs.msg import String from sensor_msgs.msg import Image from cv_bridge import CvBridge, CvBridgeError class video_show: def __init__(self): self.show_output = rospy.get_param('~show_output', True) self.save_output = rospy.get_param('~save_output', False) self.output_video_file = rospy.get_param('~output_video_file','result.mp4') # rospy.loginfo(f"Listener original - params: show_output={self.show_output}, save_output={self.save_output}, output_video_file={self.output_video_file}") self.bridge = CvBridge() self.image_sub = rospy.Subscriber("image_topic", Image, self.callback) def callback(self, data): try: cv_image = self.bridge.imgmsg_to_cv2(data) except CvBridgeError as e: print(e) return if cv_image.size == 0: return rospy.loginfo("Listener_original: Received new frame") cv_image = cv_image.astype("uint8") if self.show_output==True: cv2.imshow("video_show_orig", cv_image) cv2.waitKey(10) if self.save_output==True: if self.video_writer_init==False: fourcc = cv2.VideoWriter_fourcc(*'XVID') self.out = cv2.VideoWriter(self.output_video_file, fourcc, 25, (cv_image.shape[1], cv_image.shape[0])) self.out.write(cv_image) def main(args): rospy.init_node('listener_original', anonymous=True) ic = video_show() try: rospy.spin() except KeyboardInterrupt: print("Shutting down") cv2.destroyAllWindows() if __name__ == '__main__': main(sys.argv) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/talker.py ================================================ #!/usr/bin/env python3 import roslib #roslib.load_manifest('my_package') import sys import rospy import cv2 from std_msgs.msg import String from sensor_msgs.msg import Image from cv_bridge import CvBridge, CvBridgeError def talker(): rospy.init_node('talker', anonymous=True) use_camera = rospy.get_param('~use_camera', False) input_video_file = rospy.get_param('~input_video_file','test.mp4') # rospy.loginfo(f"Talker - params: use_camera={use_camera}, input_video_file={input_video_file}") # rospy.loginfo("Talker: Trying to open a video stream") if use_camera == True: cap = cv2.VideoCapture(0) else: cap = cv2.VideoCapture(input_video_file) pub = rospy.Publisher('image_topic', Image, queue_size=1) rate = rospy.Rate(30) # 30hz bridge = CvBridge() while not rospy.is_shutdown(): ret, cv_image = cap.read() if ret==False: print("Talker: Video is over") rospy.loginfo("Video is over") return try: image = bridge.cv2_to_imgmsg(cv_image, "bgr8") except CvBridgeError as e: rospy.logerr("Talker: cv2image conversion failed: ", e) print(e) continue rospy.loginfo("Talker: Publishing frame") pub.publish(image) rate.sleep() if __name__ == '__main__': try: talker() except rospy.ROSInterruptException: pass ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/src/main.cpp ================================================ #include #include #include #include #include #include // One-stop header. #include #include #include #include #include #include // includes for OpenCV >= 3.x #ifndef CV_VERSION_EPOCH #include #include #include #endif // OpenCV includes for OpenCV 2.x #ifdef CV_VERSION_EPOCH #include #include #include #include #endif static const std::string OPENCV_WINDOW = "Image window"; class Midas { ros::NodeHandle nh_; image_transport::ImageTransport it_; image_transport::Subscriber image_sub_; image_transport::Publisher image_pub_; torch::jit::script::Module module; torch::Device device; auto ToTensor(cv::Mat img, bool show_output = false, bool unsqueeze = false, int unsqueeze_dim = 0) { //std::cout << "image shape: " << img.size() << std::endl; at::Tensor tensor_image = torch::from_blob(img.data, { img.rows, img.cols, 3 }, at::kByte); if (unsqueeze) { tensor_image.unsqueeze_(unsqueeze_dim); //std::cout << "tensors new shape: " << tensor_image.sizes() << std::endl; } if (show_output) { std::cout << tensor_image.slice(2, 0, 1) << std::endl; } //std::cout << "tenor shape: " << tensor_image.sizes() << std::endl; return tensor_image; } auto ToInput(at::Tensor tensor_image) { // Create a vector of inputs. return std::vector{tensor_image}; } auto ToCvImage(at::Tensor tensor, int cv_type = CV_8UC3) { int width = tensor.sizes()[0]; int height = tensor.sizes()[1]; try { cv::Mat output_mat; if (cv_type == CV_8UC4 || cv_type == CV_8UC3 || cv_type == CV_8UC2 || cv_type == CV_8UC1) { cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr()); output_mat = cv_image; } else if (cv_type == CV_32FC4 || cv_type == CV_32FC3 || cv_type == CV_32FC2 || cv_type == CV_32FC1) { cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr()); output_mat = cv_image; } else if (cv_type == CV_64FC4 || cv_type == CV_64FC3 || cv_type == CV_64FC2 || cv_type == CV_64FC1) { cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr()); output_mat = cv_image; } //show_image(output_mat, "converted image from tensor"); return output_mat.clone(); } catch (const c10::Error& e) { std::cout << "an error has occured : " << e.msg() << std::endl; } return cv::Mat(height, width, CV_8UC3); } std::string input_topic, output_topic, model_name; bool out_orig_size; int net_width, net_height; torch::NoGradGuard guard; at::Tensor mean, std; at::Tensor output, tensor; public: Midas() : nh_(), it_(nh_), device(torch::Device(torch::kCPU)) { ros::param::param("~input_topic", input_topic, "image_topic"); ros::param::param("~output_topic", output_topic, "midas_topic"); ros::param::param("~model_name", model_name, "model-small-traced.pt"); ros::param::param("~out_orig_size", out_orig_size, true); ros::param::param("~net_width", net_width, 256); ros::param::param("~net_height", net_height, 256); std::cout << ", input_topic = " << input_topic << ", output_topic = " << output_topic << ", model_name = " << model_name << ", out_orig_size = " << out_orig_size << ", net_width = " << net_width << ", net_height = " << net_height << std::endl; // Subscrive to input video feed and publish output video feed image_sub_ = it_.subscribe(input_topic, 1, &Midas::imageCb, this); image_pub_ = it_.advertise(output_topic, 1); std::cout << "Try to load torchscript model \n"; try { // Deserialize the ScriptModule from a file using torch::jit::load(). module = torch::jit::load(model_name); } catch (const c10::Error& e) { std::cerr << "error loading the model\n"; exit(0); } std::cout << "ok\n"; try { module.eval(); torch::jit::getProfilingMode() = false; torch::jit::setGraphExecutorOptimize(true); mean = torch::tensor({ 0.485, 0.456, 0.406 }); std = torch::tensor({ 0.229, 0.224, 0.225 }); if (torch::hasCUDA()) { std::cout << "cuda is available" << std::endl; at::globalContext().setBenchmarkCuDNN(true); device = torch::Device(torch::kCUDA); module.to(device); mean = mean.to(device); std = std.to(device); } } catch (const c10::Error& e) { std::cerr << " module initialization: " << e.msg() << std::endl; } } ~Midas() { } void imageCb(const sensor_msgs::ImageConstPtr& msg) { cv_bridge::CvImagePtr cv_ptr; try { // sensor_msgs::Image to cv::Mat cv_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::RGB8); } catch (cv_bridge::Exception& e) { ROS_ERROR("cv_bridge exception: %s", e.what()); return; } // pre-processing auto tensor_cpu = ToTensor(cv_ptr->image); // OpenCV-image -> Libtorch-tensor try { tensor = tensor_cpu.to(device); // move to device (CPU or GPU) tensor = tensor.toType(c10::kFloat); tensor = tensor.permute({ 2, 0, 1 }); // HWC -> CHW tensor = tensor.unsqueeze(0); tensor = at::upsample_bilinear2d(tensor, { net_height, net_width }, true); // resize tensor = tensor.squeeze(0); tensor = tensor.permute({ 1, 2, 0 }); // CHW -> HWC tensor = tensor.div(255).sub(mean).div(std); // normalization tensor = tensor.permute({ 2, 0, 1 }); // HWC -> CHW tensor.unsqueeze_(0); // CHW -> NCHW } catch (const c10::Error& e) { std::cerr << " pre-processing exception: " << e.msg() << std::endl; return; } auto input_to_net = ToInput(tensor); // input to the network // inference output; try { output = module.forward(input_to_net).toTensor(); // run inference } catch (const c10::Error& e) { std::cerr << " module.forward() exception: " << e.msg() << std::endl; return; } output = output.detach().to(torch::kF32); // move to CPU temporary at::Tensor output_tmp = output; output_tmp = output_tmp.to(torch::kCPU); // normalization float min_val = std::numeric_limits::max(); float max_val = std::numeric_limits::min(); for (int i = 0; i < net_width * net_height; ++i) { float val = output_tmp.data_ptr()[i]; if (min_val > val) min_val = val; if (max_val < val) max_val = val; } float range_val = max_val - min_val; output = output.sub(min_val).div(range_val).mul(255.0F).clamp(0, 255).to(torch::kF32); // .to(torch::kU8); // resize to the original size if required if (out_orig_size) { try { output = at::upsample_bilinear2d(output.unsqueeze(0), { cv_ptr->image.size().height, cv_ptr->image.size().width }, true); output = output.squeeze(0); } catch (const c10::Error& e) { std::cout << " upsample_bilinear2d() exception: " << e.msg() << std::endl; return; } } output = output.permute({ 1, 2, 0 }).to(torch::kCPU); int cv_type = CV_32FC1; // CV_8UC1; auto cv_img = ToCvImage(output, cv_type); sensor_msgs::Image img_msg; try { // cv::Mat -> sensor_msgs::Image std_msgs::Header header; // empty header header.seq = 0; // user defined counter header.stamp = ros::Time::now();// time //cv_bridge::CvImage img_bridge = cv_bridge::CvImage(header, sensor_msgs::image_encodings::MONO8, cv_img); cv_bridge::CvImage img_bridge = cv_bridge::CvImage(header, sensor_msgs::image_encodings::TYPE_32FC1, cv_img); img_bridge.toImageMsg(img_msg); // cv_bridge -> sensor_msgs::Image } catch (cv_bridge::Exception& e) { ROS_ERROR("cv_bridge exception: %s", e.what()); return; } // Output modified video stream image_pub_.publish(img_msg); } }; int main(int argc, char** argv) { ros::init(argc, argv, "midas", ros::init_options::AnonymousName); Midas ic; ros::spin(); return 0; } ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/ros/run_talker_listener_test.sh ================================================ # place any test.mp4 file near with this file # roscore # rosnode kill -a source ~/catkin_ws/devel/setup.bash roscore & P1=$! rosrun midas_cpp talker.py & P2=$! rosrun midas_cpp listener_original.py & P3=$! rosrun midas_cpp listener.py & P4=$! wait $P1 $P2 $P3 $P4 ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/run.py ================================================ """Compute depth maps for images in the input folder. """ import os import glob import torch import utils import cv2 import argparse import time import numpy as np from imutils.video import VideoStream from midas.model_loader import default_models, load_model from modules import devices first_execution = True def process(device, model, model_type, image, input_size, target_size, optimize, use_camera): """ Run the inference and interpolate. Args: device (torch.device): the torch device used model: the model used for inference model_type: the type of the model image: the image fed into the neural network input_size: the size (width, height) of the neural network input (for OpenVINO) target_size: the size (width, height) the neural network output is interpolated to optimize: optimize the model to half-floats on CUDA? use_camera: is the camera used? Returns: the prediction """ global first_execution if "openvino" in model_type: if first_execution or not use_camera: print(f" Input resized to {input_size[0]}x{input_size[1]} before entering the encoder") first_execution = False sample = [np.reshape(image, (1, 3, *input_size))] prediction = model(sample)[model.output(0)][0] prediction = cv2.resize(prediction, dsize=target_size, interpolation=cv2.INTER_CUBIC) else: sample = torch.from_numpy(image).to(device).unsqueeze(0) if optimize and device == torch.device("cuda"): if first_execution: print(" Optimization to half-floats activated. Use with caution, because models like Swin require\n" " float precision to work properly and may yield non-finite depth values to some extent for\n" " half-floats.") sample = sample.to(memory_format=torch.channels_last) sample = sample.half() if first_execution or not use_camera: height, width = sample.shape[2:] print(f" Input resized to {width}x{height} before entering the encoder") first_execution = False prediction = model.forward(sample) prediction = ( torch.nn.functional.interpolate( prediction.unsqueeze(1), size=target_size[::-1], mode="bicubic", align_corners=False, ) .squeeze() .cpu() .numpy() ) return prediction def create_side_by_side(image, depth, grayscale): """ Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth map for better visibility. Args: image: the RGB image depth: the depth map grayscale: use a grayscale colormap? Returns: the image and depth map place side by side """ depth_min = depth.min() depth_max = depth.max() normalized_depth = 255 * (depth - depth_min) / (depth_max - depth_min) normalized_depth *= 3 right_side = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3 if not grayscale: right_side = cv2.applyColorMap(np.uint8(right_side), cv2.COLORMAP_INFERNO) if image is None: return right_side else: return np.concatenate((image, right_side), axis=1) def run(input_path, output_path, model_path, model_type="dpt_beit_large_512", optimize=False, side=False, height=None, square=False, grayscale=False): """Run MonoDepthNN to compute depth maps. Args: input_path (str): path to input folder output_path (str): path to output folder model_path (str): path to saved model model_type (str): the model type optimize (bool): optimize the model to half-floats on CUDA? side (bool): RGB and depth side by side in output images? height (int): inference encoder image height square (bool): resize to a square resolution? grayscale (bool): use a grayscale colormap? """ print("Initialize") # select device device = devices.get_device_for("controlnet") if torch.cuda.is_available() else torch.device("cpu") print("Device: %s" % device) model, transform, net_w, net_h = load_model(device, model_path, model_type, optimize, height, square) # get input if input_path is not None: image_names = glob.glob(os.path.join(input_path, "*")) num_images = len(image_names) else: print("No input path specified. Grabbing images from camera.") # create output folder if output_path is not None: os.makedirs(output_path, exist_ok=True) print("Start processing") if input_path is not None: if output_path is None: print("Warning: No output path specified. Images will be processed but not shown or stored anywhere.") for index, image_name in enumerate(image_names): print(" Processing {} ({}/{})".format(image_name, index + 1, num_images)) # input original_image_rgb = utils.read_image(image_name) # in [0, 1] image = transform({"image": original_image_rgb})["image"] # compute with torch.no_grad(): prediction = process(device, model, model_type, image, (net_w, net_h), original_image_rgb.shape[1::-1], optimize, False) # output if output_path is not None: filename = os.path.join( output_path, os.path.splitext(os.path.basename(image_name))[0] + '-' + model_type ) if not side: utils.write_depth(filename, prediction, grayscale, bits=2) else: original_image_bgr = np.flip(original_image_rgb, 2) content = create_side_by_side(original_image_bgr*255, prediction, grayscale) cv2.imwrite(filename + ".png", content) utils.write_pfm(filename + ".pfm", prediction.astype(np.float32)) else: with torch.no_grad(): fps = 1 video = VideoStream(0).start() time_start = time.time() frame_index = 0 while True: frame = video.read() if frame is not None: original_image_rgb = np.flip(frame, 2) # in [0, 255] (flip required to get RGB) image = transform({"image": original_image_rgb/255})["image"] prediction = process(device, model, model_type, image, (net_w, net_h), original_image_rgb.shape[1::-1], optimize, True) original_image_bgr = np.flip(original_image_rgb, 2) if side else None content = create_side_by_side(original_image_bgr, prediction, grayscale) cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', content/255) if output_path is not None: filename = os.path.join(output_path, 'Camera' + '-' + model_type + '_' + str(frame_index)) cv2.imwrite(filename + ".png", content) alpha = 0.1 if time.time()-time_start > 0: fps = (1 - alpha) * fps + alpha * 1 / (time.time()-time_start) # exponential moving average time_start = time.time() print(f"\rFPS: {round(fps,2)}", end="") if cv2.waitKey(1) == 27: # Escape key break frame_index += 1 print() print("Finished") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-i', '--input_path', default=None, help='Folder with input images (if no input path is specified, images are tried to be grabbed ' 'from camera)' ) parser.add_argument('-o', '--output_path', default=None, help='Folder for output images' ) parser.add_argument('-m', '--model_weights', default=None, help='Path to the trained weights of model' ) parser.add_argument('-t', '--model_type', default='dpt_beit_large_512', help='Model type: ' 'dpt_beit_large_512, dpt_beit_large_384, dpt_beit_base_384, dpt_swin2_large_384, ' 'dpt_swin2_base_384, dpt_swin2_tiny_256, dpt_swin_large_384, dpt_next_vit_large_384, ' 'dpt_levit_224, dpt_large_384, dpt_hybrid_384, midas_v21_384, midas_v21_small_256 or ' 'openvino_midas_v21_small_256' ) parser.add_argument('-s', '--side', action='store_true', help='Output images contain RGB and depth images side by side' ) parser.add_argument('--optimize', dest='optimize', action='store_true', help='Use half-float optimization') parser.set_defaults(optimize=False) parser.add_argument('--height', type=int, default=None, help='Preferred height of images feed into the encoder during inference. Note that the ' 'preferred height may differ from the actual height, because an alignment to multiples of ' '32 takes place. Many models support only the height chosen during training, which is ' 'used automatically if this parameter is not set.' ) parser.add_argument('--square', action='store_true', help='Option to resize images to a square resolution by changing their widths when images are ' 'fed into the encoder during inference. If this parameter is not set, the aspect ratio of ' 'images is tried to be preserved if supported by the model.' ) parser.add_argument('--grayscale', action='store_true', help='Use a grayscale colormap instead of the inferno one. Although the inferno colormap, ' 'which is used by default, is better for visibility, it does not allow storing 16-bit ' 'depth values in PNGs but only 8-bit ones due to the precision limitation of this ' 'colormap.' ) args = parser.parse_args() if args.model_weights is None: args.model_weights = default_models[args.model_type] # set torch options torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True # compute depth maps run(args.input_path, args.output_path, args.model_weights, args.model_type, args.optimize, args.side, args.height, args.square, args.grayscale) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/tf/README.md ================================================ ## Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer ### TensorFlow inference using `.pb` and `.onnx` models 1. [Run inference on TensorFlow-model by using TensorFlow](#run-inference-on-tensorflow-model-by-using-tensorFlow) 2. [Run inference on ONNX-model by using TensorFlow](#run-inference-on-onnx-model-by-using-tensorflow) 3. [Make ONNX model from downloaded Pytorch model file](#make-onnx-model-from-downloaded-pytorch-model-file) ### Run inference on TensorFlow-model by using TensorFlow 1) Download the model weights [model-f6b98070.pb](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.pb) and [model-small.pb](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small.pb) and place the file in the `/tf/` folder. 2) Set up dependencies: ```shell # install OpenCV pip install --upgrade pip pip install opencv-python # install TensorFlow pip install -I grpcio tensorflow==2.3.0 tensorflow-addons==0.11.2 numpy==1.18.0 ``` #### Usage 1) Place one or more input images in the folder `tf/input`. 2) Run the model: ```shell python tf/run_pb.py ``` Or run the small model: ```shell python tf/run_pb.py --model_weights model-small.pb --model_type small ``` 3) The resulting inverse depth maps are written to the `tf/output` folder. ### Run inference on ONNX-model by using ONNX-Runtime 1) Download the model weights [model-f6b98070.onnx](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.onnx) and [model-small.onnx](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small.onnx) and place the file in the `/tf/` folder. 2) Set up dependencies: ```shell # install OpenCV pip install --upgrade pip pip install opencv-python # install ONNX pip install onnx==1.7.0 # install ONNX Runtime pip install onnxruntime==1.5.2 ``` #### Usage 1) Place one or more input images in the folder `tf/input`. 2) Run the model: ```shell python tf/run_onnx.py ``` Or run the small model: ```shell python tf/run_onnx.py --model_weights model-small.onnx --model_type small ``` 3) The resulting inverse depth maps are written to the `tf/output` folder. ### Make ONNX model from downloaded Pytorch model file 1) Download the model weights [model-f6b98070.pt](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.pt) and place the file in the root folder. 2) Set up dependencies: ```shell # install OpenCV pip install --upgrade pip pip install opencv-python # install PyTorch TorchVision pip install -I torch==1.7.0 torchvision==0.8.0 # install TensorFlow pip install -I grpcio tensorflow==2.3.0 tensorflow-addons==0.11.2 numpy==1.18.0 # install ONNX pip install onnx==1.7.0 # install ONNX-TensorFlow git clone https://github.com/onnx/onnx-tensorflow.git cd onnx-tensorflow git checkout 095b51b88e35c4001d70f15f80f31014b592b81e pip install -e . ``` #### Usage 1) Run the converter: ```shell python tf/make_onnx_model.py ``` 2) The resulting `model-f6b98070.onnx` file is written to the `/tf/` folder. ### Requirements The code was tested with Python 3.6.9, PyTorch 1.5.1, TensorFlow 2.2.0, TensorFlow-addons 0.8.3, ONNX 1.7.0, ONNX-TensorFlow (GitHub-master-17.07.2020) and OpenCV 4.3.0. ### Citation Please cite our paper if you use this code or any of the models: ``` @article{Ranftl2019, author = {Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun}, title = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)}, year = {2020}, } ``` ### License MIT License ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/tf/input/.placeholder ================================================ ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/tf/make_onnx_model.py ================================================ """Compute depth maps for images in the input folder. """ import os import ntpath import glob import torch import utils import cv2 import numpy as np from torchvision.transforms import Compose, Normalize from torchvision import transforms from shutil import copyfile import fileinput import sys sys.path.append(os.getcwd() + '/..') def modify_file(): modify_filename = '../midas/blocks.py' copyfile(modify_filename, modify_filename+'.bak') with open(modify_filename, 'r') as file : filedata = file.read() filedata = filedata.replace('align_corners=True', 'align_corners=False') filedata = filedata.replace('import torch.nn as nn', 'import torch.nn as nn\nimport torchvision.models as models') filedata = filedata.replace('torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")', 'models.resnext101_32x8d()') with open(modify_filename, 'w') as file: file.write(filedata) def restore_file(): modify_filename = '../midas/blocks.py' copyfile(modify_filename+'.bak', modify_filename) modify_file() from midas.midas_net import MidasNet from midas.transforms import Resize, NormalizeImage, PrepareForNet restore_file() class MidasNet_preprocessing(MidasNet): """Network for monocular depth estimation. """ def forward(self, x): """Forward pass. Args: x (tensor): input data (image) Returns: tensor: depth """ mean = torch.tensor([0.485, 0.456, 0.406]) std = torch.tensor([0.229, 0.224, 0.225]) x.sub_(mean[None, :, None, None]).div_(std[None, :, None, None]) return MidasNet.forward(self, x) def run(model_path): """Run MonoDepthNN to compute depth maps. Args: model_path (str): path to saved model """ print("initialize") # select device # load network #model = MidasNet(model_path, non_negative=True) model = MidasNet_preprocessing(model_path, non_negative=True) model.eval() print("start processing") # input img_input = np.zeros((3, 384, 384), np.float32) # compute with torch.no_grad(): sample = torch.from_numpy(img_input).unsqueeze(0) prediction = model.forward(sample) prediction = ( torch.nn.functional.interpolate( prediction.unsqueeze(1), size=img_input.shape[:2], mode="bicubic", align_corners=False, ) .squeeze() .cpu() .numpy() ) torch.onnx.export(model, sample, ntpath.basename(model_path).rsplit('.', 1)[0]+'.onnx', opset_version=9) print("finished") if __name__ == "__main__": # set paths # MODEL_PATH = "model.pt" MODEL_PATH = "../model-f6b98070.pt" # compute depth maps run(MODEL_PATH) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/tf/output/.placeholder ================================================ ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_onnx.py ================================================ """Compute depth maps for images in the input folder. """ import os import glob import utils import cv2 import sys import numpy as np import argparse import onnx import onnxruntime as rt from transforms import Resize, NormalizeImage, PrepareForNet def run(input_path, output_path, model_path, model_type="large"): """Run MonoDepthNN to compute depth maps. Args: input_path (str): path to input folder output_path (str): path to output folder model_path (str): path to saved model """ print("initialize") # select device device = "CUDA:0" #device = "CPU" print("device: %s" % device) # network resolution if model_type == "large": net_w, net_h = 384, 384 elif model_type == "small": net_w, net_h = 256, 256 else: print(f"model_type '{model_type}' not implemented, use: --model_type large") assert False # load network print("loading model...") model = rt.InferenceSession(model_path) input_name = model.get_inputs()[0].name output_name = model.get_outputs()[0].name resize_image = Resize( net_w, net_h, resize_target=None, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="upper_bound", image_interpolation_method=cv2.INTER_CUBIC, ) def compose2(f1, f2): return lambda x: f2(f1(x)) transform = compose2(resize_image, PrepareForNet()) # get input img_names = glob.glob(os.path.join(input_path, "*")) num_images = len(img_names) # create output folder os.makedirs(output_path, exist_ok=True) print("start processing") for ind, img_name in enumerate(img_names): print(" processing {} ({}/{})".format(img_name, ind + 1, num_images)) # input img = utils.read_image(img_name) img_input = transform({"image": img})["image"] # compute output = model.run([output_name], {input_name: img_input.reshape(1, 3, net_h, net_w).astype(np.float32)})[0] prediction = np.array(output).reshape(net_h, net_w) prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) # output filename = os.path.join( output_path, os.path.splitext(os.path.basename(img_name))[0] ) utils.write_depth(filename, prediction, bits=2) print("finished") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-i', '--input_path', default='input', help='folder with input images' ) parser.add_argument('-o', '--output_path', default='output', help='folder for output images' ) parser.add_argument('-m', '--model_weights', default='model-f6b98070.onnx', help='path to the trained weights of model' ) parser.add_argument('-t', '--model_type', default='large', help='model type: large or small' ) args = parser.parse_args() # compute depth maps run(args.input_path, args.output_path, args.model_weights, args.model_type) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_pb.py ================================================ """Compute depth maps for images in the input folder. """ import os import glob import utils import cv2 import argparse import tensorflow as tf from transforms import Resize, NormalizeImage, PrepareForNet def run(input_path, output_path, model_path, model_type="large"): """Run MonoDepthNN to compute depth maps. Args: input_path (str): path to input folder output_path (str): path to output folder model_path (str): path to saved model """ print("initialize") # the runtime initialization will not allocate all memory on the device to avoid out of GPU memory gpus = tf.config.experimental.list_physical_devices('GPU') if gpus: try: for gpu in gpus: #tf.config.experimental.set_memory_growth(gpu, True) tf.config.experimental.set_virtual_device_configuration(gpu, [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)]) except RuntimeError as e: print(e) # network resolution if model_type == "large": net_w, net_h = 384, 384 elif model_type == "small": net_w, net_h = 256, 256 else: print(f"model_type '{model_type}' not implemented, use: --model_type large") assert False # load network graph_def = tf.compat.v1.GraphDef() with tf.io.gfile.GFile(model_path, 'rb') as f: graph_def.ParseFromString(f.read()) tf.import_graph_def(graph_def, name='') model_operations = tf.compat.v1.get_default_graph().get_operations() input_node = '0:0' output_layer = model_operations[len(model_operations) - 1].name + ':0' print("Last layer name: ", output_layer) resize_image = Resize( net_w, net_h, resize_target=None, keep_aspect_ratio=False, ensure_multiple_of=32, resize_method="upper_bound", image_interpolation_method=cv2.INTER_CUBIC, ) def compose2(f1, f2): return lambda x: f2(f1(x)) transform = compose2(resize_image, PrepareForNet()) # get input img_names = glob.glob(os.path.join(input_path, "*")) num_images = len(img_names) # create output folder os.makedirs(output_path, exist_ok=True) print("start processing") with tf.compat.v1.Session() as sess: try: # load images for ind, img_name in enumerate(img_names): print(" processing {} ({}/{})".format(img_name, ind + 1, num_images)) # input img = utils.read_image(img_name) img_input = transform({"image": img})["image"] # compute prob_tensor = sess.graph.get_tensor_by_name(output_layer) prediction, = sess.run(prob_tensor, {input_node: [img_input] }) prediction = prediction.reshape(net_h, net_w) prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) # output filename = os.path.join( output_path, os.path.splitext(os.path.basename(img_name))[0] ) utils.write_depth(filename, prediction, bits=2) except KeyError: print ("Couldn't find input node: ' + input_node + ' or output layer: " + output_layer + ".") exit(-1) print("finished") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-i', '--input_path', default='input', help='folder with input images' ) parser.add_argument('-o', '--output_path', default='output', help='folder for output images' ) parser.add_argument('-m', '--model_weights', default='model-f6b98070.pb', help='path to the trained weights of model' ) parser.add_argument('-t', '--model_type', default='large', help='model type: large or small' ) args = parser.parse_args() # compute depth maps run(args.input_path, args.output_path, args.model_weights, args.model_type) ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/tf/transforms.py ================================================ import numpy as np import cv2 import math def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): """Rezise the sample to ensure the given size. Keeps aspect ratio. Args: sample (dict): sample size (tuple): image size Returns: tuple: new size """ shape = list(sample["disparity"].shape) if shape[0] >= size[0] and shape[1] >= size[1]: return sample scale = [0, 0] scale[0] = size[0] / shape[0] scale[1] = size[1] / shape[1] scale = max(scale) shape[0] = math.ceil(scale * shape[0]) shape[1] = math.ceil(scale * shape[1]) # resize sample["image"] = cv2.resize( sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method ) sample["disparity"] = cv2.resize( sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST ) sample["mask"] = cv2.resize( sample["mask"].astype(np.float32), tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST, ) sample["mask"] = sample["mask"].astype(bool) return tuple(shape) class Resize(object): """Resize sample to given size (width, height). """ def __init__( self, width, height, resize_target=True, keep_aspect_ratio=False, ensure_multiple_of=1, resize_method="lower_bound", image_interpolation_method=cv2.INTER_AREA, ): """Init. Args: width (int): desired output width height (int): desired output height resize_target (bool, optional): True: Resize the full sample (image, mask, target). False: Resize image only. Defaults to True. keep_aspect_ratio (bool, optional): True: Keep the aspect ratio of the input sample. Output sample might not have the given width and height, and resize behaviour depends on the parameter 'resize_method'. Defaults to False. ensure_multiple_of (int, optional): Output width and height is constrained to be multiple of this parameter. Defaults to 1. resize_method (str, optional): "lower_bound": Output will be at least as large as the given size. "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) "minimal": Scale as least as possible. (Output size might be smaller than given size.) Defaults to "lower_bound". """ self.__width = width self.__height = height self.__resize_target = resize_target self.__keep_aspect_ratio = keep_aspect_ratio self.__multiple_of = ensure_multiple_of self.__resize_method = resize_method self.__image_interpolation_method = image_interpolation_method def constrain_to_multiple_of(self, x, min_val=0, max_val=None): y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) if max_val is not None and y > max_val: y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) if y < min_val: y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) return y def get_size(self, width, height): # determine new height and width scale_height = self.__height / height scale_width = self.__width / width if self.__keep_aspect_ratio: if self.__resize_method == "lower_bound": # scale such that output size is lower bound if scale_width > scale_height: # fit width scale_height = scale_width else: # fit height scale_width = scale_height elif self.__resize_method == "upper_bound": # scale such that output size is upper bound if scale_width < scale_height: # fit width scale_height = scale_width else: # fit height scale_width = scale_height elif self.__resize_method == "minimal": # scale as least as possbile if abs(1 - scale_width) < abs(1 - scale_height): # fit width scale_height = scale_width else: # fit height scale_width = scale_height else: raise ValueError( f"resize_method {self.__resize_method} not implemented" ) if self.__resize_method == "lower_bound": new_height = self.constrain_to_multiple_of( scale_height * height, min_val=self.__height ) new_width = self.constrain_to_multiple_of( scale_width * width, min_val=self.__width ) elif self.__resize_method == "upper_bound": new_height = self.constrain_to_multiple_of( scale_height * height, max_val=self.__height ) new_width = self.constrain_to_multiple_of( scale_width * width, max_val=self.__width ) elif self.__resize_method == "minimal": new_height = self.constrain_to_multiple_of(scale_height * height) new_width = self.constrain_to_multiple_of(scale_width * width) else: raise ValueError(f"resize_method {self.__resize_method} not implemented") return (new_width, new_height) def __call__(self, sample): width, height = self.get_size( sample["image"].shape[1], sample["image"].shape[0] ) # resize sample sample["image"] = cv2.resize( sample["image"], (width, height), interpolation=self.__image_interpolation_method, ) if self.__resize_target: if "disparity" in sample: sample["disparity"] = cv2.resize( sample["disparity"], (width, height), interpolation=cv2.INTER_NEAREST, ) if "depth" in sample: sample["depth"] = cv2.resize( sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST ) sample["mask"] = cv2.resize( sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST, ) sample["mask"] = sample["mask"].astype(bool) return sample class NormalizeImage(object): """Normlize image by given mean and std. """ def __init__(self, mean, std): self.__mean = mean self.__std = std def __call__(self, sample): sample["image"] = (sample["image"] - self.__mean) / self.__std return sample class PrepareForNet(object): """Prepare sample for usage as network input. """ def __init__(self): pass def __call__(self, sample): image = np.transpose(sample["image"], (2, 0, 1)) sample["image"] = np.ascontiguousarray(image).astype(np.float32) if "mask" in sample: sample["mask"] = sample["mask"].astype(np.float32) sample["mask"] = np.ascontiguousarray(sample["mask"]) if "disparity" in sample: disparity = sample["disparity"].astype(np.float32) sample["disparity"] = np.ascontiguousarray(disparity) if "depth" in sample: depth = sample["depth"].astype(np.float32) sample["depth"] = np.ascontiguousarray(depth) return sample ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/tf/utils.py ================================================ import numpy as np import sys import cv2 def write_pfm(path, image, scale=1): """Write pfm file. Args: path (str): pathto file image (array): data scale (int, optional): Scale. Defaults to 1. """ with open(path, "wb") as file: color = None if image.dtype.name != "float32": raise Exception("Image dtype must be float32.") image = np.flipud(image) if len(image.shape) == 3 and image.shape[2] == 3: # color image color = True elif ( len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 ): # greyscale color = False else: raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") file.write("PF\n" if color else "Pf\n".encode()) file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) endian = image.dtype.byteorder if endian == "<" or endian == "=" and sys.byteorder == "little": scale = -scale file.write("%f\n".encode() % scale) image.tofile(file) def read_image(path): """Read image and output RGB image (0-1). Args: path (str): path to file Returns: array: RGB image (0-1) """ img = cv2.imread(path) if img.ndim == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 return img def write_depth(path, depth, bits=1): """Write depth map to pfm and png file. Args: path (str): filepath without extension depth (array): depth """ write_pfm(path + ".pfm", depth.astype(np.float32)) depth_min = depth.min() depth_max = depth.max() max_val = (2**(8*bits))-1 if depth_max - depth_min > np.finfo("float").eps: out = max_val * (depth - depth_min) / (depth_max - depth_min) else: out = 0 if bits == 1: cv2.imwrite(path + ".png", out.astype("uint8")) elif bits == 2: cv2.imwrite(path + ".png", out.astype("uint16")) return ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/utils.py ================================================ """Utils for monoDepth. """ import sys import re import numpy as np import cv2 import torch def read_pfm(path): """Read pfm file. Args: path (str): path to file Returns: tuple: (data, scale) """ with open(path, "rb") as file: color = None width = None height = None scale = None endian = None header = file.readline().rstrip() if header.decode("ascii") == "PF": color = True elif header.decode("ascii") == "Pf": color = False else: raise Exception("Not a PFM file: " + path) dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) if dim_match: width, height = list(map(int, dim_match.groups())) else: raise Exception("Malformed PFM header.") scale = float(file.readline().decode("ascii").rstrip()) if scale < 0: # little-endian endian = "<" scale = -scale else: # big-endian endian = ">" data = np.fromfile(file, endian + "f") shape = (height, width, 3) if color else (height, width) data = np.reshape(data, shape) data = np.flipud(data) return data, scale def write_pfm(path, image, scale=1): """Write pfm file. Args: path (str): pathto file image (array): data scale (int, optional): Scale. Defaults to 1. """ with open(path, "wb") as file: color = None if image.dtype.name != "float32": raise Exception("Image dtype must be float32.") image = np.flipud(image) if len(image.shape) == 3 and image.shape[2] == 3: # color image color = True elif ( len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 ): # greyscale color = False else: raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") file.write("PF\n" if color else "Pf\n".encode()) file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) endian = image.dtype.byteorder if endian == "<" or endian == "=" and sys.byteorder == "little": scale = -scale file.write("%f\n".encode() % scale) image.tofile(file) def read_image(path): """Read image and output RGB image (0-1). Args: path (str): path to file Returns: array: RGB image (0-1) """ img = cv2.imread(path) if img.ndim == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 return img def resize_image(img): """Resize image and make it fit for network. Args: img (array): image Returns: tensor: data ready for network """ height_orig = img.shape[0] width_orig = img.shape[1] if width_orig > height_orig: scale = width_orig / 384 else: scale = height_orig / 384 height = (np.ceil(height_orig / scale / 32) * 32).astype(int) width = (np.ceil(width_orig / scale / 32) * 32).astype(int) img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) img_resized = ( torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float() ) img_resized = img_resized.unsqueeze(0) return img_resized def resize_depth(depth, width, height): """Resize depth map and bring to CPU (numpy). Args: depth (tensor): depth width (int): image width height (int): image height Returns: array: processed depth """ depth = torch.squeeze(depth[0, :, :, :]).to("cpu") depth_resized = cv2.resize( depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC ) return depth_resized def write_depth(path, depth, grayscale, bits=1): """Write depth map to png file. Args: path (str): filepath without extension depth (array): depth grayscale (bool): use a grayscale colormap? """ if not grayscale: bits = 1 if not np.isfinite(depth).all(): depth=np.nan_to_num(depth, nan=0.0, posinf=0.0, neginf=0.0) print("WARNING: Non-finite depth values present") depth_min = depth.min() depth_max = depth.max() max_val = (2**(8*bits))-1 if depth_max - depth_min > np.finfo("float").eps: out = max_val * (depth - depth_min) / (depth_max - depth_min) else: out = np.zeros(depth.shape, dtype=depth.dtype) if not grayscale: out = cv2.applyColorMap(np.uint8(out), cv2.COLORMAP_INFERNO) if bits == 1: cv2.imwrite(path + ".png", out.astype("uint8")) elif bits == 2: cv2.imwrite(path + ".png", out.astype("uint16")) return ================================================ FILE: annotator/zoe/zoedepth/models/base_models/midas_repo/weights/.placeholder ================================================ ================================================ FILE: annotator/zoe/zoedepth/models/builder.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat from importlib import import_module from .depth_model import DepthModel def build_model(config) -> DepthModel: """Builds a model from a config. The model is specified by the model name and version in the config. The model is then constructed using the build_from_config function of the model interface. This function should be used to construct models for training and evaluation. Args: config (dict): Config dict. Config is constructed in utils/config.py. Each model has its own config file(s) saved in its root model folder. Returns: torch.nn.Module: Model corresponding to name and version as specified in config """ module_name = f"zoedepth.models.{config.model}" try: module = import_module(module_name) except ModuleNotFoundError as e: # print the original error message print(e) raise ValueError( f"Model {config.model} not found. Refer above error for details.") from e try: get_version = getattr(module, "get_version") except AttributeError as e: raise ValueError( f"Model {config.model} has no get_version function.") from e return get_version(config.version_name).build_from_config(config) ================================================ FILE: annotator/zoe/zoedepth/models/depth_model.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from torchvision import transforms import PIL.Image from PIL import Image from typing import Union class DepthModel(nn.Module): def __init__(self): super().__init__() self.device = 'cpu' def to(self, device) -> nn.Module: self.device = device return super().to(device) def forward(self, x, *args, **kwargs): raise NotImplementedError def _infer(self, x: torch.Tensor): """ Inference interface for the model Args: x (torch.Tensor): input tensor of shape (b, c, h, w) Returns: torch.Tensor: output tensor of shape (b, 1, h, w) """ return self(x)['metric_depth'] def _infer_with_pad_aug(self, x: torch.Tensor, pad_input: bool=True, fh: float=3, fw: float=3, upsampling_mode: str='bicubic', padding_mode="reflect", **kwargs) -> torch.Tensor: """ Inference interface for the model with padding augmentation Padding augmentation fixes the boundary artifacts in the output depth map. Boundary artifacts are sometimes caused by the fact that the model is trained on NYU raw dataset which has a black or white border around the image. This augmentation pads the input image and crops the prediction back to the original size / view. Note: This augmentation is not required for the models trained with 'avoid_boundary'=True. Args: x (torch.Tensor): input tensor of shape (b, c, h, w) pad_input (bool, optional): whether to pad the input or not. Defaults to True. fh (float, optional): height padding factor. The padding is calculated as sqrt(h/2) * fh. Defaults to 3. fw (float, optional): width padding factor. The padding is calculated as sqrt(w/2) * fw. Defaults to 3. upsampling_mode (str, optional): upsampling mode. Defaults to 'bicubic'. padding_mode (str, optional): padding mode. Defaults to "reflect". Returns: torch.Tensor: output tensor of shape (b, 1, h, w) """ # assert x is nchw and c = 3 assert x.dim() == 4, "x must be 4 dimensional, got {}".format(x.dim()) assert x.shape[1] == 3, "x must have 3 channels, got {}".format(x.shape[1]) if pad_input: assert fh > 0 or fw > 0, "atlease one of fh and fw must be greater than 0" pad_h = int(np.sqrt(x.shape[2]/2) * fh) pad_w = int(np.sqrt(x.shape[3]/2) * fw) padding = [pad_w, pad_w] if pad_h > 0: padding += [pad_h, pad_h] x = F.pad(x, padding, mode=padding_mode, **kwargs) out = self._infer(x) if out.shape[-2:] != x.shape[-2:]: out = F.interpolate(out, size=(x.shape[2], x.shape[3]), mode=upsampling_mode, align_corners=False) if pad_input: # crop to the original size, handling the case where pad_h and pad_w is 0 if pad_h > 0: out = out[:, :, pad_h:-pad_h,:] if pad_w > 0: out = out[:, :, :, pad_w:-pad_w] return out def infer_with_flip_aug(self, x, pad_input: bool=True, **kwargs) -> torch.Tensor: """ Inference interface for the model with horizontal flip augmentation Horizontal flip augmentation improves the accuracy of the model by averaging the output of the model with and without horizontal flip. Args: x (torch.Tensor): input tensor of shape (b, c, h, w) pad_input (bool, optional): whether to use padding augmentation. Defaults to True. Returns: torch.Tensor: output tensor of shape (b, 1, h, w) """ # infer with horizontal flip and average out = self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs) out_flip = self._infer_with_pad_aug(torch.flip(x, dims=[3]), pad_input=pad_input, **kwargs) out = (out + torch.flip(out_flip, dims=[3])) / 2 return out def infer(self, x, pad_input: bool=True, with_flip_aug: bool=True, **kwargs) -> torch.Tensor: """ Inference interface for the model Args: x (torch.Tensor): input tensor of shape (b, c, h, w) pad_input (bool, optional): whether to use padding augmentation. Defaults to True. with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True. Returns: torch.Tensor: output tensor of shape (b, 1, h, w) """ if with_flip_aug: return self.infer_with_flip_aug(x, pad_input=pad_input, **kwargs) else: return self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs) @torch.no_grad() def infer_pil(self, pil_img, pad_input: bool=True, with_flip_aug: bool=True, output_type: str="numpy", **kwargs) -> Union[np.ndarray, PIL.Image.Image, torch.Tensor]: """ Inference interface for the model for PIL image Args: pil_img (PIL.Image.Image): input PIL image pad_input (bool, optional): whether to use padding augmentation. Defaults to True. with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True. output_type (str, optional): output type. Supported values are 'numpy', 'pil' and 'tensor'. Defaults to "numpy". """ x = transforms.ToTensor()(pil_img).unsqueeze(0).to(self.device) out_tensor = self.infer(x, pad_input=pad_input, with_flip_aug=with_flip_aug, **kwargs) if output_type == "numpy": return out_tensor.squeeze().cpu().numpy() elif output_type == "pil": # uint16 is required for depth pil image out_16bit_numpy = (out_tensor.squeeze().cpu().numpy()*256).astype(np.uint16) return Image.fromarray(out_16bit_numpy) elif output_type == "tensor": return out_tensor.squeeze().cpu() else: raise ValueError(f"output_type {output_type} not supported. Supported values are 'numpy', 'pil' and 'tensor'") ================================================ FILE: annotator/zoe/zoedepth/models/layers/attractor.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import torch import torch.nn as nn @torch.jit.script def exp_attractor(dx, alpha: float = 300, gamma: int = 2): """Exponential attractor: dc = exp(-alpha*|dx|^gamma) * dx , where dx = a - c, a = attractor point, c = bin center, dc = shift in bin centermmary for exp_attractor Args: dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center. alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300. gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2. Returns: torch.Tensor : Delta shifts - dc; New bin centers = Old bin centers + dc """ return torch.exp(-alpha*(torch.abs(dx)**gamma)) * (dx) @torch.jit.script def inv_attractor(dx, alpha: float = 300, gamma: int = 2): """Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center This is the default one according to the accompanying paper. Args: dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center. alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300. gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2. Returns: torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc """ return dx.div(1+alpha*dx.pow(gamma)) class AttractorLayer(nn.Module): def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10, alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False): """ Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth) """ super().__init__() self.n_attractors = n_attractors self.n_bins = n_bins self.min_depth = min_depth self.max_depth = max_depth self.alpha = alpha self.gamma = gamma self.kind = kind self.attractor_type = attractor_type self.memory_efficient = memory_efficient self._net = nn.Sequential( nn.Conv2d(in_features, mlp_dim, 1, 1, 0), nn.ReLU(inplace=True), nn.Conv2d(mlp_dim, n_attractors*2, 1, 1, 0), # x2 for linear norm nn.ReLU(inplace=True) ) def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False): """ Args: x (torch.Tensor) : feature block; shape - n, c, h, w b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w Returns: tuple(torch.Tensor,torch.Tensor) : new bin centers normed and scaled; shape - n, nbins, h, w """ if prev_b_embedding is not None: if interpolate: prev_b_embedding = nn.functional.interpolate( prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True) x = x + prev_b_embedding A = self._net(x) eps = 1e-3 A = A + eps n, c, h, w = A.shape A = A.view(n, self.n_attractors, 2, h, w) A_normed = A / A.sum(dim=2, keepdim=True) # n, a, 2, h, w A_normed = A[:, :, 0, ...] # n, na, h, w b_prev = nn.functional.interpolate( b_prev, (h, w), mode='bilinear', align_corners=True) b_centers = b_prev if self.attractor_type == 'exp': dist = exp_attractor else: dist = inv_attractor if not self.memory_efficient: func = {'mean': torch.mean, 'sum': torch.sum}[self.kind] # .shape N, nbins, h, w delta_c = func(dist(A_normed.unsqueeze( 2) - b_centers.unsqueeze(1)), dim=1) else: delta_c = torch.zeros_like(b_centers, device=b_centers.device) for i in range(self.n_attractors): # .shape N, nbins, h, w delta_c += dist(A_normed[:, i, ...].unsqueeze(1) - b_centers) if self.kind == 'mean': delta_c = delta_c / self.n_attractors b_new_centers = b_centers + delta_c B_centers = (self.max_depth - self.min_depth) * \ b_new_centers + self.min_depth B_centers, _ = torch.sort(B_centers, dim=1) B_centers = torch.clip(B_centers, self.min_depth, self.max_depth) return b_new_centers, B_centers class AttractorLayerUnnormed(nn.Module): def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10, alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False): """ Attractor layer for bin centers. Bin centers are unbounded """ super().__init__() self.n_attractors = n_attractors self.n_bins = n_bins self.min_depth = min_depth self.max_depth = max_depth self.alpha = alpha self.gamma = gamma self.kind = kind self.attractor_type = attractor_type self.memory_efficient = memory_efficient self._net = nn.Sequential( nn.Conv2d(in_features, mlp_dim, 1, 1, 0), nn.ReLU(inplace=True), nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0), nn.Softplus() ) def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False): """ Args: x (torch.Tensor) : feature block; shape - n, c, h, w b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w Returns: tuple(torch.Tensor,torch.Tensor) : new bin centers unbounded; shape - n, nbins, h, w. Two outputs just to keep the API consistent with the normed version """ if prev_b_embedding is not None: if interpolate: prev_b_embedding = nn.functional.interpolate( prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True) x = x + prev_b_embedding A = self._net(x) n, c, h, w = A.shape b_prev = nn.functional.interpolate( b_prev, (h, w), mode='bilinear', align_corners=True) b_centers = b_prev if self.attractor_type == 'exp': dist = exp_attractor else: dist = inv_attractor if not self.memory_efficient: func = {'mean': torch.mean, 'sum': torch.sum}[self.kind] # .shape N, nbins, h, w delta_c = func( dist(A.unsqueeze(2) - b_centers.unsqueeze(1)), dim=1) else: delta_c = torch.zeros_like(b_centers, device=b_centers.device) for i in range(self.n_attractors): delta_c += dist(A[:, i, ...].unsqueeze(1) - b_centers) # .shape N, nbins, h, w if self.kind == 'mean': delta_c = delta_c / self.n_attractors b_new_centers = b_centers + delta_c B_centers = b_new_centers return b_new_centers, B_centers ================================================ FILE: annotator/zoe/zoedepth/models/layers/dist_layers.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import torch import torch.nn as nn def log_binom(n, k, eps=1e-7): """ log(nCk) using stirling approximation """ n = n + eps k = k + eps return n * torch.log(n) - k * torch.log(k) - (n-k) * torch.log(n-k+eps) class LogBinomial(nn.Module): def __init__(self, n_classes=256, act=torch.softmax): """Compute log binomial distribution for n_classes Args: n_classes (int, optional): number of output classes. Defaults to 256. """ super().__init__() self.K = n_classes self.act = act self.register_buffer('k_idx', torch.arange( 0, n_classes).view(1, -1, 1, 1)) self.register_buffer('K_minus_1', torch.Tensor( [self.K-1]).view(1, -1, 1, 1)) def forward(self, x, t=1., eps=1e-4): """Compute log binomial distribution for x Args: x (torch.Tensor - NCHW): probabilities t (float, torch.Tensor - NCHW, optional): Temperature of distribution. Defaults to 1.. eps (float, optional): Small number for numerical stability. Defaults to 1e-4. Returns: torch.Tensor -NCHW: log binomial distribution logbinomial(p;t) """ if x.ndim == 3: x = x.unsqueeze(1) # make it nchw one_minus_x = torch.clamp(1 - x, eps, 1) x = torch.clamp(x, eps, 1) y = log_binom(self.K_minus_1, self.k_idx) + self.k_idx * \ torch.log(x) + (self.K - 1 - self.k_idx) * torch.log(one_minus_x) return self.act(y/t, dim=1) class ConditionalLogBinomial(nn.Module): def __init__(self, in_features, condition_dim, n_classes=256, bottleneck_factor=2, p_eps=1e-4, max_temp=50, min_temp=1e-7, act=torch.softmax): """Conditional Log Binomial distribution Args: in_features (int): number of input channels in main feature condition_dim (int): number of input channels in condition feature n_classes (int, optional): Number of classes. Defaults to 256. bottleneck_factor (int, optional): Hidden dim factor. Defaults to 2. p_eps (float, optional): small eps value. Defaults to 1e-4. max_temp (float, optional): Maximum temperature of output distribution. Defaults to 50. min_temp (float, optional): Minimum temperature of output distribution. Defaults to 1e-7. """ super().__init__() self.p_eps = p_eps self.max_temp = max_temp self.min_temp = min_temp self.log_binomial_transform = LogBinomial(n_classes, act=act) bottleneck = (in_features + condition_dim) // bottleneck_factor self.mlp = nn.Sequential( nn.Conv2d(in_features + condition_dim, bottleneck, kernel_size=1, stride=1, padding=0), nn.GELU(), # 2 for p linear norm, 2 for t linear norm nn.Conv2d(bottleneck, 2+2, kernel_size=1, stride=1, padding=0), nn.Softplus() ) def forward(self, x, cond): """Forward pass Args: x (torch.Tensor - NCHW): Main feature cond (torch.Tensor - NCHW): condition feature Returns: torch.Tensor: Output log binomial distribution """ pt = self.mlp(torch.concat((x, cond), dim=1)) p, t = pt[:, :2, ...], pt[:, 2:, ...] p = p + self.p_eps p = p[:, 0, ...] / (p[:, 0, ...] + p[:, 1, ...]) t = t + self.p_eps t = t[:, 0, ...] / (t[:, 0, ...] + t[:, 1, ...]) t = t.unsqueeze(1) t = (self.max_temp - self.min_temp) * t + self.min_temp return self.log_binomial_transform(p, t) ================================================ FILE: annotator/zoe/zoedepth/models/layers/localbins_layers.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import torch import torch.nn as nn class SeedBinRegressor(nn.Module): def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10): """Bin center regressor network. Bin centers are bounded on (min_depth, max_depth) interval. Args: in_features (int): input channels n_bins (int, optional): Number of bin centers. Defaults to 16. mlp_dim (int, optional): Hidden dimension. Defaults to 256. min_depth (float, optional): Min depth value. Defaults to 1e-3. max_depth (float, optional): Max depth value. Defaults to 10. """ super().__init__() self.version = "1_1" self.min_depth = min_depth self.max_depth = max_depth self._net = nn.Sequential( nn.Conv2d(in_features, mlp_dim, 1, 1, 0), nn.ReLU(inplace=True), nn.Conv2d(mlp_dim, n_bins, 1, 1, 0), nn.ReLU(inplace=True) ) def forward(self, x): """ Returns tensor of bin_width vectors (centers). One vector b for every pixel """ B = self._net(x) eps = 1e-3 B = B + eps B_widths_normed = B / B.sum(dim=1, keepdim=True) B_widths = (self.max_depth - self.min_depth) * \ B_widths_normed # .shape NCHW # pad has the form (left, right, top, bottom, front, back) B_widths = nn.functional.pad( B_widths, (0, 0, 0, 0, 1, 0), mode='constant', value=self.min_depth) B_edges = torch.cumsum(B_widths, dim=1) # .shape NCHW B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:, 1:, ...]) return B_widths_normed, B_centers class SeedBinRegressorUnnormed(nn.Module): def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10): """Bin center regressor network. Bin centers are unbounded Args: in_features (int): input channels n_bins (int, optional): Number of bin centers. Defaults to 16. mlp_dim (int, optional): Hidden dimension. Defaults to 256. min_depth (float, optional): Not used. (for compatibility with SeedBinRegressor) max_depth (float, optional): Not used. (for compatibility with SeedBinRegressor) """ super().__init__() self.version = "1_1" self._net = nn.Sequential( nn.Conv2d(in_features, mlp_dim, 1, 1, 0), nn.ReLU(inplace=True), nn.Conv2d(mlp_dim, n_bins, 1, 1, 0), nn.Softplus() ) def forward(self, x): """ Returns tensor of bin_width vectors (centers). One vector b for every pixel """ B_centers = self._net(x) return B_centers, B_centers class Projector(nn.Module): def __init__(self, in_features, out_features, mlp_dim=128): """Projector MLP Args: in_features (int): input channels out_features (int): output channels mlp_dim (int, optional): hidden dimension. Defaults to 128. """ super().__init__() self._net = nn.Sequential( nn.Conv2d(in_features, mlp_dim, 1, 1, 0), nn.ReLU(inplace=True), nn.Conv2d(mlp_dim, out_features, 1, 1, 0), ) def forward(self, x): return self._net(x) class LinearSplitter(nn.Module): def __init__(self, in_features, prev_nbins, split_factor=2, mlp_dim=128, min_depth=1e-3, max_depth=10): super().__init__() self.prev_nbins = prev_nbins self.split_factor = split_factor self.min_depth = min_depth self.max_depth = max_depth self._net = nn.Sequential( nn.Conv2d(in_features, mlp_dim, 1, 1, 0), nn.GELU(), nn.Conv2d(mlp_dim, prev_nbins * split_factor, 1, 1, 0), nn.ReLU() ) def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False): """ x : feature block; shape - n, c, h, w b_prev : previous bin widths normed; shape - n, prev_nbins, h, w """ if prev_b_embedding is not None: if interpolate: prev_b_embedding = nn.functional.interpolate(prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True) x = x + prev_b_embedding S = self._net(x) eps = 1e-3 S = S + eps n, c, h, w = S.shape S = S.view(n, self.prev_nbins, self.split_factor, h, w) S_normed = S / S.sum(dim=2, keepdim=True) # fractional splits b_prev = nn.functional.interpolate(b_prev, (h,w), mode='bilinear', align_corners=True) b_prev = b_prev / b_prev.sum(dim=1, keepdim=True) # renormalize for gurantees # print(b_prev.shape, S_normed.shape) # if is_for_query:(1).expand(-1, b_prev.size(0)//n, -1, -1, -1, -1).flatten(0,1) # TODO ? can replace all this with a single torch.repeat? b = b_prev.unsqueeze(2) * S_normed b = b.flatten(1,2) # .shape n, prev_nbins * split_factor, h, w # calculate bin centers for loss calculation B_widths = (self.max_depth - self.min_depth) * b # .shape N, nprev * splitfactor, H, W # pad has the form (left, right, top, bottom, front, back) B_widths = nn.functional.pad(B_widths, (0,0,0,0,1,0), mode='constant', value=self.min_depth) B_edges = torch.cumsum(B_widths, dim=1) # .shape NCHW B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:,1:,...]) return b, B_centers ================================================ FILE: annotator/zoe/zoedepth/models/layers/patch_transformer.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import torch import torch.nn as nn class PatchTransformerEncoder(nn.Module): def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4, use_class_token=False): """ViT-like transformer block Args: in_channels (int): Input channels patch_size (int, optional): patch size. Defaults to 10. embedding_dim (int, optional): Embedding dimension in transformer model. Defaults to 128. num_heads (int, optional): number of attention heads. Defaults to 4. use_class_token (bool, optional): Whether to use extra token at the start for global accumulation (called as "class token"). Defaults to False. """ super(PatchTransformerEncoder, self).__init__() self.use_class_token = use_class_token encoder_layers = nn.TransformerEncoderLayer( embedding_dim, num_heads, dim_feedforward=1024) self.transformer_encoder = nn.TransformerEncoder( encoder_layers, num_layers=4) # takes shape S,N,E self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim, kernel_size=patch_size, stride=patch_size, padding=0) def positional_encoding_1d(self, sequence_length, batch_size, embedding_dim, device='cpu'): """Generate positional encodings Args: sequence_length (int): Sequence length embedding_dim (int): Embedding dimension Returns: torch.Tensor SBE: Positional encodings """ position = torch.arange( 0, sequence_length, dtype=torch.float32, device=device).unsqueeze(1) index = torch.arange( 0, embedding_dim, 2, dtype=torch.float32, device=device).unsqueeze(0) div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim)) pos_encoding = position * div_term pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1) pos_encoding = pos_encoding.unsqueeze(1).repeat(1, batch_size, 1) return pos_encoding def forward(self, x): """Forward pass Args: x (torch.Tensor - NCHW): Input feature tensor Returns: torch.Tensor - SNE: Transformer output embeddings. S - sequence length (=HW/patch_size^2), N - batch size, E - embedding dim """ embeddings = self.embedding_convPxP(x).flatten( 2) # .shape = n,c,s = n, embedding_dim, s if self.use_class_token: # extra special token at start ? embeddings = nn.functional.pad(embeddings, (1, 0)) # change to S,N,E format required by transformer embeddings = embeddings.permute(2, 0, 1) S, N, E = embeddings.shape embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device) x = self.transformer_encoder(embeddings) # .shape = S, N, E return x ================================================ FILE: annotator/zoe/zoedepth/models/model_io.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import torch def load_state_dict(model, state_dict): """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict. DataParallel prefixes state_dict keys with 'module.' when saving. If the model is not a DataParallel model but the state_dict is, then prefixes are removed. If the model is a DataParallel model but the state_dict is not, then prefixes are added. """ state_dict = state_dict.get('model', state_dict) # if model is a DataParallel model, then state_dict keys are prefixed with 'module.' do_prefix = isinstance( model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)) state = {} for k, v in state_dict.items(): if k.startswith('module.') and not do_prefix: k = k[7:] if not k.startswith('module.') and do_prefix: k = 'module.' + k state[k] = v model.load_state_dict(state) print("Loaded successfully") return model def load_wts(model, checkpoint_path): ckpt = torch.load(checkpoint_path, map_location='cpu') return load_state_dict(model, ckpt) def load_state_dict_from_url(model, url, **kwargs): state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu', **kwargs) return load_state_dict(model, state_dict) def load_state_from_resource(model, resource: str): """Loads weights to the model from a given resource. A resource can be of following types: 1. URL. Prefixed with "url::" e.g. url::http(s)://url.resource.com/ckpt.pt 2. Local path. Prefixed with "local::" e.g. local::/path/to/ckpt.pt Args: model (torch.nn.Module): Model resource (str): resource string Returns: torch.nn.Module: Model with loaded weights """ print(f"Using pretrained resource {resource}") if resource.startswith('url::'): url = resource.split('url::')[1] return load_state_dict_from_url(model, url, progress=True) elif resource.startswith('local::'): path = resource.split('local::')[1] return load_wts(model, path) else: raise ValueError("Invalid resource type, only url:: and local:: are supported") ================================================ FILE: annotator/zoe/zoedepth/models/zoedepth/__init__.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat from .zoedepth_v1 import ZoeDepth all_versions = { "v1": ZoeDepth, } get_version = lambda v : all_versions[v] ================================================ FILE: annotator/zoe/zoedepth/models/zoedepth/config_zoedepth.json ================================================ { "model": { "name": "ZoeDepth", "version_name": "v1", "n_bins": 64, "bin_embedding_dim": 128, "bin_centers_type": "softplus", "n_attractors":[16, 8, 4, 1], "attractor_alpha": 1000, "attractor_gamma": 2, "attractor_kind" : "mean", "attractor_type" : "inv", "midas_model_type" : "DPT_BEiT_L_384", "min_temp": 0.0212, "max_temp": 50.0, "output_distribution": "logbinomial", "memory_efficient": true, "inverse_midas": false, "img_size": [384, 512] }, "train": { "train_midas": true, "use_pretrained_midas": true, "trainer": "zoedepth", "epochs": 5, "bs": 16, "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, "same_lr": false, "w_si": 1, "w_domain": 0.2, "w_reg": 0, "w_grad": 0, "avoid_boundary": false, "random_crop": false, "input_width": 640, "input_height": 480, "midas_lr_factor": 1, "encoder_lr_factor":10, "pos_enc_lr_factor":10, "freeze_midas_bn": true }, "infer":{ "train_midas": false, "use_pretrained_midas": false, "pretrained_resource" : null, "force_keep_ar": true }, "eval":{ "train_midas": false, "use_pretrained_midas": false, "pretrained_resource" : null } } ================================================ FILE: annotator/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json ================================================ { "model": { "bin_centers_type": "normed", "img_size": [384, 768] }, "train": { }, "infer":{ "train_midas": false, "use_pretrained_midas": false, "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt", "force_keep_ar": true }, "eval":{ "train_midas": false, "use_pretrained_midas": false, "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" } } ================================================ FILE: annotator/zoe/zoedepth/models/zoedepth/zoedepth_v1.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import itertools import torch import torch.nn as nn from ..depth_model import DepthModel from ..base_models.midas import MidasCore from ..layers.attractor import AttractorLayer, AttractorLayerUnnormed from ..layers.dist_layers import ConditionalLogBinomial from ..layers.localbins_layers import (Projector, SeedBinRegressor, SeedBinRegressorUnnormed) from ..model_io import load_state_from_resource class ZoeDepth(DepthModel): def __init__(self, core, n_bins=64, bin_centers_type="softplus", bin_embedding_dim=128, min_depth=1e-3, max_depth=10, n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True, midas_lr_factor=10, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs): """ZoeDepth model. This is the version of ZoeDepth that has a single metric head Args: core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features n_bins (int, optional): Number of bin centers. Defaults to 64. bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers. For "softplus", softplus activation is used and thus are unbounded. Defaults to "softplus". bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128. min_depth (float, optional): Lower bound for normed bin centers. Defaults to 1e-3. max_depth (float, optional): Upper bound for normed bin centers. Defaults to 10. n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1]. attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300. attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2. attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'. attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'. min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5. max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50. train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True. midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10. encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10. pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10. """ super().__init__() self.core = core self.max_depth = max_depth self.min_depth = min_depth self.min_temp = min_temp self.bin_centers_type = bin_centers_type self.midas_lr_factor = midas_lr_factor self.encoder_lr_factor = encoder_lr_factor self.pos_enc_lr_factor = pos_enc_lr_factor self.train_midas = train_midas self.inverse_midas = inverse_midas if self.encoder_lr_factor <= 0: self.core.freeze_encoder( freeze_rel_pos=self.pos_enc_lr_factor <= 0) N_MIDAS_OUT = 32 btlnck_features = self.core.output_channels[0] num_out_features = self.core.output_channels[1:] self.conv2 = nn.Conv2d(btlnck_features, btlnck_features, kernel_size=1, stride=1, padding=0) # btlnck conv if bin_centers_type == "normed": SeedBinRegressorLayer = SeedBinRegressor Attractor = AttractorLayer elif bin_centers_type == "softplus": SeedBinRegressorLayer = SeedBinRegressorUnnormed Attractor = AttractorLayerUnnormed elif bin_centers_type == "hybrid1": SeedBinRegressorLayer = SeedBinRegressor Attractor = AttractorLayerUnnormed elif bin_centers_type == "hybrid2": SeedBinRegressorLayer = SeedBinRegressorUnnormed Attractor = AttractorLayer else: raise ValueError( "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'") self.seed_bin_regressor = SeedBinRegressorLayer( btlnck_features, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth) self.seed_projector = Projector(btlnck_features, bin_embedding_dim) self.projectors = nn.ModuleList([ Projector(num_out, bin_embedding_dim) for num_out in num_out_features ]) self.attractors = nn.ModuleList([ Attractor(bin_embedding_dim, n_bins, n_attractors=n_attractors[i], min_depth=min_depth, max_depth=max_depth, alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type) for i in range(len(num_out_features)) ]) last_in = N_MIDAS_OUT + 1 # +1 for relative depth # use log binomial instead of softmax self.conditional_log_binomial = ConditionalLogBinomial( last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp) def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs): """ Args: x (torch.Tensor): Input image tensor of shape (B, C, H, W) return_final_centers (bool, optional): Whether to return the final bin centers. Defaults to False. denorm (bool, optional): Whether to denormalize the input image. This reverses ImageNet normalization as midas normalization is different. Defaults to False. return_probs (bool, optional): Whether to return the output probability distribution. Defaults to False. Returns: dict: Dictionary containing the following keys: - rel_depth (torch.Tensor): Relative depth map of shape (B, H, W) - metric_depth (torch.Tensor): Metric depth map of shape (B, 1, H, W) - bin_centers (torch.Tensor): Bin centers of shape (B, n_bins). Present only if return_final_centers is True - probs (torch.Tensor): Output probability distribution of shape (B, n_bins, H, W). Present only if return_probs is True """ b, c, h, w = x.shape # print("input shape ", x.shape) self.orig_input_width = w self.orig_input_height = h rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True) # print("output shapes", rel_depth.shape, out.shape) outconv_activation = out[0] btlnck = out[1] x_blocks = out[2:] x_d0 = self.conv2(btlnck) x = x_d0 _, seed_b_centers = self.seed_bin_regressor(x) if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2': b_prev = (seed_b_centers - self.min_depth) / \ (self.max_depth - self.min_depth) else: b_prev = seed_b_centers prev_b_embedding = self.seed_projector(x) # unroll this loop for better performance for projector, attractor, x in zip(self.projectors, self.attractors, x_blocks): b_embedding = projector(x) b, b_centers = attractor( b_embedding, b_prev, prev_b_embedding, interpolate=True) b_prev = b.clone() prev_b_embedding = b_embedding.clone() last = outconv_activation if self.inverse_midas: # invert depth followed by normalization rel_depth = 1.0 / (rel_depth + 1e-6) rel_depth = (rel_depth - rel_depth.min()) / \ (rel_depth.max() - rel_depth.min()) # concat rel depth with last. First interpolate rel depth to last size rel_cond = rel_depth.unsqueeze(1) rel_cond = nn.functional.interpolate( rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True) last = torch.cat([last, rel_cond], dim=1) b_embedding = nn.functional.interpolate( b_embedding, last.shape[-2:], mode='bilinear', align_corners=True) x = self.conditional_log_binomial(last, b_embedding) # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor # print(x.shape, b_centers.shape) b_centers = nn.functional.interpolate( b_centers, x.shape[-2:], mode='bilinear', align_corners=True) out = torch.sum(x * b_centers, dim=1, keepdim=True) # Structure output dict output = dict(metric_depth=out) if return_final_centers or return_probs: output['bin_centers'] = b_centers if return_probs: output['probs'] = x return output def get_lr_params(self, lr): """ Learning rate configuration for different layers of the model Args: lr (float) : Base learning rate Returns: list : list of parameters to optimize and their learning rates, in the format required by torch optimizers. """ param_conf = [] if self.train_midas: if self.encoder_lr_factor > 0: param_conf.append({'params': self.core.get_enc_params_except_rel_pos( ), 'lr': lr / self.encoder_lr_factor}) if self.pos_enc_lr_factor > 0: param_conf.append( {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor}) midas_params = self.core.core.scratch.parameters() midas_lr_factor = self.midas_lr_factor param_conf.append( {'params': midas_params, 'lr': lr / midas_lr_factor}) remaining_modules = [] for name, child in self.named_children(): if name != 'core': remaining_modules.append(child) remaining_params = itertools.chain( *[child.parameters() for child in remaining_modules]) param_conf.append({'params': remaining_params, 'lr': lr}) return param_conf @staticmethod def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs): core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) model = ZoeDepth(core, **kwargs) if pretrained_resource: assert isinstance(pretrained_resource, str), "pretrained_resource must be a string" model = load_state_from_resource(model, pretrained_resource) return model @staticmethod def build_from_config(config): return ZoeDepth.build(**config) ================================================ FILE: annotator/zoe/zoedepth/models/zoedepth_nk/__init__.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat from .zoedepth_nk_v1 import ZoeDepthNK all_versions = { "v1": ZoeDepthNK, } get_version = lambda v : all_versions[v] ================================================ FILE: annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json ================================================ { "model": { "name": "ZoeDepthNK", "version_name": "v1", "bin_conf" : [ { "name": "nyu", "n_bins": 64, "min_depth": 1e-3, "max_depth": 10.0 }, { "name": "kitti", "n_bins": 64, "min_depth": 1e-3, "max_depth": 80.0 } ], "bin_embedding_dim": 128, "bin_centers_type": "softplus", "n_attractors":[16, 8, 4, 1], "attractor_alpha": 1000, "attractor_gamma": 2, "attractor_kind" : "mean", "attractor_type" : "inv", "min_temp": 0.0212, "max_temp": 50.0, "memory_efficient": true, "midas_model_type" : "DPT_BEiT_L_384", "img_size": [384, 512] }, "train": { "train_midas": true, "use_pretrained_midas": true, "trainer": "zoedepth_nk", "epochs": 5, "bs": 16, "optim_kwargs": {"lr": 0.0002512, "wd": 0.01}, "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, "same_lr": false, "w_si": 1, "w_domain": 100, "avoid_boundary": false, "random_crop": false, "input_width": 640, "input_height": 480, "w_grad": 0, "w_reg": 0, "midas_lr_factor": 10, "encoder_lr_factor":10, "pos_enc_lr_factor":10 }, "infer": { "train_midas": false, "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", "use_pretrained_midas": false, "force_keep_ar": true }, "eval": { "train_midas": false, "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", "use_pretrained_midas": false } } ================================================ FILE: annotator/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import itertools import torch import torch.nn as nn from zoedepth.models.depth_model import DepthModel from zoedepth.models.base_models.midas import MidasCore from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed from zoedepth.models.layers.dist_layers import ConditionalLogBinomial from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor, SeedBinRegressorUnnormed) from zoedepth.models.layers.patch_transformer import PatchTransformerEncoder from zoedepth.models.model_io import load_state_from_resource class ZoeDepthNK(DepthModel): def __init__(self, core, bin_conf, bin_centers_type="softplus", bin_embedding_dim=128, n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, memory_efficient=False, train_midas=True, is_midas_pretrained=True, midas_lr_factor=1, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs): """ZoeDepthNK model. This is the version of ZoeDepth that has two metric heads and uses a learned router to route to experts. Args: core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features bin_conf (List[dict]): A list of dictionaries that contain the bin configuration for each metric head. Each dictionary should contain the following keys: "name" (str, typically same as the dataset name), "n_bins" (int), "min_depth" (float), "max_depth" (float) The length of this list determines the number of metric heads. bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers. For "softplus", softplus activation is used and thus are unbounded. Defaults to "normed". bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128. n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1]. attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300. attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2. attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'. attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'. min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5. max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50. memory_efficient (bool, optional): Whether to use memory efficient version of attractor layers. Memory efficient version is slower but is recommended incase of multiple metric heads in order save GPU memory. Defaults to False. train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True. is_midas_pretrained (bool, optional): Is "core" pretrained? Defaults to True. midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10. encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10. pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10. """ super().__init__() self.core = core self.bin_conf = bin_conf self.min_temp = min_temp self.max_temp = max_temp self.memory_efficient = memory_efficient self.train_midas = train_midas self.is_midas_pretrained = is_midas_pretrained self.midas_lr_factor = midas_lr_factor self.encoder_lr_factor = encoder_lr_factor self.pos_enc_lr_factor = pos_enc_lr_factor self.inverse_midas = inverse_midas N_MIDAS_OUT = 32 btlnck_features = self.core.output_channels[0] num_out_features = self.core.output_channels[1:] # self.scales = [16, 8, 4, 2] # spatial scale factors self.conv2 = nn.Conv2d( btlnck_features, btlnck_features, kernel_size=1, stride=1, padding=0) # Transformer classifier on the bottleneck self.patch_transformer = PatchTransformerEncoder( btlnck_features, 1, 128, use_class_token=True) self.mlp_classifier = nn.Sequential( nn.Linear(128, 128), nn.ReLU(), nn.Linear(128, 2) ) if bin_centers_type == "normed": SeedBinRegressorLayer = SeedBinRegressor Attractor = AttractorLayer elif bin_centers_type == "softplus": SeedBinRegressorLayer = SeedBinRegressorUnnormed Attractor = AttractorLayerUnnormed elif bin_centers_type == "hybrid1": SeedBinRegressorLayer = SeedBinRegressor Attractor = AttractorLayerUnnormed elif bin_centers_type == "hybrid2": SeedBinRegressorLayer = SeedBinRegressorUnnormed Attractor = AttractorLayer else: raise ValueError( "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'") self.bin_centers_type = bin_centers_type # We have bins for each bin conf. # Create a map (ModuleDict) of 'name' -> seed_bin_regressor self.seed_bin_regressors = nn.ModuleDict( {conf['name']: SeedBinRegressorLayer(btlnck_features, conf["n_bins"], mlp_dim=bin_embedding_dim//2, min_depth=conf["min_depth"], max_depth=conf["max_depth"]) for conf in bin_conf} ) self.seed_projector = Projector( btlnck_features, bin_embedding_dim, mlp_dim=bin_embedding_dim//2) self.projectors = nn.ModuleList([ Projector(num_out, bin_embedding_dim, mlp_dim=bin_embedding_dim//2) for num_out in num_out_features ]) # Create a map (ModuleDict) of 'name' -> attractors (ModuleList) self.attractors = nn.ModuleDict( {conf['name']: nn.ModuleList([ Attractor(bin_embedding_dim, n_attractors[i], mlp_dim=bin_embedding_dim, alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type, memory_efficient=memory_efficient, min_depth=conf["min_depth"], max_depth=conf["max_depth"]) for i in range(len(n_attractors)) ]) for conf in bin_conf} ) last_in = N_MIDAS_OUT # conditional log binomial for each bin conf self.conditional_log_binomial = nn.ModuleDict( {conf['name']: ConditionalLogBinomial(last_in, bin_embedding_dim, conf['n_bins'], bottleneck_factor=4, min_temp=self.min_temp, max_temp=self.max_temp) for conf in bin_conf} ) def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs): """ Args: x (torch.Tensor): Input image tensor of shape (B, C, H, W). Assumes all images are from the same domain. return_final_centers (bool, optional): Whether to return the final centers of the attractors. Defaults to False. denorm (bool, optional): Whether to denormalize the input image. Defaults to False. return_probs (bool, optional): Whether to return the probabilities of the bins. Defaults to False. Returns: dict: Dictionary of outputs with keys: - "rel_depth": Relative depth map of shape (B, 1, H, W) - "metric_depth": Metric depth map of shape (B, 1, H, W) - "domain_logits": Domain logits of shape (B, 2) - "bin_centers": Bin centers of shape (B, N, H, W). Present only if return_final_centers is True - "probs": Bin probabilities of shape (B, N, H, W). Present only if return_probs is True """ b, c, h, w = x.shape self.orig_input_width = w self.orig_input_height = h rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True) outconv_activation = out[0] btlnck = out[1] x_blocks = out[2:] x_d0 = self.conv2(btlnck) x = x_d0 # Predict which path to take embedding = self.patch_transformer(x)[0] # N, E domain_logits = self.mlp_classifier(embedding) # N, 2 domain_vote = torch.softmax(domain_logits.sum( dim=0, keepdim=True), dim=-1) # 1, 2 # Get the path bin_conf_name = ["nyu", "kitti"][torch.argmax( domain_vote, dim=-1).squeeze().item()] try: conf = [c for c in self.bin_conf if c.name == bin_conf_name][0] except IndexError: raise ValueError( f"bin_conf_name {bin_conf_name} not found in bin_confs") min_depth = conf['min_depth'] max_depth = conf['max_depth'] seed_bin_regressor = self.seed_bin_regressors[bin_conf_name] _, seed_b_centers = seed_bin_regressor(x) if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2': b_prev = (seed_b_centers - min_depth)/(max_depth - min_depth) else: b_prev = seed_b_centers prev_b_embedding = self.seed_projector(x) attractors = self.attractors[bin_conf_name] for projector, attractor, x in zip(self.projectors, attractors, x_blocks): b_embedding = projector(x) b, b_centers = attractor( b_embedding, b_prev, prev_b_embedding, interpolate=True) b_prev = b prev_b_embedding = b_embedding last = outconv_activation b_centers = nn.functional.interpolate( b_centers, last.shape[-2:], mode='bilinear', align_corners=True) b_embedding = nn.functional.interpolate( b_embedding, last.shape[-2:], mode='bilinear', align_corners=True) clb = self.conditional_log_binomial[bin_conf_name] x = clb(last, b_embedding) # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor # print(x.shape, b_centers.shape) # b_centers = nn.functional.interpolate(b_centers, x.shape[-2:], mode='bilinear', align_corners=True) out = torch.sum(x * b_centers, dim=1, keepdim=True) output = dict(domain_logits=domain_logits, metric_depth=out) if return_final_centers or return_probs: output['bin_centers'] = b_centers if return_probs: output['probs'] = x return output def get_lr_params(self, lr): """ Learning rate configuration for different layers of the model Args: lr (float) : Base learning rate Returns: list : list of parameters to optimize and their learning rates, in the format required by torch optimizers. """ param_conf = [] if self.train_midas: def get_rel_pos_params(): for name, p in self.core.core.pretrained.named_parameters(): if "relative_position" in name: yield p def get_enc_params_except_rel_pos(): for name, p in self.core.core.pretrained.named_parameters(): if "relative_position" not in name: yield p encoder_params = get_enc_params_except_rel_pos() rel_pos_params = get_rel_pos_params() midas_params = self.core.core.scratch.parameters() midas_lr_factor = self.midas_lr_factor if self.is_midas_pretrained else 1.0 param_conf.extend([ {'params': encoder_params, 'lr': lr / self.encoder_lr_factor}, {'params': rel_pos_params, 'lr': lr / self.pos_enc_lr_factor}, {'params': midas_params, 'lr': lr / midas_lr_factor} ]) remaining_modules = [] for name, child in self.named_children(): if name != 'core': remaining_modules.append(child) remaining_params = itertools.chain( *[child.parameters() for child in remaining_modules]) param_conf.append({'params': remaining_params, 'lr': lr}) return param_conf def get_conf_parameters(self, conf_name): """ Returns parameters of all the ModuleDicts children that are exclusively used for the given bin configuration """ params = [] for name, child in self.named_children(): if isinstance(child, nn.ModuleDict): for bin_conf_name, module in child.items(): if bin_conf_name == conf_name: params += list(module.parameters()) return params def freeze_conf(self, conf_name): """ Freezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration """ for p in self.get_conf_parameters(conf_name): p.requires_grad = False def unfreeze_conf(self, conf_name): """ Unfreezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration """ for p in self.get_conf_parameters(conf_name): p.requires_grad = True def freeze_all_confs(self): """ Freezes all the parameters of all the ModuleDicts children """ for name, child in self.named_children(): if isinstance(child, nn.ModuleDict): for bin_conf_name, module in child.items(): for p in module.parameters(): p.requires_grad = False @staticmethod def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs): core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) model = ZoeDepthNK(core, **kwargs) if pretrained_resource: assert isinstance(pretrained_resource, str), "pretrained_resource must be a string" model = load_state_from_resource(model, pretrained_resource) return model @staticmethod def build_from_config(config): return ZoeDepthNK.build(**config) ================================================ FILE: annotator/zoe/zoedepth/utils/__init__.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat ================================================ FILE: annotator/zoe/zoedepth/utils/arg_utils.py ================================================ def infer_type(x): # hacky way to infer type from string args if not isinstance(x, str): return x try: x = int(x) return x except ValueError: pass try: x = float(x) return x except ValueError: pass return x def parse_unknown(unknown_args): clean = [] for a in unknown_args: if "=" in a: k, v = a.split("=") clean.extend([k, v]) else: clean.append(a) keys = clean[::2] values = clean[1::2] return {k.replace("--", ""): infer_type(v) for k, v in zip(keys, values)} ================================================ FILE: annotator/zoe/zoedepth/utils/config.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import json import os from .easydict import EasyDict as edict from .arg_utils import infer_type import pathlib import platform ROOT = pathlib.Path(__file__).parent.parent.resolve() HOME_DIR = os.path.expanduser("~") COMMON_CONFIG = { "save_dir": os.path.expanduser("~/shortcuts/monodepth3_checkpoints"), "project": "ZoeDepth", "tags": '', "notes": "", "gpu": None, "root": ".", "uid": None, "print_losses": False } DATASETS_CONFIG = { "kitti": { "dataset": "kitti", "min_depth": 0.001, "max_depth": 80, "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"), "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"), "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt", "input_height": 352, "input_width": 1216, # 704 "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"), "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"), "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt", "min_depth_eval": 1e-3, "max_depth_eval": 80, "do_random_rotate": True, "degree": 1.0, "do_kb_crop": True, "garg_crop": True, "eigen_crop": False, "use_right": False }, "kitti_test": { "dataset": "kitti", "min_depth": 0.001, "max_depth": 80, "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"), "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"), "filenames_file": "./train_test_inputs/kitti_eigen_train_files_with_gt.txt", "input_height": 352, "input_width": 1216, "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/raw"), "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/kitti/gts"), "filenames_file_eval": "./train_test_inputs/kitti_eigen_test_files_with_gt.txt", "min_depth_eval": 1e-3, "max_depth_eval": 80, "do_random_rotate": False, "degree": 1.0, "do_kb_crop": True, "garg_crop": True, "eigen_crop": False, "use_right": False }, "nyu": { "dataset": "nyu", "avoid_boundary": False, "min_depth": 1e-3, # originally 0.1 "max_depth": 10, "data_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"), "gt_path": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/sync/"), "filenames_file": "./train_test_inputs/nyudepthv2_train_files_with_gt.txt", "input_height": 480, "input_width": 640, "data_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"), "gt_path_eval": os.path.join(HOME_DIR, "shortcuts/datasets/nyu_depth_v2/official_splits/test/"), "filenames_file_eval": "./train_test_inputs/nyudepthv2_test_files_with_gt.txt", "min_depth_eval": 1e-3, "max_depth_eval": 10, "min_depth_diff": -10, "max_depth_diff": 10, "do_random_rotate": True, "degree": 1.0, "do_kb_crop": False, "garg_crop": False, "eigen_crop": True }, "ibims": { "dataset": "ibims", "ibims_root": os.path.join(HOME_DIR, "shortcuts/datasets/ibims/ibims1_core_raw/"), "eigen_crop": True, "garg_crop": False, "do_kb_crop": False, "min_depth_eval": 0, "max_depth_eval": 10, "min_depth": 1e-3, "max_depth": 10 }, "sunrgbd": { "dataset": "sunrgbd", "sunrgbd_root": os.path.join(HOME_DIR, "shortcuts/datasets/SUNRGBD/test/"), "eigen_crop": True, "garg_crop": False, "do_kb_crop": False, "min_depth_eval": 0, "max_depth_eval": 8, "min_depth": 1e-3, "max_depth": 10 }, "diml_indoor": { "dataset": "diml_indoor", "diml_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_indoor_test/"), "eigen_crop": True, "garg_crop": False, "do_kb_crop": False, "min_depth_eval": 0, "max_depth_eval": 10, "min_depth": 1e-3, "max_depth": 10 }, "diml_outdoor": { "dataset": "diml_outdoor", "diml_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diml_outdoor_test/"), "eigen_crop": False, "garg_crop": True, "do_kb_crop": False, "min_depth_eval": 2, "max_depth_eval": 80, "min_depth": 1e-3, "max_depth": 80 }, "diode_indoor": { "dataset": "diode_indoor", "diode_indoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_indoor/"), "eigen_crop": True, "garg_crop": False, "do_kb_crop": False, "min_depth_eval": 1e-3, "max_depth_eval": 10, "min_depth": 1e-3, "max_depth": 10 }, "diode_outdoor": { "dataset": "diode_outdoor", "diode_outdoor_root": os.path.join(HOME_DIR, "shortcuts/datasets/diode_outdoor/"), "eigen_crop": False, "garg_crop": True, "do_kb_crop": False, "min_depth_eval": 1e-3, "max_depth_eval": 80, "min_depth": 1e-3, "max_depth": 80 }, "hypersim_test": { "dataset": "hypersim_test", "hypersim_test_root": os.path.join(HOME_DIR, "shortcuts/datasets/hypersim_test/"), "eigen_crop": True, "garg_crop": False, "do_kb_crop": False, "min_depth_eval": 1e-3, "max_depth_eval": 80, "min_depth": 1e-3, "max_depth": 10 }, "vkitti": { "dataset": "vkitti", "vkitti_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti_test/"), "eigen_crop": False, "garg_crop": True, "do_kb_crop": True, "min_depth_eval": 1e-3, "max_depth_eval": 80, "min_depth": 1e-3, "max_depth": 80 }, "vkitti2": { "dataset": "vkitti2", "vkitti2_root": os.path.join(HOME_DIR, "shortcuts/datasets/vkitti2/"), "eigen_crop": False, "garg_crop": True, "do_kb_crop": True, "min_depth_eval": 1e-3, "max_depth_eval": 80, "min_depth": 1e-3, "max_depth": 80, }, "ddad": { "dataset": "ddad", "ddad_root": os.path.join(HOME_DIR, "shortcuts/datasets/ddad/ddad_val/"), "eigen_crop": False, "garg_crop": True, "do_kb_crop": True, "min_depth_eval": 1e-3, "max_depth_eval": 80, "min_depth": 1e-3, "max_depth": 80, }, } ALL_INDOOR = ["nyu", "ibims", "sunrgbd", "diode_indoor", "hypersim_test"] ALL_OUTDOOR = ["kitti", "diml_outdoor", "diode_outdoor", "vkitti2", "ddad"] ALL_EVAL_DATASETS = ALL_INDOOR + ALL_OUTDOOR COMMON_TRAINING_CONFIG = { "dataset": "nyu", "distributed": True, "workers": 16, "clip_grad": 0.1, "use_shared_dict": False, "shared_dict": None, "use_amp": False, "aug": True, "random_crop": False, "random_translate": False, "translate_prob": 0.2, "max_translation": 100, "validate_every": 0.25, "log_images_every": 0.1, "prefetch": False, } def flatten(config, except_keys=('bin_conf')): def recurse(inp): if isinstance(inp, dict): for key, value in inp.items(): if key in except_keys: yield (key, value) if isinstance(value, dict): yield from recurse(value) else: yield (key, value) return dict(list(recurse(config))) def split_combined_args(kwargs): """Splits the arguments that are combined with '__' into multiple arguments. Combined arguments should have equal number of keys and values. Keys are separated by '__' and Values are separated with ';'. For example, '__n_bins__lr=256;0.001' Args: kwargs (dict): key-value pairs of arguments where key-value is optionally combined according to the above format. Returns: dict: Parsed dict with the combined arguments split into individual key-value pairs. """ new_kwargs = dict(kwargs) for key, value in kwargs.items(): if key.startswith("__"): keys = key.split("__")[1:] values = value.split(";") assert len(keys) == len( values), f"Combined arguments should have equal number of keys and values. Keys are separated by '__' and Values are separated with ';'. For example, '__n_bins__lr=256;0.001. Given (keys,values) is ({keys}, {values})" for k, v in zip(keys, values): new_kwargs[k] = v return new_kwargs def parse_list(config, key, dtype=int): """Parse a list of values for the key if the value is a string. The values are separated by a comma. Modifies the config in place. """ if key in config: if isinstance(config[key], str): config[key] = list(map(dtype, config[key].split(','))) assert isinstance(config[key], list) and all([isinstance(e, dtype) for e in config[key]] ), f"{key} should be a list of values dtype {dtype}. Given {config[key]} of type {type(config[key])} with values of type {[type(e) for e in config[key]]}." def get_model_config(model_name, model_version=None): """Find and parse the .json config file for the model. Args: model_name (str): name of the model. The config file should be named config_{model_name}[_{model_version}].json under the models/{model_name} directory. model_version (str, optional): Specific config version. If specified config_{model_name}_{model_version}.json is searched for and used. Otherwise config_{model_name}.json is used. Defaults to None. Returns: easydict: the config dictionary for the model. """ config_fname = f"config_{model_name}_{model_version}.json" if model_version is not None else f"config_{model_name}.json" config_file = os.path.join(ROOT, "models", model_name, config_fname) if not os.path.exists(config_file): return None with open(config_file, "r") as f: config = edict(json.load(f)) # handle dictionary inheritance # only training config is supported for inheritance if "inherit" in config.train and config.train.inherit is not None: inherit_config = get_model_config(config.train["inherit"]).train for key, value in inherit_config.items(): if key not in config.train: config.train[key] = value return edict(config) def update_model_config(config, mode, model_name, model_version=None, strict=False): model_config = get_model_config(model_name, model_version) if model_config is not None: config = {**config, ** flatten({**model_config.model, **model_config[mode]})} elif strict: raise ValueError(f"Config file for model {model_name} not found.") return config def check_choices(name, value, choices): # return # No checks in dev branch if value not in choices: raise ValueError(f"{name} {value} not in supported choices {choices}") KEYS_TYPE_BOOL = ["use_amp", "distributed", "use_shared_dict", "same_lr", "aug", "three_phase", "prefetch", "cycle_momentum"] # Casting is not necessary as their int casted values in config are 0 or 1 def get_config(model_name, mode='train', dataset=None, **overwrite_kwargs): """Main entry point to get the config for the model. Args: model_name (str): name of the desired model. mode (str, optional): "train" or "infer". Defaults to 'train'. dataset (str, optional): If specified, the corresponding dataset configuration is loaded as well. Defaults to None. Keyword Args: key-value pairs of arguments to overwrite the default config. The order of precedence for overwriting the config is (Higher precedence first): # 1. overwrite_kwargs # 2. "config_version": Config file version if specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{config_version}.json # 3. "version_name": Default Model version specific config specified in overwrite_kwargs. The corresponding config loaded is config_{model_name}_{version_name}.json # 4. common_config: Default config for all models specified in COMMON_CONFIG Returns: easydict: The config dictionary for the model. """ check_choices("Model", model_name, ["zoedepth", "zoedepth_nk"]) check_choices("Mode", mode, ["train", "infer", "eval"]) if mode == "train": check_choices("Dataset", dataset, ["nyu", "kitti", "mix", None]) config = flatten({**COMMON_CONFIG, **COMMON_TRAINING_CONFIG}) config = update_model_config(config, mode, model_name) # update with model version specific config version_name = overwrite_kwargs.get("version_name", config["version_name"]) config = update_model_config(config, mode, model_name, version_name) # update with config version if specified config_version = overwrite_kwargs.get("config_version", None) if config_version is not None: print("Overwriting config with config_version", config_version) config = update_model_config(config, mode, model_name, config_version) # update with overwrite_kwargs # Combined args are useful for hyperparameter search overwrite_kwargs = split_combined_args(overwrite_kwargs) config = {**config, **overwrite_kwargs} # Casting to bool # TODO: Not necessary. Remove and test for key in KEYS_TYPE_BOOL: if key in config: config[key] = bool(config[key]) # Model specific post processing of config parse_list(config, "n_attractors") # adjust n_bins for each bin configuration if bin_conf is given and n_bins is passed in overwrite_kwargs if 'bin_conf' in config and 'n_bins' in overwrite_kwargs: bin_conf = config['bin_conf'] # list of dicts n_bins = overwrite_kwargs['n_bins'] new_bin_conf = [] for conf in bin_conf: conf['n_bins'] = n_bins new_bin_conf.append(conf) config['bin_conf'] = new_bin_conf if mode == "train": orig_dataset = dataset if dataset == "mix": dataset = 'nyu' # Use nyu as default for mix. Dataset config is changed accordingly while loading the dataloader if dataset is not None: config['project'] = f"MonoDepth3-{orig_dataset}" # Set project for wandb if dataset is not None: config['dataset'] = dataset config = {**DATASETS_CONFIG[dataset], **config} config['model'] = model_name typed_config = {k: infer_type(v) for k, v in config.items()} # add hostname to config config['hostname'] = platform.node() return edict(typed_config) def change_dataset(config, new_dataset): config.update(DATASETS_CONFIG[new_dataset]) return config ================================================ FILE: annotator/zoe/zoedepth/utils/easydict/__init__.py ================================================ """ EasyDict Copy/pasted from https://github.com/makinacorpus/easydict Original author: Mathieu Leplatre """ class EasyDict(dict): """ Get attributes >>> d = EasyDict({'foo':3}) >>> d['foo'] 3 >>> d.foo 3 >>> d.bar Traceback (most recent call last): ... AttributeError: 'EasyDict' object has no attribute 'bar' Works recursively >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}}) >>> isinstance(d.bar, dict) True >>> d.bar.x 1 Bullet-proof >>> EasyDict({}) {} >>> EasyDict(d={}) {} >>> EasyDict(None) {} >>> d = {'a': 1} >>> EasyDict(**d) {'a': 1} >>> EasyDict((('a', 1), ('b', 2))) {'a': 1, 'b': 2} Set attributes >>> d = EasyDict() >>> d.foo = 3 >>> d.foo 3 >>> d.bar = {'prop': 'value'} >>> d.bar.prop 'value' >>> d {'foo': 3, 'bar': {'prop': 'value'}} >>> d.bar.prop = 'newer' >>> d.bar.prop 'newer' Values extraction >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]}) >>> isinstance(d.bar, list) True >>> from operator import attrgetter >>> list(map(attrgetter('x'), d.bar)) [1, 3] >>> list(map(attrgetter('y'), d.bar)) [2, 4] >>> d = EasyDict() >>> list(d.keys()) [] >>> d = EasyDict(foo=3, bar=dict(x=1, y=2)) >>> d.foo 3 >>> d.bar.x 1 Still like a dict though >>> o = EasyDict({'clean':True}) >>> list(o.items()) [('clean', True)] And like a class >>> class Flower(EasyDict): ... power = 1 ... >>> f = Flower() >>> f.power 1 >>> f = Flower({'height': 12}) >>> f.height 12 >>> f['power'] 1 >>> sorted(f.keys()) ['height', 'power'] update and pop items >>> d = EasyDict(a=1, b='2') >>> e = EasyDict(c=3.0, a=9.0) >>> d.update(e) >>> d.c 3.0 >>> d['c'] 3.0 >>> d.get('c') 3.0 >>> d.update(a=4, b=4) >>> d.b 4 >>> d.pop('a') 4 >>> d.a Traceback (most recent call last): ... AttributeError: 'EasyDict' object has no attribute 'a' """ def __init__(self, d=None, **kwargs): if d is None: d = {} else: d = dict(d) if kwargs: d.update(**kwargs) for k, v in d.items(): setattr(self, k, v) # Class attributes for k in self.__class__.__dict__.keys(): if not (k.startswith('__') and k.endswith('__')) and not k in ('update', 'pop'): setattr(self, k, getattr(self, k)) def __setattr__(self, name, value): if isinstance(value, (list, tuple)): value = [self.__class__(x) if isinstance(x, dict) else x for x in value] elif isinstance(value, dict) and not isinstance(value, self.__class__): value = self.__class__(value) super(EasyDict, self).__setattr__(name, value) super(EasyDict, self).__setitem__(name, value) __setitem__ = __setattr__ def update(self, e=None, **f): d = e or dict() d.update(f) for k in d: setattr(self, k, d[k]) def pop(self, k, d=None): delattr(self, k) return super(EasyDict, self).pop(k, d) if __name__ == "__main__": import doctest doctest.testmod() ================================================ FILE: annotator/zoe/zoedepth/utils/geometry.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat import numpy as np def get_intrinsics(H,W): """ Intrinsics for a pinhole camera model. Assume fov of 55 degrees and central principal point. """ f = 0.5 * W / np.tan(0.5 * 55 * np.pi / 180.0) cx = 0.5 * W cy = 0.5 * H return np.array([[f, 0, cx], [0, f, cy], [0, 0, 1]]) def depth_to_points(depth, R=None, t=None): K = get_intrinsics(depth.shape[1], depth.shape[2]) Kinv = np.linalg.inv(K) if R is None: R = np.eye(3) if t is None: t = np.zeros(3) # M converts from your coordinate to PyTorch3D's coordinate system M = np.eye(3) M[0, 0] = -1.0 M[1, 1] = -1.0 height, width = depth.shape[1:3] x = np.arange(width) y = np.arange(height) coord = np.stack(np.meshgrid(x, y), -1) coord = np.concatenate((coord, np.ones_like(coord)[:, :, [0]]), -1) # z=1 coord = coord.astype(np.float32) # coord = torch.as_tensor(coord, dtype=torch.float32, device=device) coord = coord[None] # bs, h, w, 3 D = depth[:, :, :, None, None] # print(D.shape, Kinv[None, None, None, ...].shape, coord[:, :, :, :, None].shape ) pts3D_1 = D * Kinv[None, None, None, ...] @ coord[:, :, :, :, None] # pts3D_1 live in your coordinate system. Convert them to Py3D's pts3D_1 = M[None, None, None, ...] @ pts3D_1 # from reference to targe tviewpoint pts3D_2 = R[None, None, None, ...] @ pts3D_1 + t[None, None, None, :, None] # pts3D_2 = pts3D_1 # depth_2 = pts3D_2[:, :, :, 2, :] # b,1,h,w return pts3D_2[:, :, :, :3, 0][0] def create_triangles(h, w, mask=None): """ Reference: https://github.com/google-research/google-research/blob/e96197de06613f1b027d20328e06d69829fa5a89/infinite_nature/render_utils.py#L68 Creates mesh triangle indices from a given pixel grid size. This function is not and need not be differentiable as triangle indices are fixed. Args: h: (int) denoting the height of the image. w: (int) denoting the width of the image. Returns: triangles: 2D numpy array of indices (int) with shape (2(W-1)(H-1) x 3) """ x, y = np.meshgrid(range(w - 1), range(h - 1)) tl = y * w + x tr = y * w + x + 1 bl = (y + 1) * w + x br = (y + 1) * w + x + 1 triangles = np.array([tl, bl, tr, br, tr, bl]) triangles = np.transpose(triangles, (1, 2, 0)).reshape( ((w - 1) * (h - 1) * 2, 3)) if mask is not None: mask = mask.reshape(-1) triangles = triangles[mask[triangles].all(1)] return triangles ================================================ FILE: annotator/zoe/zoedepth/utils/misc.py ================================================ # MIT License # Copyright (c) 2022 Intelligent Systems Lab Org # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # File author: Shariq Farooq Bhat """Miscellaneous utility functions.""" from scipy import ndimage import base64 import math import re from io import BytesIO import matplotlib import matplotlib.cm import numpy as np import requests import torch import torch.distributed as dist import torch.nn import torch.nn as nn import torch.utils.data.distributed from PIL import Image from torchvision.transforms import ToTensor class RunningAverage: def __init__(self): self.avg = 0 self.count = 0 def append(self, value): self.avg = (value + self.count * self.avg) / (self.count + 1) self.count += 1 def get_value(self): return self.avg def denormalize(x): """Reverses the imagenet normalization applied to the input. Args: x (torch.Tensor - shape(N,3,H,W)): input tensor Returns: torch.Tensor - shape(N,3,H,W): Denormalized input """ mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device) std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device) return x * std + mean class RunningAverageDict: """A dictionary of running averages.""" def __init__(self): self._dict = None def update(self, new_dict): if new_dict is None: return if self._dict is None: self._dict = dict() for key, value in new_dict.items(): self._dict[key] = RunningAverage() for key, value in new_dict.items(): self._dict[key].append(value) def get_value(self): if self._dict is None: return None return {key: value.get_value() for key, value in self._dict.items()} def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None): """Converts a depth map to a color image. Args: value (torch.Tensor, numpy.ndarry): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None. vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None. cmap (str, optional): matplotlib colormap to use. Defaults to 'magma_r'. invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99. invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None. background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255). gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False. value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None. Returns: numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4) """ if isinstance(value, torch.Tensor): value = value.detach().cpu().numpy() value = value.squeeze() if invalid_mask is None: invalid_mask = value == invalid_val mask = np.logical_not(invalid_mask) # normalize vmin = np.percentile(value[mask],2) if vmin is None else vmin vmax = np.percentile(value[mask],85) if vmax is None else vmax if vmin != vmax: value = (value - vmin) / (vmax - vmin) # vmin..vmax else: # Avoid 0-division value = value * 0. # squeeze last dim if it exists # grey out the invalid values value[invalid_mask] = np.nan cmapper = matplotlib.cm.get_cmap(cmap) if value_transform: value = value_transform(value) # value = value / value.max() value = cmapper(value, bytes=True) # (nxmx4) # img = value[:, :, :] img = value[...] img[invalid_mask] = background_color # return img.transpose((2, 0, 1)) if gamma_corrected: # gamma correction img = img / 255 img = np.power(img, 2.2) img = img * 255 img = img.astype(np.uint8) return img def count_parameters(model, include_all=False): return sum(p.numel() for p in model.parameters() if p.requires_grad or include_all) def compute_errors(gt, pred): """Compute metrics for 'pred' compared to 'gt' Args: gt (numpy.ndarray): Ground truth values pred (numpy.ndarray): Predicted values gt.shape should be equal to pred.shape Returns: dict: Dictionary containing the following metrics: 'a1': Delta1 accuracy: Fraction of pixels that are within a scale factor of 1.25 'a2': Delta2 accuracy: Fraction of pixels that are within a scale factor of 1.25^2 'a3': Delta3 accuracy: Fraction of pixels that are within a scale factor of 1.25^3 'abs_rel': Absolute relative error 'rmse': Root mean squared error 'log_10': Absolute log10 error 'sq_rel': Squared relative error 'rmse_log': Root mean squared error on the log scale 'silog': Scale invariant log error """ thresh = np.maximum((gt / pred), (pred / gt)) a1 = (thresh < 1.25).mean() a2 = (thresh < 1.25 ** 2).mean() a3 = (thresh < 1.25 ** 3).mean() abs_rel = np.mean(np.abs(gt - pred) / gt) sq_rel = np.mean(((gt - pred) ** 2) / gt) rmse = (gt - pred) ** 2 rmse = np.sqrt(rmse.mean()) rmse_log = (np.log(gt) - np.log(pred)) ** 2 rmse_log = np.sqrt(rmse_log.mean()) err = np.log(pred) - np.log(gt) silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100 log_10 = (np.abs(np.log10(gt) - np.log10(pred))).mean() return dict(a1=a1, a2=a2, a3=a3, abs_rel=abs_rel, rmse=rmse, log_10=log_10, rmse_log=rmse_log, silog=silog, sq_rel=sq_rel) def compute_metrics(gt, pred, interpolate=True, garg_crop=False, eigen_crop=True, dataset='nyu', min_depth_eval=0.1, max_depth_eval=10, **kwargs): """Compute metrics of predicted depth maps. Applies cropping and masking as necessary or specified via arguments. Refer to compute_errors for more details on metrics. """ if 'config' in kwargs: config = kwargs['config'] garg_crop = config.garg_crop eigen_crop = config.eigen_crop min_depth_eval = config.min_depth_eval max_depth_eval = config.max_depth_eval if gt.shape[-2:] != pred.shape[-2:] and interpolate: pred = nn.functional.interpolate( pred, gt.shape[-2:], mode='bilinear', align_corners=True) pred = pred.squeeze().cpu().numpy() pred[pred < min_depth_eval] = min_depth_eval pred[pred > max_depth_eval] = max_depth_eval pred[np.isinf(pred)] = max_depth_eval pred[np.isnan(pred)] = min_depth_eval gt_depth = gt.squeeze().cpu().numpy() valid_mask = np.logical_and( gt_depth > min_depth_eval, gt_depth < max_depth_eval) if garg_crop or eigen_crop: gt_height, gt_width = gt_depth.shape eval_mask = np.zeros(valid_mask.shape) if garg_crop: eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height), int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1 elif eigen_crop: # print("-"*10, " EIGEN CROP ", "-"*10) if dataset == 'kitti': eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height), int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1 else: # assert gt_depth.shape == (480, 640), "Error: Eigen crop is currently only valid for (480, 640) images" eval_mask[45:471, 41:601] = 1 else: eval_mask = np.ones(valid_mask.shape) valid_mask = np.logical_and(valid_mask, eval_mask) return compute_errors(gt_depth[valid_mask], pred[valid_mask]) #################################### Model uilts ################################################ def parallelize(config, model, find_unused_parameters=True): if config.gpu is not None: torch.cuda.set_device(config.gpu) model = model.cuda(config.gpu) config.multigpu = False if config.distributed: # Use DDP config.multigpu = True config.rank = config.rank * config.ngpus_per_node + config.gpu dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url, world_size=config.world_size, rank=config.rank) config.batch_size = int(config.batch_size / config.ngpus_per_node) # config.batch_size = 8 config.workers = int( (config.num_workers + config.ngpus_per_node - 1) / config.ngpus_per_node) print("Device", config.gpu, "Rank", config.rank, "batch size", config.batch_size, "Workers", config.workers) torch.cuda.set_device(config.gpu) model = nn.SyncBatchNorm.convert_sync_batchnorm(model) model = model.cuda(config.gpu) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.gpu], output_device=config.gpu, find_unused_parameters=find_unused_parameters) elif config.gpu is None: # Use DP config.multigpu = True model = model.cuda() model = torch.nn.DataParallel(model) return model ################################################################################################# ##################################################################################################### class colors: '''Colors class: Reset all colors with colors.reset Two subclasses fg for foreground and bg for background. Use as colors.subclass.colorname. i.e. colors.fg.red or colors.bg.green Also, the generic bold, disable, underline, reverse, strikethrough, and invisible work with the main class i.e. colors.bold ''' reset = '\033[0m' bold = '\033[01m' disable = '\033[02m' underline = '\033[04m' reverse = '\033[07m' strikethrough = '\033[09m' invisible = '\033[08m' class fg: black = '\033[30m' red = '\033[31m' green = '\033[32m' orange = '\033[33m' blue = '\033[34m' purple = '\033[35m' cyan = '\033[36m' lightgrey = '\033[37m' darkgrey = '\033[90m' lightred = '\033[91m' lightgreen = '\033[92m' yellow = '\033[93m' lightblue = '\033[94m' pink = '\033[95m' lightcyan = '\033[96m' class bg: black = '\033[40m' red = '\033[41m' green = '\033[42m' orange = '\033[43m' blue = '\033[44m' purple = '\033[45m' cyan = '\033[46m' lightgrey = '\033[47m' def printc(text, color): print(f"{color}{text}{colors.reset}") ############################################ def get_image_from_url(url): response = requests.get(url) img = Image.open(BytesIO(response.content)).convert("RGB") return img def url_to_torch(url, size=(384, 384)): img = get_image_from_url(url) img = img.resize(size, Image.ANTIALIAS) img = torch.from_numpy(np.asarray(img)).float() img = img.permute(2, 0, 1) img.div_(255) return img def pil_to_batched_tensor(img): return ToTensor()(img).unsqueeze(0) def save_raw_16bit(depth, fpath="raw.png"): if isinstance(depth, torch.Tensor): depth = depth.squeeze().cpu().numpy() assert isinstance(depth, np.ndarray), "Depth must be a torch tensor or numpy array" assert depth.ndim == 2, "Depth must be 2D" depth = depth * 256 # scale for 16-bit png depth = depth.astype(np.uint16) depth = Image.fromarray(depth) depth.save(fpath) print("Saved raw depth to", fpath) ================================================ FILE: example/advanced_weighting_example/api_advanced_weighting.py ================================================ import os import io import cv2 import base64 import requests """ To use this example make sure you've done the following steps before executing: 1. Ensure automatic1111 is running in api mode with the controlnet extension. Use the following command in your terminal to activate: ./webui.sh --no-half --api 2. Validate python environment meet package dependencies. If running in a local repo you'll likely need to pip install cv2, requests and PIL """ def generate(url: str, payload: dict, file_suffix: str = ""): response = requests.post(url=url, json=payload).json() if "images" not in response: print(response) else: for i, base64image in enumerate(response["images"]): with open(f"{os.path.basename(url)}-{i}{file_suffix}.png", 'wb') as f: f.write(base64.b64decode(response['images'][i])) def read_image(img_path: str) -> str: img = cv2.imread(img_path) _, bytes = cv2.imencode(".png", img) encoded_image = base64.b64encode(bytes).decode("utf-8") return encoded_image input_image = read_image("stock_mountain.png") txt2img_payload = { "alwayson_scripts": { "ControlNet": { "args": [ { "batch_images": "", "control_mode": "Balanced", "enabled": True, "guidance_end": 1, "guidance_start": 0, "image": input_image, "low_vram": False, "model": "control_v11p_sd15_canny [d14c016b]", "module": "canny", "pixel_perfect": False, "processor_res": -1, "resize_mode": "Crop and Resize", "save_detected_map": True, "threshold_a": -1, "threshold_b": -1, "weight": 1, } ] } }, "batch_size": 1, "cfg_scale": 7, "comments": {}, "disable_extra_networks": False, "do_not_save_grid": False, "do_not_save_samples": False, "enable_hr": False, "height": 512, "width": 768, "hr_negative_prompt": "", "hr_prompt": "", "hr_resize_x": 0, "hr_resize_y": 0, "hr_scale": 2, "hr_second_pass_steps": 0, "hr_upscaler": "Latent", "n_iter": 1, "negative_prompt": "", "override_settings": {}, "override_settings_restore_afterwards": True, "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality, a large avalanche", "restore_faces": False, "s_churn": 0.0, "s_min_uncond": 0, "s_noise": 1.0, "s_tmax": None, "s_tmin": 0.0, "sampler_name": "DPM++ 2M Karras", "script_args": [], "script_name": None, "seed": 42, "seed_enable_extras": True, "seed_resize_from_h": -1, "seed_resize_from_w": -1, "steps": 30, "styles": [], "subseed": -1, "subseed_strength": 0, "tiling": False, } if __name__ == "__main__": url = "http://localhost:7860/sdapi/v1/" for weight_factor in (0.3, 0.5, 0.8): advanced_weighting = [weight_factor ** float(12 - i) for i in range(13)] txt2img_payload["alwayson_scripts"]["ControlNet"]["args"][0][ "advanced_weighting" ] = advanced_weighting generate(url + "txt2img", txt2img_payload, file_suffix=f"fac{weight_factor}") for linear_start in (0.3, 0.5, 0.8): step = (1.0 - linear_start) / 12 advanced_weighting = [linear_start + i * step for i in range(13)] txt2img_payload["alwayson_scripts"]["ControlNet"]["args"][0][ "advanced_weighting" ] = advanced_weighting generate(url + "txt2img", txt2img_payload, file_suffix=f"linear{linear_start}") ================================================ FILE: example/chatgpt.py ================================================ import os import re import uuid import cv2 import torch import requests import io, base64 import numpy as np import gradio as gr from PIL import Image from omegaconf import OmegaConf from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation from langchain.agents.initialize import initialize_agent from langchain.agents.tools import Tool from langchain.chains.conversation.memory import ConversationBufferMemory from langchain.llms.openai import OpenAI VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. Visual ChatGPT is able to process and understand large amounts of text and images. As a language model, Visual ChatGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Visual ChatGPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated. Human may provide new figures to Visual ChatGPT with a description. The description helps Visual ChatGPT to understand this image, but Visual ChatGPT should use tools to finish following tasks, rather than directly imagine from the description. Overall, Visual ChatGPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. TOOLS: ------ Visual ChatGPT has access to the following tools:""" VISUAL_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format: ``` Thought: Do I need to use a tool? Yes Action: the action to take, should be one of [{tool_names}] Action Input: the input to the action Observation: the result of the action ``` When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format: ``` Thought: Do I need to use a tool? No {ai_prefix}: [your response here] ``` """ VISUAL_CHATGPT_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if it does not exist. You will remember to provide the image file name loyally if it's provided in the last tool observation. Begin! Previous conversation history: {chat_history} New input: {input} Since Visual ChatGPT is a text language model, Visual ChatGPT must use tools to observe images rather than imagination. The thoughts and observations are only visible for Visual ChatGPT, Visual ChatGPT should remember to repeat important information in the final response for Human. Thought: Do I need to use a tool? {agent_scratchpad}""" ENDPOINT = "http://localhost:7860" T2IAPI = ENDPOINT + "/controlnet/txt2img" DETECTAPI = ENDPOINT + "/controlnet/detect" MODELLIST = ENDPOINT + "/controlnet/model_list" device = "cpu" if torch.cuda.is_available(): device = "cuda" def readImage(path): img = cv2.imread(path) retval, buffer = cv2.imencode('.jpg', img) b64img = base64.b64encode(buffer).decode("utf-8") return b64img def get_model(pattern='^control_canny.*'): r = requests.get(MODELLIST) result = r.json()["model_list"] for item in result: if re.match(pattern, item): return item def do_webui_request(url=T2IAPI, **kwargs): reqbody = { "prompt": "best quality, extremely detailed", "negative_prompt": "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", "seed": -1, "subseed": -1, "subseed_strength": 0, "batch_size": 1, "n_iter": 1, "steps": 15, "cfg_scale": 7, "width": 512, "height": 768, "restore_faces": True, "eta": 0, "sampler_index": "Euler a", "controlnet_input_images": [], "controlnet_module": 'canny', "controlnet_model": 'control_canny-fp16 [e3fe7712]', "controlnet_guidance": 1.0, } reqbody.update(kwargs) r = requests.post(url, json=reqbody) return r.json() def cut_dialogue_history(history_memory, keep_last_n_words=500): tokens = history_memory.split() n_tokens = len(tokens) print(f"hitory_memory:{history_memory}, n_tokens: {n_tokens}") if n_tokens < keep_last_n_words: return history_memory else: paragraphs = history_memory.split('\n') last_n_tokens = n_tokens while last_n_tokens >= keep_last_n_words: last_n_tokens = last_n_tokens - len(paragraphs[0].split(' ')) paragraphs = paragraphs[1:] return '\n' + '\n'.join(paragraphs) def get_new_image_name(org_img_name, func_name="update"): head_tail = os.path.split(org_img_name) head = head_tail[0] tail = head_tail[1] name_split = tail.split('.')[0].split('_') this_new_uuid = str(uuid.uuid4())[0:4] if len(name_split) == 1: most_org_file_name = name_split[0] recent_prev_file_name = name_split[0] new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) else: assert len(name_split) == 4 most_org_file_name = name_split[3] recent_prev_file_name = name_split[0] new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) return os.path.join(head, new_file_name) class MaskFormer: def __init__(self, device): self.device = device self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device) def inference(self, image_path, text): threshold = 0.5 min_area = 0.02 padding = 20 original_image = Image.open(image_path) image = original_image.resize((512, 512)) inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt",).to(self.device) with torch.no_grad(): outputs = self.model(**inputs) mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold area_ratio = len(np.argwhere(mask)) / (mask.shape[0] * mask.shape[1]) if area_ratio < min_area: return None true_indices = np.argwhere(mask) mask_array = np.zeros_like(mask, dtype=bool) for idx in true_indices: padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx) mask_array[padded_slice] = True visual_mask = (mask_array * 255).astype(np.uint8) image_mask = Image.fromarray(visual_mask) return image_mask.resize(image.size) # class ImageEditing: # def __init__(self, device): # print("Initializing StableDiffusionInpaint to %s" % device) # self.device = device # self.mask_former = MaskFormer(device=self.device) # # self.inpainting = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting",).to(device) # def remove_part_of_image(self, input): # image_path, to_be_removed_txt = input.split(",") # print(f'remove_part_of_image: to_be_removed {to_be_removed_txt}') # return self.replace_part_of_image(f"{image_path},{to_be_removed_txt},background") # def replace_part_of_image(self, input): # image_path, to_be_replaced_txt, replace_with_txt = input.split(",") # print(f'replace_part_of_image: replace_with_txt {replace_with_txt}') # mask_image = self.mask_former.inference(image_path, to_be_replaced_txt) # buffered = io.BytesIO() # mask_image.save(buffered, format="JPEG") # resp = do_webui_request( # url=ENDPOINT + "/sdapi/v1/img2img", # init_images=[readImage(image_path)], # mask=base64.b64encode(buffered.getvalue()).decode("utf-8"), # prompt=replace_with_txt, # ) # updated_image_path = get_new_image_name(image_path, func_name="replace-something") # with open(updated_image_path, 'wb') as f: # f.write(base64.b64decode(resp['images'][0])) # return updated_image_path # class Pix2Pix: # def __init__(self, device): # print("Initializing Pix2Pix to %s" % device) # self.device = device # self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", torch_dtype=torch.float16, safety_checker=None).to(device) # self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config) # def inference(self, inputs): # """Change style of image.""" # print("===>Starting Pix2Pix Inference") # image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) # original_image = Image.open(image_path) # image = self.pipe(instruct_text,image=original_image,num_inference_steps=40,image_guidance_scale=1.2,).images[0] # updated_image_path = get_new_image_name(image_path, func_name="pix2pix") # image.save(updated_image_path) # return updated_image_path class T2I: def __init__(self, device): print("Initializing T2I to %s" % device) self.device = device self.text_refine_tokenizer = AutoTokenizer.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion") self.text_refine_model = AutoModelForCausalLM.from_pretrained("Gustavosta/MagicPrompt-Stable-Diffusion") self.text_refine_gpt2_pipe = pipeline("text-generation", model=self.text_refine_model, tokenizer=self.text_refine_tokenizer, device=self.device) def inference(self, text): image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") refined_text = self.text_refine_gpt2_pipe(text)[0]["generated_text"] print(f'{text} refined to {refined_text}') resp = do_webui_request( url=ENDPOINT + "/sdapi/v1/txt2img", prompt=refined_text, ) with open(image_filename, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) print(f"Processed T2I.run, text: {text}, image_filename: {image_filename}") return image_filename class ImageCaptioning: def __init__(self, device): print("Initializing ImageCaptioning to %s" % device) self.device = device self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device) def inference(self, image_path): inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device) out = self.model.generate(**inputs) captions = self.processor.decode(out[0], skip_special_tokens=True) return captions class image2canny: def inference(self, inputs): print("===>Starting image2canny Inference") resp = do_webui_request( url=DETECTAPI, controlnet_input_images=[readImage(inputs)], controlnet_module="segmentation", ) updated_image_path = get_new_image_name(inputs, func_name="edge") image.save(updated_image_path) return updated_image_path class canny2image: def inference(self, inputs): print("===>Starting canny2image Inference") image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) resp = do_webui_request( prompt=instruct_text, controlnet_input_images=[readImage(image_path)], controlnet_module="none", controlnet_model=get_model(pattern='^control_canny.*'), ) updated_image_path = get_new_image_name(image_path, func_name="canny2image") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class image2line: def inference(self, inputs): print("===>Starting image2hough Inference") resp = do_webui_request( url=DETECTAPI, controlnet_input_images=[readImage(inputs)], controlnet_module="mlsd", ) updated_image_path = get_new_image_name(inputs, func_name="line-of") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class line2image: def inference(self, inputs): print("===>Starting line2image Inference") image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) resp = do_webui_request( prompt=instruct_text, controlnet_input_images=[readImage(image_path)], controlnet_module="none", controlnet_model=get_model(pattern='^control_mlsd.*'), ) updated_image_path = get_new_image_name(image_path, func_name="line2image") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class image2hed: def inference(self, inputs): print("===>Starting image2hed Inference") resp = do_webui_request( url=DETECTAPI, controlnet_input_images=[readImage(inputs)], controlnet_module="hed", ) updated_image_path = get_new_image_name(inputs, func_name="hed-boundary") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class hed2image: def inference(self, inputs): print("===>Starting hed2image Inference") resp = do_webui_request( prompt=instruct_text, controlnet_input_images=[readImage(image_path)], controlnet_module="none", controlnet_model=get_model(pattern='^control_hed.*'), ) updated_image_path = get_new_image_name(image_path, func_name="hed2image") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class image2scribble: def inference(self, inputs): print("===>Starting image2scribble Inference") resp = do_webui_request( url=DETECTAPI, controlnet_input_images=[readImage(inputs)], controlnet_module="scribble", ) updated_image_path = get_new_image_name(inputs, func_name="scribble") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class scribble2image: def inference(self, inputs): print("===>Starting seg2image Inference") image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) resp = do_webui_request( prompt=instruct_text, controlnet_input_images=[readImage(image_path)], controlnet_module="none", controlnet_model=get_model(pattern='^control_scribble.*'), ) updated_image_path = get_new_image_name(image_path, func_name="scribble2image") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class image2pose: def inference(self, inputs): print("===>Starting image2pose Inference") resp = do_webui_request( url=DETECTAPI, controlnet_input_images=[readImage(inputs)], controlnet_module="openpose", ) updated_image_path = get_new_image_name(inputs, func_name="human-pose") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class pose2image: def inference(self, inputs): print("===>Starting pose2image Inference") image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) resp = do_webui_request( prompt=instruct_text, controlnet_input_images=[readImage(image_path)], controlnet_module="none", controlnet_model=get_model(pattern='^control_openpose.*'), ) updated_image_path = get_new_image_name(image_path, func_name="pose2image") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class image2seg: def inference(self, inputs): print("===>Starting image2seg Inference") resp = do_webui_request( url=DETECTAPI, controlnet_input_images=[readImage(inputs)], controlnet_module="segmentation", ) updated_image_path = get_new_image_name(inputs, func_name="segmentation") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class seg2image: def inference(self, inputs): print("===>Starting seg2image Inference") image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) resp = do_webui_request( prompt=instruct_text, controlnet_input_images=[readImage(image_path)], controlnet_module="none", controlnet_model=get_model(pattern='^control_seg.*'), ) updated_image_path = get_new_image_name(image_path, func_name="segment2image") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class image2depth: def inference(self, inputs): print("===>Starting image2depth Inference") resp = do_webui_request( url=DETECTAPI, controlnet_input_images=[readImage(inputs)], controlnet_module="depth", ) updated_image_path = get_new_image_name(inputs, func_name="depth") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class depth2image: def inference(self, inputs): print("===>Starting depth2image Inference") image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) resp = do_webui_request( prompt=instruct_text, controlnet_input_images=[readImage(image_path)], controlnet_module="depth", controlnet_model=get_model(pattern='^control_depth.*'), ) updated_image_path = get_new_image_name(image_path, func_name="depth2image") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class image2normal: def inference(self, inputs): print("===>Starting image2 normal Inference") resp = do_webui_request( url=DETECTAPI, controlnet_input_images=[readImage(inputs)], controlnet_module="normal", ) updated_image_path = get_new_image_name(inputs, func_name="normal-map") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class normal2image: def inference(self, inputs): print("===>Starting normal2image Inference") image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) resp = do_webui_request( prompt=instruct_text, controlnet_input_images=[readImage(image_path)], controlnet_module="normal", controlnet_model=get_model(pattern='^control_normal.*'), ) updated_image_path = get_new_image_name(image_path, func_name="normal2image") with open(updated_image_path, 'wb') as f: f.write(base64.b64decode(resp['images'][0])) return updated_image_path class BLIPVQA: def __init__(self, device): print("Initializing BLIP VQA to %s" % device) self.device = device self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") self.model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(self.device) def get_answer_from_question_and_image(self, inputs): image_path, question = inputs.split(",") raw_image = Image.open(image_path).convert('RGB') print(F'BLIPVQA :question :{question}') inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device) out = self.model.generate(**inputs) answer = self.processor.decode(out[0], skip_special_tokens=True) return answer class ConversationBot: def __init__(self): print("Initializing VisualChatGPT") # self.edit = ImageEditing(device=device) self.i2t = ImageCaptioning(device=device) self.t2i = T2I(device=device) self.image2canny = image2canny() self.canny2image = canny2image() self.image2line = image2line() self.line2image = line2image() self.image2hed = image2hed() self.hed2image = hed2image() self.image2scribble = image2scribble() self.scribble2image = scribble2image() self.image2pose = image2pose() self.pose2image = pose2image() self.BLIPVQA = BLIPVQA(device=device) self.image2seg = image2seg() self.seg2image = seg2image() self.image2depth = image2depth() self.depth2image = depth2image() self.image2normal = image2normal() self.normal2image = normal2image() # self.pix2pix = Pix2Pix(device="cuda:3") self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output') self.tools = [ Tool(name="Get Photo Description", func=self.i2t.inference, description="useful when you want to know what is inside the photo. receives image_path as input. " "The input to this tool should be a string, representing the image_path. "), Tool(name="Generate Image From User Input Text", func=self.t2i.inference, description="useful when you want to generate an image from a user input text and save it to a file. like: generate an image of an object or something, or generate an image that includes some objects. " "The input to this tool should be a string, representing the text used to generate image. "), # Tool(name="Remove Something From The Photo", func=self.edit.remove_part_of_image, # description="useful when you want to remove and object or something from the photo from its description or location. " # "The input to this tool should be a comma seperated string of two, representing the image_path and the object need to be removed. "), # Tool(name="Replace Something From The Photo", func=self.edit.replace_part_of_image, # description="useful when you want to replace an object from the object description or location with another object from its description. " # "The input to this tool should be a comma seperated string of three, representing the image_path, the object to be replaced, the object to be replaced with "), # Tool(name="Instruct Image Using Text", func=self.pix2pix.inference, # description="useful when you want to the style of the image to be like the text. like: make it look like a painting. or make it like a robot. " # "The input to this tool should be a comma seperated string of two, representing the image_path and the text. "), Tool(name="Answer Question About The Image", func=self.BLIPVQA.get_answer_from_question_and_image, description="useful when you need an answer for a question based on an image. like: what is the background color of the last image, how many cats in this figure, what is in this figure. " "The input to this tool should be a comma seperated string of two, representing the image_path and the question"), Tool(name="Edge Detection On Image", func=self.image2canny.inference, description="useful when you want to detect the edge of the image. like: detect the edges of this image, or canny detection on image, or peform edge detection on this image, or detect the canny image of this image. " "The input to this tool should be a string, representing the image_path"), Tool(name="Generate Image Condition On Canny Image", func=self.canny2image.inference, description="useful when you want to generate a new real image from both the user desciption and a canny image. like: generate a real image of a object or something from this canny image, or generate a new real image of a object or something from this edge image. " "The input to this tool should be a comma seperated string of two, representing the image_path and the user description. "), Tool(name="Line Detection On Image", func=self.image2line.inference, description="useful when you want to detect the straight line of the image. like: detect the straight lines of this image, or straight line detection on image, or peform straight line detection on this image, or detect the straight line image of this image. " "The input to this tool should be a string, representing the image_path"), Tool(name="Generate Image Condition On Line Image", func=self.line2image.inference, description="useful when you want to generate a new real image from both the user desciption and a straight line image. like: generate a real image of a object or something from this straight line image, or generate a new real image of a object or something from this straight lines. " "The input to this tool should be a comma seperated string of two, representing the image_path and the user description. "), Tool(name="Hed Detection On Image", func=self.image2hed.inference, description="useful when you want to detect the soft hed boundary of the image. like: detect the soft hed boundary of this image, or hed boundary detection on image, or peform hed boundary detection on this image, or detect soft hed boundary image of this image. " "The input to this tool should be a string, representing the image_path"), Tool(name="Generate Image Condition On Soft Hed Boundary Image", func=self.hed2image.inference, description="useful when you want to generate a new real image from both the user desciption and a soft hed boundary image. like: generate a real image of a object or something from this soft hed boundary image, or generate a new real image of a object or something from this hed boundary. " "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"), Tool(name="Segmentation On Image", func=self.image2seg.inference, description="useful when you want to detect segmentations of the image. like: segment this image, or generate segmentations on this image, or peform segmentation on this image. " "The input to this tool should be a string, representing the image_path"), Tool(name="Generate Image Condition On Segmentations", func=self.seg2image.inference, description="useful when you want to generate a new real image from both the user desciption and segmentations. like: generate a real image of a object or something from this segmentation image, or generate a new real image of a object or something from these segmentations. " "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"), Tool(name="Predict Depth On Image", func=self.image2depth.inference, description="useful when you want to detect depth of the image. like: generate the depth from this image, or detect the depth map on this image, or predict the depth for this image. " "The input to this tool should be a string, representing the image_path"), Tool(name="Generate Image Condition On Depth", func=self.depth2image.inference, description="useful when you want to generate a new real image from both the user desciption and depth image. like: generate a real image of a object or something from this depth image, or generate a new real image of a object or something from the depth map. " "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"), Tool(name="Predict Normal Map On Image", func=self.image2normal.inference, description="useful when you want to detect norm map of the image. like: generate normal map from this image, or predict normal map of this image. " "The input to this tool should be a string, representing the image_path"), Tool(name="Generate Image Condition On Normal Map", func=self.normal2image.inference, description="useful when you want to generate a new real image from both the user desciption and normal map. like: generate a real image of a object or something from this normal map, or generate a new real image of a object or something from the normal map. " "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"), Tool(name="Sketch Detection On Image", func=self.image2scribble.inference, description="useful when you want to generate a scribble of the image. like: generate a scribble of this image, or generate a sketch from this image, detect the sketch from this image. " "The input to this tool should be a string, representing the image_path"), Tool(name="Generate Image Condition On Sketch Image", func=self.scribble2image.inference, description="useful when you want to generate a new real image from both the user desciption and a scribble image or a sketch image. " "The input to this tool should be a comma seperated string of two, representing the image_path and the user description"), Tool(name="Pose Detection On Image", func=self.image2pose.inference, description="useful when you want to detect the human pose of the image. like: generate human poses of this image, or generate a pose image from this image. " "The input to this tool should be a string, representing the image_path"), Tool(name="Generate Image Condition On Pose Image", func=self.pose2image.inference, description="useful when you want to generate a new real image from both the user desciption and a human pose image. like: generate a real image of a human from this human pose image, or generate a new real image of a human from this pose. " "The input to this tool should be a comma seperated string of two, representing the image_path and the user description")] def init_langchain(self, openai_api_key): self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key) self.agent = initialize_agent( self.tools, self.llm, agent="conversational-react-description", verbose=True, memory=self.memory, return_intermediate_steps=True, agent_kwargs={'prefix': VISUAL_CHATGPT_PREFIX, 'format_instructions': VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, 'suffix': VISUAL_CHATGPT_SUFFIX} ) def run_text(self, openai_api_key, text, state): if not hasattr(self, "agent"): self.init_langchain(openai_api_key) print("===============Running run_text =============") print("Inputs:", text, state) print("======>Previous memory:\n %s" % self.agent.memory) self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500) res = self.agent({"input": text}) print("======>Current memory:\n %s" % self.agent.memory) response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output']) state = state + [(text, response)] print("Outputs:", state) return state, state def run_image(self, openai_api_key, image, state, txt): if not hasattr(self, "agent"): self.init_langchain(openai_api_key) print("===============Running run_image =============") print("Inputs:", image, state) print("======>Previous memory:\n %s" % self.agent.memory) image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") print("======>Auto Resize Image...") img = Image.open(image.name) width, height = img.size ratio = min(512 / width, 512 / height) width_new, height_new = (round(width * ratio), round(height * ratio)) img = img.resize((width_new, height_new)) img = img.convert('RGB') img.save(image_filename, "PNG") print(f"Resize image form {width}x{height} to {width_new}x{height_new}") description = self.i2t.inference(image_filename) Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. This information helps you to understand this image, but you should use tools to finish following tasks, " \ "rather than directly imagine from my description. If you understand, say \"Received\". \n".format(image_filename, description) AI_prompt = "Received. " self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt print("======>Current memory:\n %s" % self.agent.memory) state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)] print("Outputs:", state) return state, state, txt + ' ' + image_filename + ' ' if __name__ == '__main__': os.makedirs("image/", exist_ok=True) bot = ConversationBot() with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo: openai_api_key = gr.Textbox(type="password", label="Enter your OpenAI API key here") chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT") state = gr.State([]) with gr.Row(): with gr.Column(scale=0.7): txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style(container=False) with gr.Column(scale=0.15, min_width=0): clear = gr.Button("Clear️") with gr.Column(scale=0.15, min_width=0): btn = gr.UploadButton("Upload", file_types=["image"]) txt.submit(bot.run_text, [openai_api_key, txt, state], [chatbot, state]) txt.submit(lambda: "", None, txt) btn.upload(bot.run_image, [openai_api_key, btn, state, txt], [chatbot, state, txt]) clear.click(bot.memory.clear) clear.click(lambda: [], None, chatbot) clear.click(lambda: [], None, state) demo.launch(server_name="0.0.0.0", server_port=7864) ================================================ FILE: example/inpaint_example/api_inpaint.py ================================================ import os import io import cv2 import base64 import requests """ To use this example make sure you've done the following steps before executing: 1. Ensure automatic1111 is running in api mode with the controlnet extension. Use the following command in your terminal to activate: ./webui.sh --no-half --api 2. Validate python environment meet package dependencies. If running in a local repo you'll likely need to pip install cv2, requests and PIL """ def generate(url: str, payload: dict): response = requests.post(url=url, json=payload).json() if "images" not in response: print(response) else: for i, base64image in enumerate(response["images"]): with open(f"{os.path.basename(url)}-{i}{file_suffix}.png", 'wb') as f: f.write(base64.b64decode(response['images'][i])) def read_image(img_path: str) -> str: img = cv2.imread(img_path) _, bytes = cv2.imencode(".png", img) encoded_image = base64.b64encode(bytes).decode("utf-8") return encoded_image input_image = read_image("1girl.png") mask_image = read_image("mask.png") img2img_payload = { "batch_size": 1, "cfg_scale": 7, "height": 768, "width": 512, "n_iter": 1, "steps": 30, "sampler_name": "DPM++ 2M Karras", "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality,", "negative_prompt": "(worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, backlight,(ugly:1.331), (duplicate:1.331), (morbid:1.21), (mutilated:1.21), (tranny:1.331), mutated hands, (poorly drawn hands:1.331), blurry, (bad anatomy:1.21), (bad proportions:1.331), extra limbs, (more than 2 nipples:1.331), (missing arms:1.331), (extra legs:1.331), (fused fingers:1.61051), (too many fingers:1.61051), (unclear eyes:1.331), bad hands, missing fingers, extra digit, (futa:1.1), bad body, pubic hair, glans, easynegative,more than 2 tits, ng_deepnegative_v1_75t,(big fee:1),more than 2 feet,incorrect feet", "seed": 42, "seed_enable_extras": False, "seed_resize_from_h": 0, "seed_resize_from_w": 0, "subseed": -1, "subseed_strength": 0, "override_settings": {}, "override_settings_restore_afterwards": False, "do_not_save_grid": False, "do_not_save_samples": False, "s_churn": 0, "s_min_uncond": 0, "s_noise": 1, "s_tmax": None, "s_tmin": 0, "script_args": [], "script_name": None, "styles": [], "alwayson_scripts": { "ControlNet": { "args": [ { "control_mode": "Balanced", "enabled": True, "guidance_end": 1, "guidance_start": 0, "low_vram": False, "model": "control_v11p_sd15_inpaint [ebff9138]", "module": "inpaint_only", "pixel_perfect": True, "processor_res": 512, "resize_mode": "Crop and Resize", "threshold_a": 64, "threshold_b": 64, "weight": 1, } ] } }, "denoising_strength": 0.75, "initial_noise_multiplier": 1, "inpaint_full_res": 0, "inpaint_full_res_padding": 32, "inpainting_fill": 1, "inpainting_mask_invert": 0, "mask_blur_x": 4, "mask_blur_y": 4, "mask_blur": 4, "resize_mode": 0, "init_images": [input_image], "mask": mask_image, } txt2img_payload = { "alwayson_scripts": { "ControlNet": { "args": [ { "batch_images": "", "control_mode": "Balanced", "enabled": True, "guidance_end": 1, "guidance_start": 0, "image": { "image": input_image, "mask": mask_image, }, "low_vram": False, "model": "control_v11p_sd15_inpaint [ebff9138]", "module": "inpaint_only", "pixel_perfect": False, "processor_res": -1, "resize_mode": "Crop and Resize", "save_detected_map": True, "threshold_a": -1, "threshold_b": -1, "weight": 1, } ] } }, "batch_size": 1, "cfg_scale": 7, "comments": {}, "disable_extra_networks": False, "do_not_save_grid": False, "do_not_save_samples": False, "enable_hr": False, "height": 768, "hr_negative_prompt": "", "hr_prompt": "", "hr_resize_x": 0, "hr_resize_y": 0, "hr_scale": 2, "hr_second_pass_steps": 0, "hr_upscaler": "Latent", "n_iter": 1, "negative_prompt": "(worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, backlight,(ugly:1.331), (duplicate:1.331), (morbid:1.21), (mutilated:1.21), (tranny:1.331), mutated hands, (poorly drawn hands:1.331), blurry, (bad anatomy:1.21), (bad proportions:1.331), extra limbs, (more than 2 nipples:1.331), (missing arms:1.331), (extra legs:1.331), (fused fingers:1.61051), (too many fingers:1.61051), (unclear eyes:1.331), bad hands, missing fingers, extra digit, (futa:1.1), bad body, pubic hair, glans, easynegative,more than 2 tits, ng_deepnegative_v1_75t,(big fee:1),more than 2 feet,incorrect feet", "override_settings": {}, "override_settings_restore_afterwards": True, "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality,", "restore_faces": False, "s_churn": 0.0, "s_min_uncond": 0, "s_noise": 1.0, "s_tmax": None, "s_tmin": 0.0, "sampler_name": "DPM++ 2M Karras", "script_args": [], "script_name": None, "seed": 42, "seed_enable_extras": True, "seed_resize_from_h": -1, "seed_resize_from_w": -1, "steps": 30, "styles": [], "subseed": -1, "subseed_strength": 0, "tiling": False, "width": 512, } if __name__ == "__main__": url = "http://localhost:7860/sdapi/v1/" generate(url + "img2img", img2img_payload) generate(url + "txt2img", txt2img_payload) ================================================ FILE: example/txt2img_example/api_txt2img.py ================================================ import io import cv2 import base64 import requests from PIL import Image """ To use this example make sure you've done the following steps before executing: 1. Ensure automatic1111 is running in api mode with the controlnet extension. Use the following command in your terminal to activate: ./webui.sh --no-half --api 2. Validate python environment meet package dependencies. If running in a local repo you'll likely need to pip install cv2, requests and PIL """ class ControlnetRequest: def __init__(self, prompt, path): self.url = "http://localhost:7860/sdapi/v1/txt2img" self.prompt = prompt self.img_path = path self.body = None def build_body(self): self.body = { "prompt": self.prompt, "negative_prompt": "", "batch_size": 1, "steps": 20, "cfg_scale": 7, "alwayson_scripts": { "controlnet": { "args": [ { "enabled": True, "module": "none", "model": "canny", "weight": 1.0, "image": self.read_image(), "resize_mode": "Crop and Resize", "lowvram": False, "processor_res": 64, "threshold_a": 64, "threshold_b": 64, "guidance_start": 0.0, "guidance_end": 1.0, "control_mode": "Balanced", "pixel_perfect": False } ] } } } def send_request(self): response = requests.post(url=self.url, json=self.body) return response.json() def read_image(self): img = cv2.imread(self.img_path) retval, bytes = cv2.imencode('.png', img) encoded_image = base64.b64encode(bytes).decode('utf-8') return encoded_image if __name__ == '__main__': path = 'stock_mountain.png' prompt = 'a large avalanche' control_net = ControlnetRequest(prompt, path) control_net.build_body() output = control_net.send_request() result = output['images'][0] image = Image.open(io.BytesIO(base64.b64decode(result))) image.show() ================================================ FILE: example/visual_chatgpt.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Run WebUI in API mode\n", "nohup python launch.py --api --xformers &\n", "\n", "# Wait until webui fully startup\n", "tail -f nohup.out" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install/Upgrade transformers\n", "pip install -U transformers\n", "\n", "# Install deps\n", "pip install langchain==0.0.101 openai \n", "\n", "# Run exmaple\n", "python example/chatgpt.py" ] } ], "metadata": { "kernelspec": { "display_name": "pynb", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206" } } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: extract_controlnet.py ================================================ import argparse import torch from safetensors.torch import load_file, save_file if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--src", default=None, type=str, required=True, help="Path to the model to convert.") parser.add_argument("--dst", default=None, type=str, required=True, help="Path to the output model.") parser.add_argument("--half", action="store_true", help="Cast to FP16.") args = parser.parse_args() assert args.src is not None, "Must provide a model path!" assert args.dst is not None, "Must provide a checkpoint path!" if args.src.endswith(".safetensors"): state_dict = load_file(args.src) else: state_dict = torch.load(args.src) if any([k.startswith("control_model.") for k, v in state_dict.items()]): dtype = torch.float16 if args.half else torch.float32 state_dict = {k.replace("control_model.", ""): v.to(dtype) for k, v in state_dict.items() if k.startswith("control_model.")} if args.dst.endswith(".safetensors"): save_file(state_dict, args.dst) else: torch.save({"state_dict": state_dict}, args.dst) ================================================ FILE: extract_controlnet_diff.py ================================================ import argparse import torch from safetensors.torch import load_file, save_file if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--sd15", default=None, type=str, required=True, help="Path to the original sd15.") parser.add_argument("--control", default=None, type=str, required=True, help="Path to the sd15 with control.") parser.add_argument("--dst", default=None, type=str, required=True, help="Path to the output difference model.") parser.add_argument("--fp16", action="store_true", help="Save as fp16.") parser.add_argument("--bf16", action="store_true", help="Save as bf16.") args = parser.parse_args() assert args.sd15 is not None, "Must provide a original sd15 model path!" assert args.control is not None, "Must provide a sd15 with control model path!" assert args.dst is not None, "Must provide a output path!" # make differences: copy from https://github.com/lllyasviel/ControlNet/blob/main/tool_transfer_control.py def get_node_name(name, parent_name): if len(name) <= len(parent_name): return False, '' p = name[:len(parent_name)] if p != parent_name: return False, '' return True, name[len(parent_name):] # remove first/cond stage from sd to reduce memory usage def remove_first_and_cond(sd): keys = list(sd.keys()) for key in keys: is_first_stage, _ = get_node_name(key, 'first_stage_model') is_cond_stage, _ = get_node_name(key, 'cond_stage_model') if is_first_stage or is_cond_stage: sd.pop(key, None) return sd print(f"loading: {args.sd15}") if args.sd15.endswith(".safetensors"): sd15_state_dict = load_file(args.sd15) else: sd15_state_dict = torch.load(args.sd15) sd15_state_dict = sd15_state_dict.pop("state_dict", sd15_state_dict) sd15_state_dict = remove_first_and_cond(sd15_state_dict) print(f"loading: {args.control}") if args.control.endswith(".safetensors"): control_state_dict = load_file(args.control) else: control_state_dict = torch.load(args.control) control_state_dict = remove_first_and_cond(control_state_dict) # make diff of original and control print(f"create difference") keys = list(control_state_dict.keys()) final_state_dict = {"difference": torch.tensor(1.0)} # indicates difference for key in keys: p = control_state_dict.pop(key) is_control, node_name = get_node_name(key, 'control_') if not is_control: continue sd15_key_name = 'model.diffusion_' + node_name if sd15_key_name in sd15_state_dict: # part of U-Net # print("in sd15", key, sd15_key_name) p_new = p - sd15_state_dict.pop(sd15_key_name) if torch.max(torch.abs(p_new)) < 1e-6: # no difference? print("no diff", key, sd15_key_name) continue else: # print("not in sd15", key, sd15_key_name) p_new = p # hint or zero_conv final_state_dict[key] = p_new save_dtype = None if args.fp16: save_dtype = torch.float16 elif args.bf16: save_dtype = torch.bfloat16 if save_dtype is not None: for key in final_state_dict.keys(): final_state_dict[key] = final_state_dict[key].to(save_dtype) print("saving difference.") if args.dst.endswith(".safetensors"): save_file(final_state_dict, args.dst) else: torch.save({"state_dict": final_state_dict}, args.dst) print("done!") ================================================ FILE: install.py ================================================ import launch from importlib import metadata import sys import os import shutil import platform from pathlib import Path from typing import Optional from packaging.version import parse repo_root = Path(__file__).parent main_req_file = repo_root / "requirements.txt" def get_installed_version(package: str) -> Optional[str]: try: return metadata.version(package) except Exception: return None def extract_base_package(package_string: str) -> str: base_package = package_string.split("@git")[0] return base_package def install_requirements(req_file): with open(req_file) as file: for package in file: try: package = package.strip() if "==" in package: package_name, package_version = package.split("==") installed_version = get_installed_version(package_name) if installed_version != package_version: launch.run_pip( f'install -U "{package}"', f"sd-webui-controlnet requirement: changing {package_name} version from {installed_version} to {package_version}", ) elif ">=" in package: package_name, package_version = package.split(">=") installed_version = get_installed_version(package_name) if not installed_version or parse( installed_version ) < parse(package_version): launch.run_pip( f'install -U "{package}"', f"sd-webui-controlnet requirement: changing {package_name} version from {installed_version} to {package_version}", ) elif "<=" in package: package_name, package_version = package.split("<=") installed_version = get_installed_version(package_name) if not installed_version or parse( installed_version ) > parse(package_version): launch.run_pip( f'install "{package_name}=={package_version}"', f"sd-webui-controlnet requirement: changing {package_name} version from {installed_version} to {package_version}", ) elif not launch.is_installed(extract_base_package(package)): launch.run_pip( f'install "{package}"', f"sd-webui-controlnet requirement: {package}", ) except Exception as e: print(e) print( f"Warning: Failed to install {package}, some preprocessors may not work." ) def install_onnxruntime(): """ Install onnxruntime or onnxruntime-gpu based on the availability of CUDA. onnxruntime and onnxruntime-gpu can not be installed together. """ if not launch.is_installed("onnxruntime") and not launch.is_installed("onnxruntime-gpu"): import torch.cuda as cuda # torch import head to improve loading time onnxruntime = 'onnxruntime-gpu' if cuda.is_available() else 'onnxruntime' launch.run_pip( f'install {onnxruntime}', f"sd-webui-controlnet requirement: {onnxruntime}", ) def try_install_from_wheel(pkg_name: str, wheel_url: str, version: Optional[str] = None): current_version = get_installed_version(pkg_name) if current_version is not None: # No version requirement. if version is None: return # Version requirement already satisfied. if parse(current_version) >= parse(version): return try: launch.run_pip( f"install -U {wheel_url}", f"sd-webui-controlnet requirement: {pkg_name}", ) except Exception as e: print(e) print(f"Warning: Failed to install {pkg_name}. Some processors will not work.") def try_install_insight_face(): """Attempt to install insightface library. The library is necessary to use ip-adapter faceid. Note: Building insightface library from source requires compiling C++ code, which should be avoided in principle. Here the solution is to download a precompiled wheel.""" if get_installed_version("insightface") is not None: return default_win_wheel = "https://github.com/Gourieff/Assets/raw/main/Insightface/insightface-0.7.3-cp310-cp310-win_amd64.whl" wheel_url = os.environ.get("INSIGHTFACE_WHEEL", default_win_wheel) system = platform.system().lower() architecture = platform.machine().lower() python_version = sys.version_info if wheel_url != default_win_wheel or ( system == "windows" and "amd64" in architecture and python_version.major == 3 and python_version.minor == 10 ): try: launch.run_pip( f"install {wheel_url}", "sd-webui-controlnet requirement: insightface", ) except Exception as e: print(e) print( "ControlNet init warning: Unable to install insightface automatically. " ) else: print( "ControlNet init warning: Unable to install insightface automatically. " "Please try run `pip install insightface` manually." ) def try_remove_legacy_submodule(): """Try remove annotators/hand_refiner_portable submodule dir.""" submodule = repo_root / "annotator" / "hand_refiner_portable" if os.path.exists(submodule): try: shutil.rmtree(submodule) except Exception as e: print(e) print( f"Failed to remove submodule {submodule} automatically. You can manually delete the directory." ) install_requirements(main_req_file) install_onnxruntime() try_install_insight_face() try_install_from_wheel( "handrefinerportable", wheel_url=os.environ.get( "HANDREFINER_WHEEL", "https://github.com/huchenlei/HandRefinerPortable/releases/download/v1.0.1/handrefinerportable-2024.2.12.0-py2.py3-none-any.whl", ), version="2024.2.12.0", ) try_install_from_wheel( "depth_anything", wheel_url=os.environ.get( "DEPTH_ANYTHING_WHEEL", "https://github.com/huchenlei/Depth-Anything/releases/download/v1.0.0/depth_anything-2024.1.22.0-py2.py3-none-any.whl", ), ) try_install_from_wheel( "depth_anything_v2", wheel_url=os.environ.get( "DEPTH_ANYTHING_V2_WHEEL", "https://github.com/MackinationsAi/UDAV2-ControlNet/releases/download/v1.0.0/depth_anything_v2-2024.7.1.0-py2.py3-none-any.whl", ), ) try_install_from_wheel( "dsine", wheel_url=os.environ.get( "DSINE_WHEEL", "https://github.com/sdbds/DSINE/releases/download/1.0.2/dsine-2024.3.23-py3-none-any.whl", ), ) try_remove_legacy_submodule() ================================================ FILE: internal_controlnet/__init__.py ================================================ ================================================ FILE: internal_controlnet/args.py ================================================ from __future__ import annotations import os import torch import numpy as np from typing import Optional, List, Annotated, ClassVar, Callable, Any, Tuple, Union from pydantic import BaseModel, validator, root_validator, Field from PIL import Image from logging import Logger from copy import copy from enum import Enum from scripts.enums import ( InputMode, ResizeMode, ControlMode, HiResFixOption, PuLIDMode, ControlNetUnionControlType, ) from annotator.util import HWC3 def _unimplemented_func(*args, **kwargs): raise NotImplementedError("Not implemented.") def field_to_displaytext(fieldname: str) -> str: return " ".join([word.capitalize() for word in fieldname.split("_")]) def displaytext_to_field(text: str) -> str: return "_".join([word.lower() for word in text.split(" ")]) def serialize_value(value) -> str: if isinstance(value, Enum): return value.value return str(value) def parse_value(value: str) -> Union[str, float, int, bool]: if value in ("True", "False"): return value == "True" try: return int(value) except ValueError: try: return float(value) except ValueError: return value # Plain string. class ControlNetUnit(BaseModel): """ Represents an entire ControlNet processing unit. """ class Config: arbitrary_types_allowed = True extra = "ignore" cls_match_module: ClassVar[Callable[[str], bool]] = _unimplemented_func cls_match_model: ClassVar[Callable[[str], bool]] = _unimplemented_func cls_decode_base64: ClassVar[Callable[[str], np.ndarray]] = _unimplemented_func cls_torch_load_base64: ClassVar[Callable[[Any], torch.Tensor]] = _unimplemented_func cls_get_preprocessor: ClassVar[Callable[[str], Any]] = _unimplemented_func cls_logger: ClassVar[Logger] = Logger("ControlNetUnit") # UI only fields. is_ui: bool = False input_mode: InputMode = InputMode.SIMPLE batch_images: Optional[Any] = None output_dir: str = "" loopback: bool = False # General fields. enabled: bool = False module: str = "none" @validator("module", always=True, pre=True) def check_module(cls, value: str) -> str: if not ControlNetUnit.cls_match_module(value): raise ValueError(f"module({value}) not found in supported modules.") return value model: str = "None" @validator("model", always=True, pre=True) def check_model(cls, value: str) -> str: if not ControlNetUnit.cls_match_model(value): raise ValueError(f"model({value}) not found in supported models.") return value weight: Annotated[float, Field(ge=0.0, le=2.0)] = 1.0 # The image to be used for this ControlNetUnit. image: Optional[Any] = None resize_mode: ResizeMode = ResizeMode.INNER_FIT @validator("resize_mode", always=True, pre=True) def check_resize_mode(cls, value) -> ResizeMode: resize_mode_aliases = { "Inner Fit (Scale to Fit)": "Crop and Resize", "Outer Fit (Shrink to Fit)": "Resize and Fill", "Scale to Fit (Inner Fit)": "Crop and Resize", "Envelope (Outer Fit)": "Resize and Fill", } if isinstance(value, str): return ResizeMode(resize_mode_aliases.get(value, value)) assert isinstance(value, ResizeMode) return value low_vram: bool = False processor_res: int = -1 threshold_a: float = -1 threshold_b: float = -1 @root_validator def bound_check_params(cls, values: dict) -> dict: """ Checks and corrects negative parameters in ControlNetUnit 'unit' in place. Parameters 'processor_res', 'threshold_a', 'threshold_b' are reset to their default values if negative. """ enabled = values.get("enabled") if not enabled: return values module = values.get("module") if not module: return values preprocessor = cls.cls_get_preprocessor(module) assert preprocessor is not None for unit_param, param in zip( ("processor_res", "threshold_a", "threshold_b"), ("slider_resolution", "slider_1", "slider_2"), ): value = values.get(unit_param) cfg = getattr(preprocessor, param) if value < cfg.minimum or value > cfg.maximum: values[unit_param] = cfg.value # Only report warning when non-default value is used. if value != -1: cls.cls_logger.info( f"[{module}.{unit_param}] Invalid value({value}), using default value {cfg.value}." ) return values guidance_start: Annotated[float, Field(ge=0.0, le=1.0)] = 0.0 guidance_end: Annotated[float, Field(ge=0.0, le=1.0)] = 1.0 @root_validator def guidance_check(cls, values: dict) -> dict: start = values.get("guidance_start") end = values.get("guidance_end") if start > end: raise ValueError(f"guidance_start({start}) > guidance_end({end})") return values pixel_perfect: bool = False control_mode: ControlMode = ControlMode.BALANCED # Whether to crop input image based on A1111 img2img mask. This flag is only used when `inpaint area` # in A1111 is set to `Only masked`. In API, this correspond to `inpaint_full_res = True`. inpaint_crop_input_image: bool = True # If hires fix is enabled in A1111, how should this ControlNet unit be applied. # The value is ignored if the generation is not using hires fix. hr_option: HiResFixOption = HiResFixOption.BOTH # Whether save the detected map of this unit. Setting this option to False prevents saving the # detected map or sending detected map along with generated images via API. # Currently the option is only accessible in API calls. save_detected_map: bool = True # Weight for each layer of ControlNet params. # For ControlNet: # - SD1.5: 13 weights (4 encoder block * 3 + 1 middle block) # - SDXL: 10 weights (3 encoder block * 3 + 1 middle block) # For T2IAdapter # - SD1.5: 5 weights (4 encoder block + 1 middle block) # - SDXL: 4 weights (3 encoder block + 1 middle block) # For IPAdapter # - SD15: 16 (6 input blocks + 9 output blocks + 1 middle block) # - SDXL: 11 weights (4 input blocks + 6 output blocks + 1 middle block) # Note1: Setting advanced weighting will disable `soft_injection`, i.e. # It is recommended to set ControlMode = BALANCED when using `advanced_weighting`. # Note2: The field `weight` is still used in some places, e.g. reference_only, # even advanced_weighting is set. advanced_weighting: Optional[List[float]] = None # The effective region mask that unit's effect should be restricted to. effective_region_mask: Optional[np.ndarray] = None @validator("effective_region_mask", pre=True) def parse_effective_region_mask(cls, value) -> np.ndarray: if isinstance(value, str): return cls.cls_decode_base64(value) assert isinstance(value, np.ndarray) or value is None return value # The weight mode for PuLID. # https://github.com/ToTheBeginning/PuLID pulid_mode: PuLIDMode = PuLIDMode.FIDELITY # ControlNet control type for ControlNet union model. # https://github.com/xinsir6/ControlNetPlus/tree/main # The value of this field is only used when the model is ControlNetUnion. union_control_type: ControlNetUnionControlType = ControlNetUnionControlType.UNKNOWN # ------- API only fields ------- # The tensor input for ipadapter. When this field is set in the API, # the base64string will be interpret by torch.load to reconstruct ipadapter # preprocessor output. # Currently the option is only accessible in API calls. ipadapter_input: Optional[List[Any]] = None @validator("ipadapter_input", pre=True) def parse_ipadapter_input(cls, value) -> Optional[List[Any]]: if value is None: return None if isinstance(value, str): value = [value] result = [cls.cls_torch_load_base64(b) for b in value] assert result, "input cannot be empty" return result # The mask to be used on top of the image. mask: Optional[Any] = None # AnimateDiff compatibility fields. # TODO: Find a better way in AnimateDiff to deal with these extra fields. batch_mask_dir: Optional[str] = None animatediff_batch: bool = False batch_modifiers: list = Field(default_factory=list) batch_image_files: list = Field(default_factory=list) batch_keyframe_idx: Optional[str | list] = None @property def accepts_multiple_inputs(self) -> bool: """This unit can accept multiple input images.""" return self.is_ipadapter @property def is_animate_diff_batch(self) -> bool: return getattr(self, "animatediff_batch", False) @property def uses_clip(self) -> bool: """Whether this unit uses clip preprocessor.""" return any( ( ("ip-adapter" in self.module and "face_id" not in self.module), self.module in ("clip_vision", "revision_clipvision", "revision_ignore_prompt"), ) ) @property def is_inpaint(self) -> bool: return "inpaint" in self.module @property def is_ipadapter(self) -> bool: p = ControlNetUnit.cls_get_preprocessor(self.module) if p is None: return False return "IP-Adapter" in p.tags def get_actual_preprocessors(self) -> List[Any]: p = ControlNetUnit.cls_get_preprocessor(self.module) # Map "ip-adapter-auto" to actual preprocessor. if self.module == "ip-adapter-auto": p = p.get_preprocessor_by_model(self.model) # Add all dependencies. return [p] + [ ControlNetUnit.cls_get_preprocessor(dep) for dep in p.preprocessor_deps ] @classmethod def parse_image(cls, image) -> np.ndarray: if isinstance(image, np.ndarray): np_image = image elif isinstance(image, str): # Necessary for batch. if os.path.exists(image): np_image = np.array(Image.open(image)).astype("uint8") else: np_image = cls.cls_decode_base64(image) else: raise ValueError(f"Unrecognized image format {image}.") # Convert following image shapes to shape [H, W, C=3]. # - [H, W] # - [H, W, 1] # - [H, W, 4] np_image = HWC3(np_image) assert np_image.ndim == 3 assert np_image.shape[2] == 3 return np_image @classmethod def combine_image_and_mask( cls, np_image: np.ndarray, np_mask: Optional[np.ndarray] = None ) -> np.ndarray: """RGB + Alpha(Optional) => RGBA""" # TODO: Change protocol to use 255 as A channel value. # Note: mask is by default zeros, as both inpaint and # clip mask does extra work on masked area. np_mask = (np.zeros_like(np_image) if np_mask is None else np_mask)[:, :, 0:1] if np_image.shape[:2] != np_mask.shape[:2]: raise ValueError( f"image shape ({np_image.shape[:2]}) not aligned with mask shape ({np_mask.shape[:2]})" ) return np.concatenate([np_image, np_mask], axis=2) # [H, W, 4] @classmethod def legacy_field_alias(cls, values: dict) -> dict: ext_compat_keys = { "guidance": "guidance_end", "lowvram": "low_vram", "input_image": "image", } for alias, key in ext_compat_keys.items(): if alias in values: assert key not in values, f"Conflict of field '{alias}' and '{key}'" values[key] = values[alias] cls.cls_logger.warn( f"Deprecated alias '{alias}' detected. This field will be removed on 2024-06-01" f"Please use '{key}' instead." ) return values @classmethod def mask_alias(cls, values: dict) -> dict: """ Field "mask_image" is the alias of field "mask". This is for compatibility with SD Forge API. """ mask_image = values.get("mask_image") mask = values.get("mask") if mask_image is not None: if mask is not None: raise ValueError("Cannot specify both 'mask' and 'mask_image'!") values["mask"] = mask_image return values def get_input_images_rgba(self) -> Optional[List[np.ndarray]]: """ RGBA images with potentially different size. Why we cannot have [B, H, W, C=4] here is that calculation of final resolution requires generation target's dimensions. Parse image with following formats. API - image = {"image": base64image, "mask": base64image,} - image = [image, mask] - image = (image, mask) - image = [{"image": ..., "mask": ...}, {"image": ..., "mask": ...}, ...] - image = base64image, mask = base64image UI: - image = {"image": np_image, "mask": np_image,} - image = np_image, mask = np_image """ init_image = self.image init_mask = self.mask if init_image is None: assert init_mask is None return None if isinstance(init_image, (list, tuple)): if not init_image: raise ValueError(f"{init_image} is not a valid 'image' field value") if isinstance(init_image[0], dict): # [{"image": ..., "mask": ...}, {"image": ..., "mask": ...}, ...] images = init_image else: assert len(init_image) == 2 # [image, mask] # (image, mask) images = [ { "image": init_image[0], "mask": init_image[1], } ] elif isinstance(init_image, dict): # {"image": ..., "mask": ...} images = [init_image] elif isinstance(init_image, (str, np.ndarray)): # image = base64image, mask = base64image images = [ { "image": init_image, "mask": init_mask, } ] else: raise ValueError(f"Unrecognized image field {init_image}") np_images = [] for image_dict in images: assert isinstance(image_dict, dict) image = image_dict.get("image") mask = image_dict.get("mask") assert image is not None np_image = self.parse_image(image) np_mask = self.parse_image(mask) if mask is not None else None np_images.append( self.combine_image_and_mask(np_image, np_mask) ) # [H, W, 4] return np_images @classmethod def from_dict(cls, values: dict) -> ControlNetUnit: values = copy(values) values = cls.legacy_field_alias(values) values = cls.mask_alias(values) return ControlNetUnit(**values) @classmethod def from_infotext_args(cls, *args) -> ControlNetUnit: assert len(args) == len(ControlNetUnit.infotext_fields()) return cls.from_dict( {k: v for k, v in zip(ControlNetUnit.infotext_fields(), args)} ) @staticmethod def infotext_fields() -> Tuple[str]: """Fields that should be included in infotext. You should define a Gradio element with exact same name in ControlNetUiGroup as well, so that infotext can wire the value to correct field when pasting infotext. """ return ( "module", "model", "weight", "resize_mode", "processor_res", "threshold_a", "threshold_b", "guidance_start", "guidance_end", "pixel_perfect", "control_mode", ) def serialize(self) -> str: """Serialize the unit for infotext.""" infotext_dict = { field_to_displaytext(field): serialize_value(getattr(self, field)) for field in ControlNetUnit.infotext_fields() } if not all( "," not in str(v) and ":" not in str(v) for v in infotext_dict.values() ): self.cls_logger.error(f"Unexpected tokens encountered:\n{infotext_dict}") return "" return ", ".join(f"{field}: {value}" for field, value in infotext_dict.items()) @classmethod def parse(cls, text: str) -> ControlNetUnit: return ControlNetUnit( enabled=True, **{ displaytext_to_field(key): parse_value(value) for item in text.split(",") for (key, value) in (item.strip().split(": "),) }, ) def __copy__(self) -> ControlNetUnit: """Override the behavior on `copy.copy` calls.""" return self.copy() ================================================ FILE: internal_controlnet/external_code.py ================================================ from copy import copy from typing import List, Any, Optional, Union, Tuple, Dict import numpy as np from modules import scripts, processing, shared from modules.api import api from .args import ControlNetUnit from scripts import global_state from scripts.logging import logger from scripts.enums import ( ResizeMode, BatchOption, # noqa: F401 ControlMode, # noqa: F401 ) from scripts.supported_preprocessor import ( Preprocessor, PreprocessorParameter, # noqa: F401 ) import torch import base64 import io from modules.safe import unsafe_torch_load def get_api_version() -> int: return 3 resize_mode_aliases = { "Inner Fit (Scale to Fit)": "Crop and Resize", "Outer Fit (Shrink to Fit)": "Resize and Fill", "Scale to Fit (Inner Fit)": "Crop and Resize", "Envelope (Outer Fit)": "Resize and Fill", } def resize_mode_from_value(value: Union[str, int, ResizeMode]) -> ResizeMode: if isinstance(value, str): return ResizeMode(resize_mode_aliases.get(value, value)) elif isinstance(value, int): assert value >= 0 if value == 3: # 'Just Resize (Latent upscale)' return ResizeMode.RESIZE if value >= len(ResizeMode): logger.warning( f"Unrecognized ResizeMode int value {value}. Fall back to RESIZE." ) return ResizeMode.RESIZE return [e for e in ResizeMode][value] else: return value def visualize_inpaint_mask(img): if img.ndim == 3 and img.shape[2] == 4: result = img.copy() mask = result[:, :, 3] mask = 255 - mask // 2 result[:, :, 3] = mask return np.ascontiguousarray(result.copy()) return img def pixel_perfect_resolution( image: np.ndarray, target_H: int, target_W: int, resize_mode: ResizeMode, ) -> int: """ Calculate the estimated resolution for resizing an image while preserving aspect ratio. The function first calculates scaling factors for height and width of the image based on the target height and width. Then, based on the chosen resize mode, it either takes the smaller or the larger scaling factor to estimate the new resolution. If the resize mode is OUTER_FIT, the function uses the smaller scaling factor, ensuring the whole image fits within the target dimensions, potentially leaving some empty space. If the resize mode is not OUTER_FIT, the function uses the larger scaling factor, ensuring the target dimensions are fully filled, potentially cropping the image. After calculating the estimated resolution, the function prints some debugging information. Args: image (np.ndarray): A 3D numpy array representing an image. The dimensions represent [height, width, channels]. target_H (int): The target height for the image. target_W (int): The target width for the image. resize_mode (ResizeMode): The mode for resizing. Returns: int: The estimated resolution after resizing. """ raw_H, raw_W, _ = image.shape k0 = float(target_H) / float(raw_H) k1 = float(target_W) / float(raw_W) if resize_mode == ResizeMode.OUTER_FIT: estimation = min(k0, k1) * float(min(raw_H, raw_W)) else: estimation = max(k0, k1) * float(min(raw_H, raw_W)) logger.debug("Pixel Perfect Computation:") logger.debug(f"resize_mode = {resize_mode}") logger.debug(f"raw_H = {raw_H}") logger.debug(f"raw_W = {raw_W}") logger.debug(f"target_H = {target_H}") logger.debug(f"target_W = {target_W}") logger.debug(f"estimation = {estimation}") return int(np.round(estimation)) def to_base64_nparray(encoding: str) -> np.ndarray: """ Convert a base64 image into the image type the extension uses """ return np.array(api.decode_base64_to_image(encoding)).astype("uint8") def get_all_units_in_processing( p: processing.StableDiffusionProcessing, ) -> List[ControlNetUnit]: """ Fetch ControlNet processing units from a StableDiffusionProcessing. """ return get_all_units(p.scripts, p.script_args) def get_all_units( script_runner: scripts.ScriptRunner, script_args: List[Any] ) -> List[ControlNetUnit]: """ Fetch ControlNet processing units from an existing script runner. Use this function to fetch units from the list of all scripts arguments. """ cn_script = find_cn_script(script_runner) if cn_script: return get_all_units_from(script_args[cn_script.args_from : cn_script.args_to]) return [] def get_all_units_from(script_args: List[Any]) -> List[ControlNetUnit]: """ Fetch ControlNet processing units from ControlNet script arguments. Use `external_code.get_all_units` to fetch units from the list of all scripts arguments. """ def is_stale_unit(script_arg: Any) -> bool: """Returns whether the script_arg is potentially an stale version of ControlNetUnit created before module reload.""" return "ControlNetUnit" in type(script_arg).__name__ and not isinstance( script_arg, ControlNetUnit ) def is_controlnet_unit(script_arg: Any) -> bool: """Returns whether the script_arg is ControlNetUnit or anything that can be treated like ControlNetUnit.""" return isinstance(script_arg, (ControlNetUnit, dict)) or ( hasattr(script_arg, "__dict__") and set(vars(ControlNetUnit()).keys()).issubset( set(vars(script_arg).keys()) ) ) all_units = [ to_processing_unit(script_arg) for script_arg in script_args if is_controlnet_unit(script_arg) ] if not all_units: logger.warning( "No ControlNetUnit detected in args. It is very likely that you are having an extension conflict." f"Here are args received by ControlNet: {script_args}." ) if any(is_stale_unit(script_arg) for script_arg in script_args): logger.debug( "Stale version of ControlNetUnit detected. The ControlNetUnit received" "by ControlNet is created before the newest load of ControlNet extension." "They will still be used by ControlNet as long as they provide same fields" "defined in the newest version of ControlNetUnit." ) return all_units def get_single_unit_from( script_args: List[Any], index: int = 0 ) -> Optional[ControlNetUnit]: """ Fetch a single ControlNet processing unit from ControlNet script arguments. The list must not contain script positional arguments. It must only contain processing units. """ i = 0 while i < len(script_args) and index >= 0: if index == 0 and script_args[i] is not None: return to_processing_unit(script_args[i]) i += 1 index -= 1 return None def get_max_models_num(): """ Fetch the maximum number of allowed ControlNet models. """ max_models_num = shared.opts.data.get("control_net_unit_count", 3) return max_models_num def to_processing_unit(unit: Union[Dict, ControlNetUnit]) -> ControlNetUnit: """ Convert different types to processing unit. """ if isinstance(unit, dict): return ControlNetUnit.from_dict(unit) assert isinstance(unit, ControlNetUnit) return unit def update_cn_script_in_processing( p: processing.StableDiffusionProcessing, cn_units: List[ControlNetUnit], **_kwargs, # for backwards compatibility ): """ Update the arguments of the ControlNet script in `p.script_args` in place, reading from `cn_units`. `cn_units` and its elements are not modified. You can call this function repeatedly, as many times as you want. Does not update `p.script_args` if any of the folling is true: - ControlNet is not present in `p.scripts` - `p.script_args` is not filled with script arguments for scripts that are processed before ControlNet """ p.script_args = update_cn_script(p.scripts, p.script_args_value, cn_units) def update_cn_script( script_runner: scripts.ScriptRunner, script_args: Union[Tuple[Any], List[Any]], cn_units: List[ControlNetUnit], ) -> Union[Tuple[Any], List[Any]]: """ Returns: The updated `script_args` with given `cn_units` used as ControlNet script args. Does not update `script_args` if any of the folling is true: - ControlNet is not present in `script_runner` - `script_args` is not filled with script arguments for scripts that are processed before ControlNet """ script_args_type = type(script_args) assert script_args_type in (tuple, list), script_args_type updated_script_args = list(copy(script_args)) cn_script = find_cn_script(script_runner) if cn_script is None or len(script_args) < cn_script.args_from: return script_args # fill in remaining parameters to satisfy max models, just in case script needs it. max_models = shared.opts.data.get("control_net_unit_count", 3) cn_units = cn_units + [ControlNetUnit(enabled=False)] * max( max_models - len(cn_units), 0 ) cn_script_args_diff = 0 for script in script_runner.alwayson_scripts: if script is cn_script: cn_script_args_diff = len(cn_units) - ( cn_script.args_to - cn_script.args_from ) updated_script_args[script.args_from : script.args_to] = cn_units script.args_to = script.args_from + len(cn_units) else: script.args_from += cn_script_args_diff script.args_to += cn_script_args_diff return script_args_type(updated_script_args) def update_cn_script_in_place( script_runner: scripts.ScriptRunner, script_args: List[Any], cn_units: List[ControlNetUnit], **_kwargs, # for backwards compatibility ): """ @Deprecated(Raises assertion error if script_args passed in is Tuple) Update the arguments of the ControlNet script in `script_args` in place, reading from `cn_units`. `cn_units` and its elements are not modified. You can call this function repeatedly, as many times as you want. Does not update `script_args` if any of the folling is true: - ControlNet is not present in `script_runner` - `script_args` is not filled with script arguments for scripts that are processed before ControlNet """ assert isinstance(script_args, list), type(script_args) cn_script = find_cn_script(script_runner) if cn_script is None or len(script_args) < cn_script.args_from: return # fill in remaining parameters to satisfy max models, just in case script needs it. max_models = shared.opts.data.get("control_net_unit_count", 3) cn_units = cn_units + [ControlNetUnit(enabled=False)] * max( max_models - len(cn_units), 0 ) cn_script_args_diff = 0 for script in script_runner.alwayson_scripts: if script is cn_script: cn_script_args_diff = len(cn_units) - ( cn_script.args_to - cn_script.args_from ) script_args[script.args_from : script.args_to] = cn_units script.args_to = script.args_from + len(cn_units) else: script.args_from += cn_script_args_diff script.args_to += cn_script_args_diff def get_models(update: bool = False) -> List[str]: """ Fetch the list of available models. Each value is a valid candidate of `ControlNetUnit.model`. Keyword arguments: update -- Whether to refresh the list from disk. (default False) """ if update: global_state.update_cn_models() return list(global_state.cn_models_names.values()) def get_modules(alias_names: bool = False) -> List[str]: """ Fetch the list of available preprocessors. Each value is a valid candidate of `ControlNetUnit.module`. Keyword arguments: alias_names -- Whether to get the ui alias names instead of internal keys """ return [ (p.label if alias_names else p.name) for p in Preprocessor.get_sorted_preprocessors() ] def get_modules_detail(alias_names: bool = False) -> Dict[str, Any]: """ get the detail of all preprocessors including sliders: the slider config in Auto1111 webUI Keyword arguments: alias_names -- Whether to get the module detail with alias names instead of internal keys """ _module_detail = {} _module_list = get_modules(False) _module_list_alias = get_modules(True) _output_list = _module_list if not alias_names else _module_list_alias for module_name in _output_list: preprocessor = Preprocessor.get_preprocessor(module_name) assert preprocessor is not None _module_detail[module_name] = dict( model_free=preprocessor.do_not_need_model, sliders=[ s.api_json for s in ( preprocessor.slider_resolution, preprocessor.slider_1, preprocessor.slider_2, preprocessor.slider_3, ) if s.visible ], ) return _module_detail def find_cn_script(script_runner: scripts.ScriptRunner) -> Optional[scripts.Script]: """ Find the ControlNet script in `script_runner`. Returns `None` if `script_runner` does not contain a ControlNet script. """ if script_runner is None: return None for script in script_runner.alwayson_scripts: if is_cn_script(script): return script def is_cn_script(script: scripts.Script) -> bool: """ Determine whether `script` is a ControlNet script. """ return script.title().lower() == "controlnet" # TODO: Add model constraint ControlNetUnit.cls_match_model = lambda model: True ControlNetUnit.cls_match_module = ( lambda module: Preprocessor.get_preprocessor(module) is not None ) ControlNetUnit.cls_get_preprocessor = Preprocessor.get_preprocessor ControlNetUnit.cls_decode_base64 = to_base64_nparray def decode_base64(b: str) -> torch.Tensor: decoded_bytes = base64.b64decode(b) return unsafe_torch_load(io.BytesIO(decoded_bytes)) ControlNetUnit.cls_torch_load_base64 = decode_base64 ControlNetUnit.cls_logger = logger logger.debug("ControlNetUnit initialized") ================================================ FILE: javascript/canvas.js ================================================ (function () { var hasApplied = false; onUiUpdate(function () { if (!hasApplied) { if (typeof window.applyZoomAndPanIntegration === "function") { hasApplied = true; window.applyZoomAndPanIntegration("#txt2img_controlnet", Array.from({ length: 20 }, (_, i) => `#txt2img_controlnet_ControlNet-${i}_input_image`)); window.applyZoomAndPanIntegration("#img2img_controlnet", Array.from({ length: 20 }, (_, i) => `#img2img_controlnet_ControlNet-${i}_input_image`)); window.applyZoomAndPanIntegration("#txt2img_controlnet", ["#txt2img_controlnet_ControlNet_input_image"]); window.applyZoomAndPanIntegration("#img2img_controlnet", ["#img2img_controlnet_ControlNet_input_image"]); //console.log("window.applyZoomAndPanIntegration applied."); } else { //console.log("window.applyZoomAndPanIntegration is not available."); } } }); })(); ================================================ FILE: javascript/controlnet_unit.mjs ================================================ const ImgChangeType = { NO_CHANGE: 0, REMOVE: 1, ADD: 2, SRC_CHANGE: 3, }; function imgChangeObserved(mutationsList) { // Iterate over all mutations that just occured for (let mutation of mutationsList) { // Check if the mutation is an addition or removal of a node if (mutation.type === 'childList') { // Check if nodes were added if (mutation.addedNodes.length > 0) { for (const node of mutation.addedNodes) { if (node.tagName === 'IMG') { return ImgChangeType.ADD; } } } // Check if nodes were removed if (mutation.removedNodes.length > 0) { for (const node of mutation.removedNodes) { if (node.tagName === 'IMG') { return ImgChangeType.REMOVE; } } } } // Check if the mutation is a change of an attribute else if (mutation.type === 'attributes') { if (mutation.target.tagName === 'IMG' && mutation.attributeName === 'src') { return ImgChangeType.SRC_CHANGE; } } } return ImgChangeType.NO_CHANGE; } function childIndex(element) { // Get all child nodes of the parent let children = Array.from(element.parentNode.childNodes); // Filter out non-element nodes (like text nodes and comments) children = children.filter(child => child.nodeType === Node.ELEMENT_NODE); return children.indexOf(element); } export class ControlNetUnit { constructor(tab, accordion) { this.tab = tab; this.accordion = accordion; this.isImg2Img = tab.querySelector('.cnet-unit-enabled').id.includes('img2img'); this.enabledCheckbox = tab.querySelector('.cnet-unit-enabled input'); this.inputImage = tab.querySelector('.cnet-input-image-group .cnet-image input[type="file"]'); this.inputImageContainer = tab.querySelector('.cnet-input-image-group .cnet-image'); this.inputImageGroup = tab.querySelector('.cnet-input-image-group'); this.controlTypeSelector = tab.querySelectorAll('.controlnet_control_type_filter_group input'); this.resizeModeRadios = tab.querySelectorAll('.controlnet_resize_mode_radio input[type="radio"]'); this.runPreprocessorButton = tab.querySelector('.cnet-run-preprocessor'); this.generatedImageGroup = tab.querySelector('.cnet-generated-image-group'); this.poseEditButton = tab.querySelector('.cnet-edit-pose'); this.allowPreviewCheckbox = tab.querySelector('.cnet-allow-preview input'); const tabs = tab.parentNode; this.tabNav = tabs.querySelector('.tab-nav'); this.tabIndex = childIndex(tab) - 1; // -1 because tab-nav is also at the same level. this.attachEnabledButtonListener(); this.attachControlTypeRadioListener(); this.attachTabNavChangeObserver(); this.attachImageUploadListener(); this.attachImageStateChangeObserver(); this.attachA1111SendInfoObserver(); } getTabNavButton() { return this.tabNav.querySelector(`:nth-child(${this.tabIndex + 1})`); } // Control type selector can be // - Radio // - Dropdown controlTypeSelectorIsDropdown() { return this.controlTypeSelector.length == 1; } getActiveControlType() { if (this.controlTypeSelectorIsDropdown()) { return this.controlTypeSelector[0].value; } for (let radio of this.controlTypeSelector) { if (radio.checked) { return radio.value; } } return undefined; } updateActiveState() { const tabNavButton = this.getTabNavButton(); if (!tabNavButton) return; if (this.enabledCheckbox.checked) { tabNavButton.classList.add('cnet-unit-active'); } else { tabNavButton.classList.remove('cnet-unit-active'); } } updateActiveUnitCount() { function getActiveUnitCount(checkboxes) { let activeUnitCount = 0; for (const checkbox of checkboxes) { if (checkbox.checked) activeUnitCount++; } return activeUnitCount; } const checkboxes = this.accordion.querySelectorAll('.cnet-unit-enabled input'); const span = this.accordion.querySelector('.label-wrap span'); // Remove existing badge. if (span.childNodes.length !== 1) { span.removeChild(span.lastChild); } // Add new badge if necessary. const activeUnitCount = getActiveUnitCount(checkboxes); if (activeUnitCount > 0) { const div = document.createElement('div'); div.classList.add('cnet-badge'); div.classList.add('primary'); div.innerHTML = `${activeUnitCount} unit${activeUnitCount > 1 ? 's' : ''}`; span.appendChild(div); } } /** * Add the active control type to tab displayed text. */ updateActiveControlType() { const tabNavButton = this.getTabNavButton(); if (!tabNavButton) return; // Remove the control if exists const controlTypeSuffix = tabNavButton.querySelector('.control-type-suffix'); if (controlTypeSuffix) controlTypeSuffix.remove(); // Add new suffix. const controlType = this.getActiveControlType(); if (controlType === 'All') return; const span = document.createElement('span'); span.innerHTML = `[${controlType}]`; span.classList.add('control-type-suffix'); tabNavButton.appendChild(span); } attachEnabledButtonListener() { this.enabledCheckbox.addEventListener('change', () => { this.updateActiveState(); this.updateActiveUnitCount(); }); } attachControlTypeRadioListener() { if (this.controlTypeSelectorIsDropdown()) { const input = this.controlTypeSelector[0]; const desc = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, "value"); const tab = this; Object.defineProperty(input, "value", { get: desc.get, set: function (v) { desc.set.call(this, v); tab.updateActiveControlType(); }, }); return; } for (const radio of this.controlTypeSelector) { radio.addEventListener('change', () => { this.updateActiveControlType(); }); } } /** * Each time the active tab change, all tab nav buttons are cleared and * regenerated by gradio. So we need to reapply the active states on * them. */ attachTabNavChangeObserver() { new MutationObserver((mutationsList) => { for (const mutation of mutationsList) { if (mutation.type === 'childList') { this.updateActiveState(); this.updateActiveControlType(); } } }).observe(this.tabNav, { childList: true }); } attachImageUploadListener() { // Automatically check `enable` checkbox when image is uploaded. this.inputImage.addEventListener('change', (event) => { if (!event.target.files) return; if (!this.enabledCheckbox.checked) this.enabledCheckbox.click(); }); // Automatically check `enable` checkbox when JSON pose file is uploaded. this.tab.querySelector('.cnet-upload-pose input').addEventListener('change', (event) => { if (!event.target.files) return; if (!this.enabledCheckbox.checked) this.enabledCheckbox.click(); }); } attachImageStateChangeObserver() { new MutationObserver((mutationsList) => { const changeObserved = imgChangeObserved(mutationsList); if (changeObserved === ImgChangeType.ADD) { // enabling the run preprocessor button this.runPreprocessorButton.removeAttribute("disabled"); this.runPreprocessorButton.title = 'Run preprocessor'; } if (changeObserved === ImgChangeType.REMOVE) { // disabling the run preprocessor button this.runPreprocessorButton.setAttribute("disabled", true); this.runPreprocessorButton.title = "No ControlNet input image available"; } }).observe(this.inputImageContainer, { childList: true, subtree: true, }); } /** * Observe send PNG info buttons in A1111, as they can also directly * set states of ControlNetUnit. */ attachA1111SendInfoObserver() { const pasteButtons = gradioApp().querySelectorAll('#paste'); const pngButtons = gradioApp().querySelectorAll( this.isImg2Img ? '#img2img_tab, #inpaint_tab' : '#txt2img_tab' ); for (const button of [...pasteButtons, ...pngButtons]) { button.addEventListener('click', () => { // The paste/send img generation info feature goes // though gradio, which is pretty slow. Ideally we should // observe the event when gradio has done the job, but // that is not an easy task. // Here we just do a 2 second delay until the refresh. setTimeout(() => { this.updateActiveState(); this.updateActiveUnitCount(); }, 2000); }); } } } ================================================ FILE: javascript/index.mjs ================================================ import { ControlNetUnit } from "./controlnet_unit.mjs"; import { initControlNetModals } from "./modal.mjs"; import { OpenposeEditor } from "./openpose_editor.mjs"; import { loadPhotopea } from "./photopea.mjs"; (function () { const cnetAllAccordions = new Set(); onUiUpdate(() => { gradioApp().querySelectorAll('#controlnet').forEach(accordion => { if (cnetAllAccordions.has(accordion)) return; accordion.querySelectorAll('.cnet-unit-tab') .forEach(tab => { const unit = new ControlNetUnit(tab, accordion); const openposeEditor = new OpenposeEditor(unit); }); initControlNetModals(accordion); loadPhotopea(accordion); cnetAllAccordions.add(accordion); }); }); })(); ================================================ FILE: javascript/modal.mjs ================================================ export function initControlNetModals(container) { // Get all the buttons that open a modal const btns = container.querySelectorAll(".cnet-modal-open"); // Get all the elements that close a modal const spans = container.querySelectorAll(".cnet-modal-close"); // For each button, add a click event listener that opens the corresponding modal btns.forEach((btn) => { const modalId = btn.id.replace('cnet-modal-open-', ''); const modal = container.querySelector("#cnet-modal-" + modalId); btn.addEventListener('click', () => { modal.style.display = "block"; }); }); // For each element, add a click event listener that closes the corresponding modal spans.forEach((span) => { const modal = span.parentNode; span.addEventListener('click', () => { modal.style.display = "none"; }); }); } ================================================ FILE: javascript/openpose_editor.mjs ================================================ import { ControlNetUnit } from "./controlnet_unit.mjs"; export class OpenposeEditor { /** * OpenposeEditor * @param {ControlNetUnit} unit */ constructor(unit) { this.unit = unit; this.iframe = this.unit.generatedImageGroup.querySelector('.cnet-modal iframe'); this.closeModalButton = this.unit.generatedImageGroup.querySelector('.cnet-modal .cnet-modal-close'); this.uploadButton = this.unit.inputImageGroup.querySelector('.cnet-upload-pose input'); this.editorURL = null; this.unit.poseEditButton.addEventListener('click', this.trigger.bind(this)); // Updates preview image when edit is done. window.addEventListener('message', ((event) => { const message = event.data; const modalId = this.unit.poseEditButton.id.replace('cnet-modal-open-', ''); if (message.modalId !== modalId) return; this.updatePreviewPose(message.poseURL); this.closeModalButton.click(); }).bind(this)); // Updates preview image when JSON file is uploaded. this.uploadButton.addEventListener('change', ((event) => { const file = event.target.files[0]; if (!file) return; const reader = new FileReader(); reader.onload = (e) => { const contents = e.target.result; const poseURL = `data:application/json;base64,${btoa(contents)}`; this.updatePreviewPose(poseURL); }; reader.readAsText(file); // Reset the file input value so that uploading the same file still triggers callback. event.target.value = ''; }).bind(this)); } async checkEditorAvailable() { const LOCAL_EDITOR_PATH = '/openpose_editor_index'; const REMOTE_EDITOR_PATH = 'https://huchenlei.github.io/sd-webui-openpose-editor/'; async function testEditorPath(path) { const res = await fetch(path); return res.status === 200 ? path : null; } // Use local editor if the user has the extension installed. Fallback // onto remote editor if the local editor is not ready yet. // See https://github.com/huchenlei/sd-webui-openpose-editor/issues/53 // for more details. return await testEditorPath(LOCAL_EDITOR_PATH) || await testEditorPath(REMOTE_EDITOR_PATH); } navigateIframe() { const iframe = this.iframe; const editorURL = this.editorURL; function getPathname(rawURL) { try { return new URL(rawURL).pathname; } catch (e) { return rawURL; } } return new Promise((resolve) => { const darkThemeParam = document.body.classList.contains('dark') ? new URLSearchParams({ theme: 'dark' }).toString() : ''; window.addEventListener('message', (event) => { const message = event.data; if (message['ready']) resolve(); }, { once: true }); if ((editorURL.startsWith("http") ? iframe.src : getPathname(iframe.src)) !== editorURL) { iframe.src = `${editorURL}?${darkThemeParam}`; // By default assume 5 second is enough for the openpose editor // to load. setTimeout(resolve, 5000); } else { // If no navigation is required, immediately return. resolve(); } }); } // When edit button is clicked. async trigger() { const inputImageGroup = this.unit.tab.querySelector('.cnet-input-image-group'); const inputImage = inputImageGroup.querySelector('.cnet-image img'); const downloadLink = this.unit.generatedImageGroup.querySelector('.cnet-download-pose a'); const modalId = this.unit.poseEditButton.id.replace('cnet-modal-open-', ''); if (!this.editorURL) { this.editorURL = await this.checkEditorAvailable(); if (!this.editorURL) { alert("No openpose editor available.") } } await this.navigateIframe(); this.iframe.contentWindow.postMessage({ modalId, imageURL: inputImage ? inputImage.src : undefined, poseURL: downloadLink.href, }, '*'); // Focus the iframe so that the focus is no longer on the `Edit` button. // Pressing space when the focus is on `Edit` button will trigger // the click again to resend the frame message. this.iframe.contentWindow.focus(); } /* * Writes the pose data URL to an link element on input image group. * Click a hidden button to trigger a backend rendering of the pose JSON. * * The backend should: * - Set the rendered pose image as preprocessor generated image. */ updatePreviewPose(poseURL) { const downloadLink = this.unit.generatedImageGroup.querySelector('.cnet-download-pose a'); const renderButton = this.unit.generatedImageGroup.querySelector('.cnet-render-pose'); const poseTextbox = this.unit.generatedImageGroup.querySelector('.cnet-pose-json textarea'); const allowPreviewCheckbox = this.unit.allowPreviewCheckbox; if (!allowPreviewCheckbox.checked) allowPreviewCheckbox.click(); // Only set href when download link exists and needs an update. `downloadLink` // can be null when user closes preview and click `Upload JSON` button again. // https://github.com/Mikubill/sd-webui-controlnet/issues/2308 if (downloadLink !== null) downloadLink.href = poseURL; poseTextbox.value = poseURL; // Simulate an `input` DOM event for Gradio Textbox component. Needed after you edit its contents in javascript, otherwise your edits // will only visible on web page and not sent to python. function updateInput(target) { let e = new Event("input", { bubbles: true }) Object.defineProperty(e, "target", { value: target }) target.dispatchEvent(e); } updateInput(poseTextbox); renderButton.click(); } } ================================================ FILE: javascript/photopea.mjs ================================================ /* MIT LICENSE Copyright 2011 Jon Leighton Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // From: https://gist.github.com/jonleighton/958841 function base64ArrayBuffer(arrayBuffer) { var base64 = '' var encodings = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/' var bytes = new Uint8Array(arrayBuffer) var byteLength = bytes.byteLength var byteRemainder = byteLength % 3 var mainLength = byteLength - byteRemainder var a, b, c, d var chunk // Main loop deals with bytes in chunks of 3 for (var i = 0; i < mainLength; i = i + 3) { // Combine the three bytes into a single integer chunk = (bytes[i] << 16) | (bytes[i + 1] << 8) | bytes[i + 2] // Use bitmasks to extract 6-bit segments from the triplet a = (chunk & 16515072) >> 18 // 16515072 = (2^6 - 1) << 18 b = (chunk & 258048) >> 12 // 258048 = (2^6 - 1) << 12 c = (chunk & 4032) >> 6 // 4032 = (2^6 - 1) << 6 d = chunk & 63 // 63 = 2^6 - 1 // Convert the raw binary segments to the appropriate ASCII encoding base64 += encodings[a] + encodings[b] + encodings[c] + encodings[d] } // Deal with the remaining bytes and padding if (byteRemainder == 1) { chunk = bytes[mainLength] a = (chunk & 252) >> 2 // 252 = (2^6 - 1) << 2 // Set the 4 least significant bits to zero b = (chunk & 3) << 4 // 3 = 2^2 - 1 base64 += encodings[a] + encodings[b] + '==' } else if (byteRemainder == 2) { chunk = (bytes[mainLength] << 8) | bytes[mainLength + 1] a = (chunk & 64512) >> 10 // 64512 = (2^6 - 1) << 10 b = (chunk & 1008) >> 4 // 1008 = (2^6 - 1) << 4 // Set the 2 least significant bits to zero c = (chunk & 15) << 2 // 15 = 2^4 - 1 base64 += encodings[a] + encodings[b] + encodings[c] + '=' } return base64 } // Turn a base64 string into a blob. // From https://gist.github.com/gauravmehla/7a7dfd87dd7d1b13697b6e894426615f function b64toBlob(b64Data, contentType, sliceSize) { var contentType = contentType || ''; var sliceSize = sliceSize || 512; var byteCharacters = atob(b64Data); var byteArrays = []; for (var offset = 0; offset < byteCharacters.length; offset += sliceSize) { var slice = byteCharacters.slice(offset, offset + sliceSize); var byteNumbers = new Array(slice.length); for (var i = 0; i < slice.length; i++) { byteNumbers[i] = slice.charCodeAt(i); } var byteArray = new Uint8Array(byteNumbers); byteArrays.push(byteArray); } return new Blob(byteArrays, { type: contentType }); } function createBlackImageBase64(width, height) { // Create a canvas element var canvas = document.createElement('canvas'); canvas.width = width; canvas.height = height; // Get the context of the canvas var ctx = canvas.getContext('2d'); // Fill the canvas with black color ctx.fillStyle = 'black'; ctx.fillRect(0, 0, width, height); // Get the base64 encoded string var base64Image = canvas.toDataURL('image/png'); return base64Image; } // Functions to be called within photopea context. // Start of photopea functions function pasteImage(base64image) { app.open(base64image, null, /* asSmart */ true); app.echoToOE("success"); } function setLayerNames(names) { const layers = app.activeDocument.layers; if (layers.length !== names.length) { console.error("layer length does not match names length"); echoToOE("error"); return; } for (let i = 0; i < names.length; i++) { const layer = layers[i]; layer.name = names[i]; } app.echoToOE("success"); } function removeLayersWithNames(names) { const layers = app.activeDocument.layers; for (let i = 0; i < layers.length; i++) { const layer = layers[i]; if (names.includes(layer.name)) { layer.remove(); } } app.echoToOE("success"); } function getAllLayerNames() { const layers = app.activeDocument.layers; const names = []; for (let i = 0; i < layers.length; i++) { const layer = layers[i]; names.push(layer.name); } app.echoToOE(JSON.stringify(names)); } // Hides all layers except the current one, outputs the whole image, then restores the previous // layers state. function exportSelectedLayerOnly(format, layerName) { // Gets all layers recursively, including the ones inside folders. function getAllArtLayers(document) { let allArtLayers = []; for (let i = 0; i < document.layers.length; i++) { const currentLayer = document.layers[i]; allArtLayers.push(currentLayer); if (currentLayer.typename === "LayerSet") { allArtLayers = allArtLayers.concat(getAllArtLayers(currentLayer)); } } return allArtLayers; } function makeLayerVisible(layer) { let currentLayer = layer; while (currentLayer != app.activeDocument) { currentLayer.visible = true; if (currentLayer.parent.typename != 'Document') { currentLayer = currentLayer.parent; } else { break; } } } const allLayers = getAllArtLayers(app.activeDocument); // Make all layers except the currently selected one invisible, and store // their initial state. const layerStates = []; for (let i = 0; i < allLayers.length; i++) { const layer = allLayers[i]; layerStates.push(layer.visible); } // Hide all layers to begin with for (let i = 0; i < allLayers.length; i++) { const layer = allLayers[i]; layer.visible = false; } for (let i = 0; i < allLayers.length; i++) { const layer = allLayers[i]; const selected = layer.name === layerName; if (selected) { makeLayerVisible(layer); } } app.activeDocument.saveToOE(format); for (let i = 0; i < allLayers.length; i++) { const layer = allLayers[i]; layer.visible = layerStates[i]; } } function hasActiveDocument() { app.echoToOE(app.documents.length > 0 ? "true" : "false"); } // End of photopea functions const MESSAGE_END_ACK = "done"; const MESSAGE_ERROR = "error"; const PHOTOPEA_URL = "https://www.photopea.com/"; class PhotopeaContext { constructor(photopeaIframe) { this.photopeaIframe = photopeaIframe; this.timeout = 1000; } navigateIframe() { const iframe = this.photopeaIframe; const editorURL = PHOTOPEA_URL; return new Promise(async (resolve) => { if (iframe.src !== editorURL) { iframe.src = editorURL; // Stop waiting after 10s. setTimeout(resolve, 10000); // Testing whether photopea is able to accept message. while (true) { try { await this.invoke(hasActiveDocument); break; } catch (e) { console.log("Keep waiting for photopea to accept message."); } } this.timeout = 5000; // Restore to a longer timeout in normal messaging. } resolve(); }); } // From https://github.com/huchenlei/stable-diffusion-ps-pea/blob/main/src/Photopea.ts postMessageToPhotopea(message) { return new Promise((resolve, reject) => { const responseDataPieces = []; let hasError = false; const photopeaMessageHandle = (event) => { if (event.source !== this.photopeaIframe.contentWindow) { return; } // Filter out the ping messages if (typeof event.data === 'string' && event.data.includes('MSFAPI#')) { return; } // Ignore "done" when no data has been received. The "done" can come from // MSFAPI ping. if (event.data === MESSAGE_END_ACK && responseDataPieces.length === 0) { return; } if (event.data === MESSAGE_END_ACK) { window.removeEventListener("message", photopeaMessageHandle); if (hasError) { reject('Photopea Error.'); } else { resolve(responseDataPieces.length === 1 ? responseDataPieces[0] : responseDataPieces); } } else if (event.data === MESSAGE_ERROR) { responseDataPieces.push(event.data); hasError = true; } else { responseDataPieces.push(event.data); } }; window.addEventListener("message", photopeaMessageHandle); setTimeout(() => reject("Photopea message timeout"), this.timeout); this.photopeaIframe.contentWindow.postMessage(message, "*"); }); } // From https://github.com/huchenlei/stable-diffusion-ps-pea/blob/main/src/Photopea.ts async invoke(func, ...args) { await this.navigateIframe(); const message = `${func.toString()} ${func.name}(${args.map(arg => JSON.stringify(arg)).join(',')});`; try { return await this.postMessageToPhotopea(message); } catch (e) { throw `Failed to invoke ${func.name}. ${e}.`; } } /** * Fetch detected maps from each ControlNet units. * Create a new photopea document. * Add those detected maps to the created document. */ async fetchFromControlNet(tabs) { if (tabs.length === 0) return; const isImg2Img = tabs[0].querySelector('.cnet-unit-enabled').id.includes('img2img'); const generationType = isImg2Img ? 'img2img' : 'txt2img'; const width = gradioApp().querySelector(`#${generationType}_width input[type=number]`).value; const height = gradioApp().querySelector(`#${generationType}_height input[type=number]`).value; const layerNames = ["background"]; await this.invoke(pasteImage, createBlackImageBase64(width, height)); await new Promise(r => setTimeout(r, 200)); for (const [i, tab] of tabs.entries()) { const generatedImage = tab.querySelector('.cnet-generated-image-group .cnet-image img'); if (!generatedImage) continue; await this.invoke(pasteImage, generatedImage.src); // Wait 200ms for pasting to fully complete so that we do not ended up with 2 separate // documents. await new Promise(r => setTimeout(r, 200)); layerNames.push(`unit-${i}`); } await this.invoke(removeLayersWithNames, layerNames); await this.invoke(setLayerNames, layerNames.reverse()); } /** * Send the images in the active photopea document back to each ControlNet units. */ async sendToControlNet(tabs) { // Gradio's image widgets are inputs. To set the image in one, we set the image on the input and // force it to refresh. function setImageOnInput(imageInput, file) { // Createa a data transfer element to set as the data in the input. const dt = new DataTransfer(); dt.items.add(file); const list = dt.files; // Actually set the image in the image widget. imageInput.files = list; // Foce the image widget to update with the new image, after setting its source files. const event = new Event('change', { 'bubbles': true, "composed": true }); imageInput.dispatchEvent(event); } function sendToControlNetUnit(b64Image, index) { const tab = tabs[index]; // Upload image to output image element. const outputImage = tab.querySelector('.cnet-photopea-output'); const outputImageUpload = outputImage.querySelector('input[type="file"]'); setImageOnInput(outputImageUpload, new File([b64toBlob(b64Image, "image/png")], "photopea_output.png")); // Make sure `UsePreviewAsInput` checkbox is checked. const checkbox = tab.querySelector('.cnet-preview-as-input input[type="checkbox"]'); if (!checkbox.checked) { checkbox.click(); } } const layerNames = JSON.parse(await this.invoke(getAllLayerNames)) .filter(name => /unit-\d+/.test(name)); for (const layerName of layerNames) { const arrayBuffer = await this.invoke(exportSelectedLayerOnly, 'PNG', layerName); const b64Image = base64ArrayBuffer(arrayBuffer); const layerIndex = Number.parseInt(layerName.split('-')[1]); sendToControlNetUnit(b64Image, layerIndex); } } } let photopeaWarningShown = false; function firstTimeUserPrompt() { if (opts.controlnet_photopea_warning) { const photopeaPopupMsg = "you are about to connect to https://photopea.com\n" + "- Click OK: proceed.\n" + "- Click Cancel: abort.\n" + "Photopea integration can be disabled in Settings > ControlNet > Disable photopea edit.\n" + "This popup can be disabled in Settings > ControlNet > Photopea popup warning."; if (photopeaWarningShown || confirm(photopeaPopupMsg)) photopeaWarningShown = true; else return false; } return true; } export function loadPhotopea(accordion) { const photopeaMainTrigger = accordion.querySelector('.cnet-photopea-main-trigger'); // Photopea edit feature disabled. if (!photopeaMainTrigger) { console.log("ControlNet photopea edit disabled."); return; } const closeModalButton = accordion.querySelector('.cnet-photopea-edit .cnet-modal-close'); const tabs = accordion.querySelectorAll('.cnet-unit-tab'); const photopeaIframe = accordion.querySelector('.photopea-iframe'); const photopeaContext = new PhotopeaContext(photopeaIframe, tabs); tabs.forEach(tab => { const photopeaChildTrigger = tab.querySelector('.cnet-photopea-child-trigger'); photopeaChildTrigger.addEventListener('click', async () => { if (!firstTimeUserPrompt()) return; photopeaMainTrigger.click(); if (await photopeaContext.invoke(hasActiveDocument) === "false") { await photopeaContext.fetchFromControlNet(tabs); } }); }); accordion.querySelector('.photopea-fetch').addEventListener('click', () => photopeaContext.fetchFromControlNet(tabs)); accordion.querySelector('.photopea-send').addEventListener('click', () => { photopeaContext.sendToControlNet(tabs) closeModalButton.click(); }); } ================================================ FILE: models/put_controlnet_models_here.txt ================================================ put_controlnet_models_here ================================================ FILE: patch_version.py ================================================ import re import subprocess def get_current_version(filename): version_pattern = r"version_flag\s*=\s*'v(\d+\.\d+\.\d+)'" with open(filename, "r") as file: content = file.read() match = re.search(version_pattern, content) if match: return match.group(1) else: raise ValueError("Version number not found in the file") def increment_version(version): major, minor, patch = map(int, version.split(".")) patch += 1 # Increment the patch number return f"{major}.{minor}.{patch}" def update_version_file(filename, new_version): with open(filename, "r") as file: content = file.read() new_content = re.sub( r"version_flag = 'v\d+\.\d+\.\d+'", f"version_flag = 'v{new_version}'", content ) with open(filename, "w") as file: file.write(new_content) def git_commit_and_tag(filename, new_version): commit_message = f":memo: Update to version v{new_version}" tag_name = f"v{new_version}" # Commit the changes subprocess.run(["git", "add", filename], check=True) subprocess.run(["git", "commit", "-m", commit_message], check=True) # Create a new tag subprocess.run(["git", "tag", tag_name], check=True) if __name__ == "__main__": filename = "scripts/controlnet_version.py" current_version = get_current_version(filename) new_version = increment_version(current_version) update_version_file(filename, new_version) git_commit_and_tag(filename, new_version) ================================================ FILE: preload.py ================================================ def preload(parser): parser.add_argument( "--controlnet-dir", type=str, help="Path to directory with ControlNet models", default=None, ) parser.add_argument( "--controlnet-annotator-models-path", type=str, help="Path to directory with annotator model directories", default=None, ) parser.add_argument( "--no-half-controlnet", action="store_true", help="do not switch the ControlNet models to 16-bit floats (only needed without --no-half)", default=None, ) # Setting default max_size=16 as each cache entry contains image as both key # and value (Very costly). parser.add_argument( "--controlnet-preprocessor-cache-size", type=int, help="Cache size for controlnet preprocessor results", default=16, ) parser.add_argument( "--controlnet-loglevel", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", ) parser.add_argument( "--controlnet-tracemalloc", action="store_true", help="Enable memory tracing.", default=None, ) ================================================ FILE: pyproject.toml ================================================ [tool.ruff] target-version = "py310" exclude = [ "annotator", "tests", "web_tests", "example", "extract_controlnet_diff.py", "scripts/movie2movie.py", "scripts/preprocessor/legacy/preprocessor_compiled.py", "scripts/preprocessor/__init__.py", ] ignore = [ "E501", # Line too long "E721", # Do not compare types, use `isinstance` "E731", # Do not assign a `lambda` expression, use a `def` "I001", # Import block is un-sorted or un-formatted "C901", # Function is too complex "C408", # Rewrite as a literal "W605", # invalid escape sequence, messes with some docstrings ] ================================================ FILE: requirements.txt ================================================ fvcore mediapipe opencv-python>=4.8.0 svglib addict yapf albumentations==1.4.3 matplotlib facexlib timm<=0.9.5 pydantic<=1.10.17 controlnet_aux>=0.0.9 ================================================ FILE: scripts/adapter.py ================================================ import torch import torch.nn as nn from collections import OrderedDict from copy import deepcopy from modules import devices cond_cast_unet = getattr(devices, 'cond_cast_unet', lambda x: x) class TorchHijackForUnet: """ This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match; this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64 """ def __getattr__(self, item): if item == 'cat': return self.cat if hasattr(torch, item): return getattr(torch, item) raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item)) def cat(self, tensors, *args, **kwargs): if len(tensors) == 2: a, b = tensors if a.shape[-2:] != b.shape[-2:]: a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest") tensors = (a, b) return torch.cat(tensors, *args, **kwargs) th = TorchHijackForUnet() def align(hint, size): b, c, h1, w1 = hint.shape h, w = size if h != h1 or w != w1: hint = th.nn.functional.interpolate(hint, size=size, mode="nearest") return hint class PlugableAdapter(nn.Module): def __init__(self, control_model) -> None: super().__init__() self.control_model = control_model self.control = None self.hint_cond = None def reset(self): self.control = None self.hint_cond = None def forward(self, hint=None, x=None, *args, **kwargs): if self.control is not None: return deepcopy(self.control) self.hint_cond = cond_cast_unet(hint) hint_in = cond_cast_unet(hint) if hasattr(self.control_model, 'conv_in') and \ (self.control_model.conv_in.in_channels == 64 or self.control_model.conv_in.in_channels == 256): hint_in = hint_in[:, 0:1, :, :] self.control = self.control_model(hint_in) return deepcopy(self.control) def aggressive_lowvram(self): self.to(devices.get_device_for("controlnet")) return def fullvram(self): self.to(devices.get_device_for("controlnet")) return def conv_nd(dims, *args, **kwargs): """ Create a 1D, 2D, or 3D convolution module. """ if dims == 1: return nn.Conv1d(*args, **kwargs) elif dims == 2: return nn.Conv2d(*args, **kwargs) elif dims == 3: return nn.Conv3d(*args, **kwargs) raise ValueError(f"unsupported dimensions: {dims}") def avg_pool_nd(dims, *args, **kwargs): """ Create a 1D, 2D, or 3D average pooling module. """ if dims == 1: return nn.AvgPool1d(*args, **kwargs) elif dims == 2: return nn.AvgPool2d(*args, **kwargs) elif dims == 3: return nn.AvgPool3d(*args, **kwargs) raise ValueError(f"unsupported dimensions: {dims}") class Downsample(nn.Module): """ A downsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then downsampling occurs in the inner-two dimensions. """ def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1): super().__init__() self.channels = channels self.out_channels = out_channels or channels self.use_conv = use_conv self.dims = dims stride = 2 if dims != 3 else (1, 2, 2) if use_conv: self.op = conv_nd( dims, self.channels, self.out_channels, 3, stride=stride, padding=padding ) else: assert self.channels == self.out_channels self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) def forward(self, x): assert x.shape[1] == self.channels return self.op(x) class ResnetBlock(nn.Module): def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True): super().__init__() ps = ksize//2 if in_c != out_c or sk is False: self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps) else: # print('n_in') self.in_conv = None self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1) self.act = nn.ReLU() self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps) if sk is False: self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps) else: self.skep = None self.down = down if self.down is True: self.down_opt = Downsample(in_c, use_conv=use_conv) def forward(self, x): if self.down is True: x = self.down_opt(x) if self.in_conv is not None: # edit x = self.in_conv(x) h = self.block1(x) h = self.act(h) h = self.block2(h) if self.skep is not None: return h + self.skep(x) else: return h + x class Adapter(nn.Module): def __init__(self, channels=[320, 640, 1280, 1280], nums_rb=3, cin=64, ksize=3, sk=False, use_conv=True, is_sdxl=True): super(Adapter, self).__init__() if is_sdxl: self.pixel_shuffle = 16 downsample_avoided = [1] downsample_layers = [2] else: self.pixel_shuffle = 8 downsample_avoided = [] downsample_layers = [3, 2, 1] self.input_channels = cin // (self.pixel_shuffle * self.pixel_shuffle) self.channels = channels self.nums_rb = nums_rb self.body = [] self.unshuffle = nn.PixelUnshuffle(self.pixel_shuffle) for i in range(len(channels)): for r in range(nums_rb): if i in downsample_layers and r == 0: self.body.append(ResnetBlock( channels[i - 1], channels[i], down=True, ksize=ksize, sk=sk, use_conv=use_conv)) continue if i in downsample_avoided and r == 0: self.body.append(ResnetBlock( channels[i - 1], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv)) continue self.body.append(ResnetBlock( channels[i], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv )) self.body = nn.ModuleList(self.body) self.conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1) def forward(self, x): self.to(x.device) x = self.unshuffle(x) hs = [] x = self.conv_in(x) for i in range(len(self.channels)): for r in range(self.nums_rb): idx = i * self.nums_rb + r x = self.body[idx](x) hs.append(x) self.to('cpu') return hs class LayerNorm(nn.LayerNorm): """Subclass torch's LayerNorm to handle fp16.""" def forward(self, x: torch.Tensor): orig_type = x.dtype ret = super().forward(x.type(torch.float32)) return ret.type(orig_type) class QuickGELU(nn.Module): def forward(self, x: torch.Tensor): return x * torch.sigmoid(1.702 * x) class ResidualAttentionBlock(nn.Module): def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): super().__init__() self.attn = nn.MultiheadAttention(d_model, n_head) self.ln_1 = LayerNorm(d_model) self.mlp = nn.Sequential( OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()), ("c_proj", nn.Linear(d_model * 4, d_model))])) self.ln_2 = LayerNorm(d_model) self.attn_mask = attn_mask def attention(self, x: torch.Tensor): self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] def forward(self, x: torch.Tensor): x = x + self.attention(self.ln_1(x)) x = x + self.mlp(self.ln_2(x)) return x class StyleAdapter(nn.Module): def __init__(self, width=1024, context_dim=768, num_head=8, n_layes=3, num_token=4): super().__init__() scale = width ** -0.5 self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(width, num_head) for _ in range(n_layes)]) self.num_token = num_token self.style_embedding = nn.Parameter(torch.randn(1, num_token, width) * scale) self.ln_post = LayerNorm(width) self.ln_pre = LayerNorm(width) self.proj = nn.Parameter(scale * torch.randn(width, context_dim)) def forward(self, x): # x shape [N, HW+1, C] style_embedding = self.style_embedding + torch.zeros( (x.shape[0], self.num_token, self.style_embedding.shape[-1]), device=x.device) x = torch.cat([x, style_embedding], dim=1) x = self.ln_pre(x) x = x.permute(1, 0, 2) # NLD -> LND x = self.transformer_layes(x) x = x.permute(1, 0, 2) # LND -> NLD x = self.ln_post(x[:, -self.num_token:, :]) x = x @ self.proj return x class ResnetBlock_light(nn.Module): def __init__(self, in_c): super().__init__() self.block1 = nn.Conv2d(in_c, in_c, 3, 1, 1) self.act = nn.ReLU() self.block2 = nn.Conv2d(in_c, in_c, 3, 1, 1) def forward(self, x): h = self.block1(x) h = self.act(h) h = self.block2(h) return h + x class extractor(nn.Module): def __init__(self, in_c, inter_c, out_c, nums_rb, down=False): super().__init__() self.in_conv = nn.Conv2d(in_c, inter_c, 1, 1, 0) self.body = [] for _ in range(nums_rb): self.body.append(ResnetBlock_light(inter_c)) self.body = nn.Sequential(*self.body) self.out_conv = nn.Conv2d(inter_c, out_c, 1, 1, 0) self.down = down if self.down is True: self.down_opt = Downsample(in_c, use_conv=False) def forward(self, x): if self.down is True: x = self.down_opt(x) x = self.in_conv(x) x = self.body(x) x = self.out_conv(x) return x class Adapter_light(nn.Module): def __init__(self, channels=[320, 640, 1280, 1280], nums_rb=3, cin=64): super(Adapter_light, self).__init__() self.unshuffle = nn.PixelUnshuffle(8) self.channels = channels self.nums_rb = nums_rb self.body = [] for i in range(len(channels)): if i == 0: self.body.append(extractor(in_c=cin, inter_c=channels[i]//4, out_c=channels[i], nums_rb=nums_rb, down=False)) else: self.body.append(extractor(in_c=channels[i-1], inter_c=channels[i]//4, out_c=channels[i], nums_rb=nums_rb, down=True)) self.body = nn.ModuleList(self.body) def forward(self, x): # unshuffle x = self.unshuffle(x) # extract features features = [] for i in range(len(self.channels)): x = self.body[i](x) features.append(x) return features ================================================ FILE: scripts/animate_diff/batch.py ================================================ from scripts.external_code import ControlNetUnit from scripts.logging import logger from modules.processing import StableDiffusionProcessing from modules import shared def add_animate_diff_batch_input( p: StableDiffusionProcessing, unit: ControlNetUnit ) -> ControlNetUnit: """AnimateDiff + ControlNet batch processing.""" assert unit.is_animate_diff_batch batch_parameters = unit.batch_images.split("\n") batch_image_dir = batch_parameters[0] logger.info( f"AnimateDiff + ControlNet {unit.module} receive the following parameters:" ) logger.info(f"\tbatch control images: {batch_image_dir}") for ad_cn_batch_parameter in batch_parameters[1:]: if ad_cn_batch_parameter.startswith("mask:"): unit.batch_mask_dir = ad_cn_batch_parameter[len("mask:") :].strip() logger.info(f"\tbatch control mask: {unit.batch_mask_dir}") elif ad_cn_batch_parameter.startswith("keyframe:"): unit.batch_keyframe_idx = ad_cn_batch_parameter[len("keyframe:") :].strip() unit.batch_keyframe_idx = [ int(b_i.strip()) for b_i in unit.batch_keyframe_idx.split(",") ] logger.info(f"\tbatch control keyframe index: {unit.batch_keyframe_idx}") batch_image_files = shared.listfiles(batch_image_dir) for batch_modifier in getattr(unit, "batch_modifiers", []): batch_image_files = batch_modifier(batch_image_files, p) unit.batch_image_files = batch_image_files unit.image = [] for idx, image_path in enumerate(batch_image_files): mask_path = None if getattr(unit, "batch_mask_dir", None) is not None: batch_mask_files = shared.listfiles(unit.batch_mask_dir) if len(batch_mask_files) >= len(batch_image_files): mask_path = batch_mask_files[idx] else: mask_path = batch_mask_files[0] unit.image.append( { "image": image_path, "mask": mask_path, } ) return unit ================================================ FILE: scripts/api.py ================================================ from typing import List, Optional import base64 import io import torch import numpy as np from fastapi import FastAPI, Body from fastapi.exceptions import HTTPException from pydantic import BaseModel from PIL import Image import gradio as gr from modules.api.models import * # noqa:F403 from modules.api import api from scripts import external_code, global_state from scripts.logging import logger from scripts.external_code import ControlNetUnit from scripts.supported_preprocessor import Preprocessor from annotator.openpose import draw_poses, decode_json_as_poses from annotator.openpose.animalpose import draw_animalposes def encode_to_base64(image): if isinstance(image, str): return image elif isinstance(image, Image.Image): return api.encode_pil_to_base64(image) elif isinstance(image, np.ndarray): return encode_np_to_base64(image) else: return "" def encode_np_to_base64(image): pil = Image.fromarray(image) return api.encode_pil_to_base64(pil) def encode_tensor_to_base64(obj: torch.Tensor) -> str: """Serialize the tensor data to base64 string.""" buffer = io.BytesIO() torch.save(obj, buffer) buffer.seek(0) # Rewind the buffer return base64.b64encode(buffer.getvalue()).decode("utf-8") def controlnet_api(_: gr.Blocks, app: FastAPI): @app.get("/controlnet/version") async def version(): return {"version": external_code.get_api_version()} @app.get("/controlnet/model_list") async def model_list(update: bool = True): up_to_date_model_list = external_code.get_models(update=update) logger.debug(up_to_date_model_list) return {"model_list": up_to_date_model_list} @app.get("/controlnet/module_list") async def module_list(alias_names: bool = False): _module_list = external_code.get_modules(alias_names) logger.debug(_module_list) return { "module_list": _module_list, "module_detail": external_code.get_modules_detail(alias_names), } @app.get("/controlnet/control_types") async def control_types(): def format_control_type( filtered_preprocessor_list, filtered_model_list, default_option, default_model, ): return { "module_list": filtered_preprocessor_list, "model_list": filtered_model_list, "default_option": default_option, "default_model": default_model, } return { "control_types": { control_type: format_control_type( *global_state.select_control_type(control_type) ) for control_type in Preprocessor.get_all_preprocessor_tags() } } @app.get("/controlnet/settings") async def settings(): max_models_num = external_code.get_max_models_num() return {"control_net_unit_count": max_models_num} @app.post("/controlnet/detect") async def detect( controlnet_module: str = Body("none", title="Controlnet Module"), controlnet_input_images: List[str] = Body([], title="Controlnet Input Images"), controlnet_processor_res: int = Body( -1, title="Controlnet Processor Resolution" ), controlnet_threshold_a: float = Body(-1, title="Controlnet Threshold a"), controlnet_threshold_b: float = Body(-1, title="Controlnet Threshold b"), controlnet_masks: List[str] = Body([], title="Controlnet Masks"), low_vram: bool = Body(False, title="Low vram"), ): preprocessor = Preprocessor.get_preprocessor(controlnet_module) if preprocessor is None: raise HTTPException(status_code=422, detail="Module not available") if controlnet_module in ( "clip_vision", "revision_clipvision", "revision_ignore_prompt", "ip-adapter-auto", ): raise HTTPException(status_code=422, detail="Module not supported") if len(controlnet_input_images) == 0: raise HTTPException(status_code=422, detail="No image selected") if preprocessor.requires_mask and len(controlnet_masks) != len( controlnet_input_images ): raise HTTPException( status_code=422, detail=f"Preprocessor {controlnet_module} requires `controlnet_masks` param.", ) logger.info( f"Detecting {str(len(controlnet_input_images))} images with the {controlnet_module} module." ) unit = ControlNetUnit( enabled=True, module=preprocessor.label, processor_res=controlnet_processor_res, threshold_a=controlnet_threshold_a, threshold_b=controlnet_threshold_b, ) tensors = [] images = [] poses = [] for i, input_image in enumerate(controlnet_input_images): img = external_code.to_base64_nparray(input_image) # Has mask. if i < len(controlnet_masks): if preprocessor.accepts_mask: mask = external_code.to_base64_nparray(controlnet_masks[i])[ :, :, :1 ] img = np.concatenate([img, mask], axis=2) else: logger.warn( f"Preprocessor {controlnet_module} does not accept mask. Mask ignored" ) class JsonAcceptor: def __init__(self) -> None: self.value = None def accept(self, json_dict: dict) -> None: self.value = json_dict json_acceptor = JsonAcceptor() result = preprocessor.cached_call( img, resolution=unit.processor_res, slider_1=unit.threshold_a, slider_2=unit.threshold_b, json_pose_callback=json_acceptor.accept, low_vram=low_vram, ) if preprocessor.returns_image: images.append(encode_to_base64(result.display_images[0])) else: tensors.append(encode_tensor_to_base64(result.value)) if "openpose" in controlnet_module: assert json_acceptor.value is not None poses.append(json_acceptor.value) preprocessor.unload() res = {"info": "Success"} if poses: res["poses"] = poses if images: res["images"] = images if tensors: res["tensor"] = tensors return res class Person(BaseModel): pose_keypoints_2d: List[float] hand_right_keypoints_2d: Optional[List[float]] hand_left_keypoints_2d: Optional[List[float]] face_keypoints_2d: Optional[List[float]] class PoseData(BaseModel): people: List[Person] canvas_width: int canvas_height: int @app.post("/controlnet/render_openpose_json") async def render_openpose_json( pose_data: List[PoseData] = Body([], title="Pose json files to render.") ): if not pose_data: return {"info": "No pose data detected."} else: def draw(poses, animals, H, W): if poses: assert len(animals) == 0 return draw_poses(poses, H, W) else: return draw_animalposes(animals, H, W) return { "images": [ encode_to_base64(draw(*decode_json_as_poses(pose.dict()))) for pose in pose_data ], "info": "Success", } try: import modules.script_callbacks as script_callbacks script_callbacks.on_app_started(controlnet_api) except Exception: logger.warn("Unable to mount ControlNet API.") ================================================ FILE: scripts/batch_hijack.py ================================================ import os from typing import Tuple, List from modules import img2img, processing, shared, script_callbacks from scripts import external_code from scripts.enums import InputMode from scripts.logging import logger class BatchHijack: def __init__(self): self.is_batch = False self.batch_index = 0 self.batch_size = 1 self.init_seed = None self.init_subseed = None self.process_batch_callbacks = [self.on_process_batch] self.process_batch_each_callbacks = [] self.postprocess_batch_each_callbacks = [self.on_postprocess_batch_each] self.postprocess_batch_callbacks = [self.on_postprocess_batch] def img2img_process_batch_hijack(self, p, *args, **kwargs): try: from scripts.animatediff_utils import get_animatediff_arg ad_params = get_animatediff_arg(p) if ad_params and ad_params.enable: ad_params.is_i2i_batch = True from scripts.animatediff_i2ibatch import animatediff_i2i_batch return animatediff_i2i_batch(p, *args, **kwargs) except ImportError: pass cn_is_batch, batches, output_dir, _ = get_cn_batches(p) if not cn_is_batch: return getattr(img2img, '__controlnet_original_process_batch')(p, *args, **kwargs) self.dispatch_callbacks(self.process_batch_callbacks, p, batches, output_dir) try: return getattr(img2img, '__controlnet_original_process_batch')(p, *args, **kwargs) finally: self.dispatch_callbacks(self.postprocess_batch_callbacks, p) def processing_process_images_hijack(self, p, *args, **kwargs): try: from scripts.animatediff_utils import get_animatediff_arg ad_params = get_animatediff_arg(p) if ad_params and ad_params.enable: return getattr(processing, '__controlnet_original_process_images_inner')(p, *args, **kwargs) except ImportError: pass if self.is_batch: # we are in img2img batch tab, do a single batch iteration return self.process_images_cn_batch(p, *args, **kwargs) cn_is_batch, batches, output_dir, input_file_names = get_cn_batches(p) if not cn_is_batch: # we are not in batch mode, fallback to original function return getattr(processing, '__controlnet_original_process_images_inner')(p, *args, **kwargs) output_images = [] try: self.dispatch_callbacks(self.process_batch_callbacks, p, batches, output_dir) for batch_i in range(self.batch_size): processed = self.process_images_cn_batch(p, *args, **kwargs) if shared.opts.data.get('controlnet_show_batch_images_in_ui', False): output_images.extend(processed.images[processed.index_of_first_image:]) if output_dir: self.save_images(output_dir, input_file_names[batch_i], processed.images[processed.index_of_first_image:]) if shared.state.interrupted: break finally: self.dispatch_callbacks(self.postprocess_batch_callbacks, p) if output_images: processed.images = output_images else: processed = processing.Processed(p, [], p.seed) return processed def process_images_cn_batch(self, p, *args, **kwargs): self.dispatch_callbacks(self.process_batch_each_callbacks, p) old_detectmap_output = shared.opts.data.get('control_net_no_detectmap', False) try: shared.opts.data.update({'control_net_no_detectmap': True}) processed = getattr(processing, '__controlnet_original_process_images_inner')(p, *args, **kwargs) finally: shared.opts.data.update({'control_net_no_detectmap': old_detectmap_output}) self.dispatch_callbacks(self.postprocess_batch_each_callbacks, p, processed) # do not go past control net batch size if self.batch_index >= self.batch_size: shared.state.interrupted = True return processed def save_images(self, output_dir, init_image_path, output_images): os.makedirs(output_dir, exist_ok=True) for n, processed_image in enumerate(output_images): filename = os.path.basename(init_image_path) if n > 0: left, right = os.path.splitext(filename) filename = f"{left}-{n}{right}" if processed_image.mode == 'RGBA': processed_image = processed_image.convert("RGB") processed_image.save(os.path.join(output_dir, filename)) def do_hijack(self): script_callbacks.on_script_unloaded(self.undo_hijack) hijack_function( module=img2img, name='process_batch', new_name='__controlnet_original_process_batch', new_value=self.img2img_process_batch_hijack, ) hijack_function( module=processing, name='process_images_inner', new_name='__controlnet_original_process_images_inner', new_value=self.processing_process_images_hijack ) def undo_hijack(self): unhijack_function( module=img2img, name='process_batch', new_name='__controlnet_original_process_batch', ) unhijack_function( module=processing, name='process_images_inner', new_name='__controlnet_original_process_images_inner', ) def adjust_job_count(self, p): if shared.state.job_count == -1: shared.state.job_count = p.n_iter shared.state.job_count *= self.batch_size def on_process_batch(self, p, batches, output_dir, *args): print('controlnet batch mode') self.is_batch = True self.batch_index = 0 self.batch_size = len(batches) processing.fix_seed(p) if shared.opts.data.get('controlnet_increment_seed_during_batch', False): self.init_seed = p.seed self.init_subseed = p.subseed self.adjust_job_count(p) p.do_not_save_grid = True p.do_not_save_samples = bool(output_dir) def on_postprocess_batch_each(self, p, *args): self.batch_index += 1 if shared.opts.data.get('controlnet_increment_seed_during_batch', False): p.seed = p.seed + len(p.all_prompts) p.subseed = p.subseed + len(p.all_prompts) def on_postprocess_batch(self, p, *args): self.is_batch = False self.batch_index = 0 self.batch_size = 1 if shared.opts.data.get('controlnet_increment_seed_during_batch', False): p.seed = self.init_seed p.all_seeds = [self.init_seed] p.subseed = self.init_subseed p.all_subseeds = [self.init_subseed] def dispatch_callbacks(self, callbacks, *args): for callback in callbacks: callback(*args) def hijack_function(module, name, new_name, new_value): # restore original function in case of reload unhijack_function(module=module, name=name, new_name=new_name) setattr(module, new_name, getattr(module, name)) setattr(module, name, new_value) def unhijack_function(module, name, new_name): if hasattr(module, new_name): setattr(module, name, getattr(module, new_name)) delattr(module, new_name) def get_cn_batches(p: processing.StableDiffusionProcessing) -> Tuple[bool, List[List[str]], str, List[str]]: units = external_code.get_all_units_in_processing(p) units = [unit.copy() for unit in units if getattr(unit, 'enabled', False)] any_unit_is_batch = False output_dir = '' input_file_names = [] for unit in units: if getattr(unit, 'input_mode', InputMode.SIMPLE) == InputMode.BATCH: any_unit_is_batch = True output_dir = getattr(unit, 'output_dir', '') if isinstance(unit.batch_images, str): unit.batch_images = shared.listfiles(unit.batch_images) input_file_names = unit.batch_images if any_unit_is_batch: cn_batch_size = min(len(getattr(unit, 'batch_images', [])) for unit in units if getattr(unit, 'input_mode', InputMode.SIMPLE) == InputMode.BATCH) else: cn_batch_size = 1 batches = [[] for _ in range(cn_batch_size)] for i in range(cn_batch_size): for unit in units: input_mode = getattr(unit, 'input_mode', InputMode.SIMPLE) if input_mode == InputMode.BATCH: batches[i].append(unit.batch_images[i]) else: batches[i].append(unit.image) if any_unit_is_batch: logger.info(f"Batch enabled ({len(batches)})") return any_unit_is_batch, batches, output_dir, input_file_names instance = BatchHijack() ================================================ FILE: scripts/cldm.py ================================================ from typing import List import torch import torch.nn as nn from modules import devices from scripts.controlnet_core.controlnet_union import ControlAddEmbedding, ResBlockUnionControlnet try: from sgm.modules.diffusionmodules.openaimodel import conv_nd, linear, zero_module, timestep_embedding, \ TimestepEmbedSequential, ResBlock, Downsample, SpatialTransformer, exists using_sgm = True except ImportError: from ldm.modules.diffusionmodules.openaimodel import conv_nd, linear, zero_module, timestep_embedding, \ TimestepEmbedSequential, ResBlock, Downsample, SpatialTransformer, exists using_sgm = False class PlugableControlModel(nn.Module): def __init__(self, config, state_dict=None): super().__init__() self.config = config self.control_model = ControlNet(**self.config).cpu() if state_dict is not None: self.control_model.load_state_dict(state_dict, strict=False) self.gpu_component = None self.is_control_lora = False def reset(self): pass def forward(self, *args, **kwargs): return self.control_model(*args, **kwargs) def aggressive_lowvram(self): self.to('cpu') def send_me_to_gpu(module, _): if self.gpu_component == module: return if self.gpu_component is not None: self.gpu_component.to('cpu') module.to(devices.get_device_for("controlnet")) self.gpu_component = module self.control_model.time_embed.register_forward_pre_hook(send_me_to_gpu) self.control_model.input_hint_block.register_forward_pre_hook(send_me_to_gpu) self.control_model.label_emb.register_forward_pre_hook(send_me_to_gpu) for m in self.control_model.input_blocks: m.register_forward_pre_hook(send_me_to_gpu) for m in self.control_model.zero_convs: m.register_forward_pre_hook(send_me_to_gpu) self.control_model.middle_block.register_forward_pre_hook(send_me_to_gpu) self.control_model.middle_block_out.register_forward_pre_hook(send_me_to_gpu) return def fullvram(self): self.to(devices.get_device_for("controlnet")) return class ControlNet(nn.Module): def __init__( self, in_channels, model_channels, hint_channels, num_res_blocks, attention_resolutions, dropout=0, channel_mult=(1, 2, 4, 8), conv_resample=True, dims=2, num_classes=None, use_checkpoint=False, use_fp16=True, num_heads=-1, num_head_channels=-1, num_heads_upsample=-1, use_scale_shift_norm=False, resblock_updown=False, use_spatial_transformer=True, transformer_depth=1, context_dim=None, n_embed=None, legacy=False, disable_self_attentions=None, num_attention_blocks=None, disable_middle_self_attn=False, use_linear_in_transformer=False, adm_in_channels=None, transformer_depth_middle=None, union_controlnet_num_control_type=None, device=None, global_average_pooling=False, ): super().__init__() self.global_average_pooling = global_average_pooling if num_heads_upsample == -1: num_heads_upsample = num_heads self.dims = dims self.in_channels = in_channels self.model_channels = model_channels if isinstance(transformer_depth, int): transformer_depth = len(channel_mult) * [transformer_depth] if transformer_depth_middle is None: transformer_depth_middle = transformer_depth[-1] if isinstance(num_res_blocks, int): self.num_res_blocks = len(channel_mult) * [num_res_blocks] else: self.num_res_blocks = num_res_blocks self.attention_resolutions = attention_resolutions self.dropout = dropout self.channel_mult = channel_mult self.conv_resample = conv_resample self.num_classes = num_classes self.use_checkpoint = use_checkpoint self.dtype = torch.float16 if use_fp16 else torch.float32 self.num_heads = num_heads self.num_head_channels = num_head_channels self.num_heads_upsample = num_heads_upsample self.predict_codebook_ids = n_embed is not None time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( linear(model_channels, time_embed_dim, dtype=self.dtype, device=device), nn.SiLU(), linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device), ) if self.num_classes is not None: if isinstance(self.num_classes, int): self.label_emb = nn.Embedding(num_classes, time_embed_dim) elif self.num_classes == "continuous": print("setting up linear c_adm embedding layer") self.label_emb = nn.Linear(1, time_embed_dim) elif self.num_classes == "sequential": assert adm_in_channels is not None self.label_emb = nn.Sequential( nn.Sequential( linear(adm_in_channels, time_embed_dim, dtype=self.dtype, device=device), nn.SiLU(), linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device), ) ) else: raise ValueError() self.input_blocks = nn.ModuleList( [ TimestepEmbedSequential( conv_nd(dims, in_channels, model_channels, 3, padding=1, dtype=self.dtype, device=device) ) ] ) self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)]) self.input_hint_block = TimestepEmbedSequential( conv_nd(dims, hint_channels, 16, 3, padding=1), nn.SiLU(), conv_nd(dims, 16, 16, 3, padding=1), nn.SiLU(), conv_nd(dims, 16, 32, 3, padding=1, stride=2), nn.SiLU(), conv_nd(dims, 32, 32, 3, padding=1), nn.SiLU(), conv_nd(dims, 32, 96, 3, padding=1, stride=2), nn.SiLU(), conv_nd(dims, 96, 96, 3, padding=1), nn.SiLU(), conv_nd(dims, 96, 256, 3, padding=1, stride=2), nn.SiLU(), zero_module(conv_nd(dims, 256, model_channels, 3, padding=1)) ) self._feature_size = model_channels input_block_chans = [model_channels] ch = model_channels ds = 1 for level, mult in enumerate(channel_mult): for nr in range(self.num_res_blocks[level]): layers = [ ResBlock( ch, time_embed_dim, dropout, out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm ) ] ch = mult * model_channels if ds in attention_resolutions: if num_head_channels == -1: dim_head = ch // num_heads else: num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: #num_heads = 1 dim_head = ch // num_heads if use_spatial_transformer else num_head_channels if exists(disable_self_attentions): disabled_sa = disable_self_attentions[level] else: disabled_sa = False if not exists(num_attention_blocks) or nr < num_attention_blocks[level]: layers.append( SpatialTransformer( ch, num_heads, dim_head, depth=transformer_depth[level], context_dim=context_dim, disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint ) ) self.input_blocks.append(TimestepEmbedSequential(*layers)) self.zero_convs.append(self.make_zero_conv(ch)) self._feature_size += ch input_block_chans.append(ch) if level != len(channel_mult) - 1: out_ch = ch self.input_blocks.append( TimestepEmbedSequential( ResBlock( ch, time_embed_dim, dropout, out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, down=True ) if resblock_updown else Downsample( ch, conv_resample, dims=dims, out_channels=out_ch ) ) ) ch = out_ch input_block_chans.append(ch) self.zero_convs.append(self.make_zero_conv(ch)) ds *= 2 self._feature_size += ch if num_head_channels == -1: dim_head = ch // num_heads else: num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: #num_heads = 1 dim_head = ch // num_heads if use_spatial_transformer else num_head_channels self.middle_block = TimestepEmbedSequential( ResBlock( ch, time_embed_dim, dropout, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm ), SpatialTransformer( # always uses a self-attn ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim, disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer, use_checkpoint=use_checkpoint ), ResBlock( ch, time_embed_dim, dropout, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm ), ) self.middle_block_out = self.make_zero_conv(ch) self._feature_size += ch if union_controlnet_num_control_type is not None: self.num_control_type = union_controlnet_num_control_type num_trans_channel = 320 num_trans_head = 8 num_trans_layer = 1 num_proj_channel = 320 self.task_embedding = nn.Parameter(torch.empty( self.num_control_type, num_trans_channel, dtype=self.dtype, device=device )) self.transformer_layes = nn.Sequential(*[ ResBlockUnionControlnet( num_trans_channel, num_trans_head, dtype=self.dtype, device=device ) for _ in range(num_trans_layer) ]) self.spatial_ch_projs = nn.Linear( num_trans_channel, num_proj_channel, dtype=self.dtype, device=device ) control_add_embed_dim = 256 self.control_add_embedding = ControlAddEmbedding( control_add_embed_dim, time_embed_dim, self.num_control_type, dtype=self.dtype, device=device ) else: self.task_embedding = None self.control_add_embedding = None def union_controlnet_merge( self, hint: torch.Tensor, control_type: List[int], emb: torch.Tensor, context: torch.Tensor ): """ Note: control_type is a list of enum values. The length of the list is the number of control images.""" # Equivalent to: https://github.com/xinsir6/ControlNetPlus/tree/main inputs = [] condition_list = [] for idx in range(min(1, len(control_type))): controlnet_cond = self.input_hint_block(hint[idx], emb, context) feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) if idx < len(control_type): feat_seq += self.task_embedding[control_type[idx]] inputs.append(feat_seq.unsqueeze(1)) condition_list.append(controlnet_cond) x = torch.cat(inputs, dim=1) x = self.transformer_layes(x) controlnet_cond_fuser = None for idx in range(len(control_type)): alpha = self.spatial_ch_projs(x[:, idx]) alpha = alpha.unsqueeze(-1).unsqueeze(-1) o = condition_list[idx] + alpha if controlnet_cond_fuser is None: controlnet_cond_fuser = o else: controlnet_cond_fuser += o return controlnet_cond_fuser def make_zero_conv(self, channels): return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0))) def forward(self, x, hint, timesteps, context, y=None, control_type: List[int] = None, **kwargs): original_type = x.dtype x = x.to(self.dtype) hint = hint.to(self.dtype) timesteps = timesteps.to(self.dtype) context = context.to(self.dtype) if y is not None: y = y.to(self.dtype) t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(self.dtype) emb = self.time_embed(t_emb) guided_hint = None if self.control_add_embedding is not None: assert control_type is not None emb += self.control_add_embedding(control_type, emb.dtype, emb.device) if len(control_type) > 0: if len(hint.shape) < 5: hint = hint.unsqueeze(dim=0) guided_hint = self.union_controlnet_merge(hint, control_type, emb, context) if guided_hint is None: guided_hint = self.input_hint_block(hint, emb, context) outs = [] if self.num_classes is not None: assert y.shape[0] == x.shape[0] emb = emb + self.label_emb(y) h = x for module, zero_conv in zip(self.input_blocks, self.zero_convs): if guided_hint is not None: h = module(h, emb, context) h += guided_hint guided_hint = None else: h = module(h, emb, context) outs.append(zero_conv(h, emb, context)) h = self.middle_block(h, emb, context) outs.append(self.middle_block_out(h, emb, context)) outs = [o.to(original_type) for o in outs] return outs ================================================ FILE: scripts/controlnet.py ================================================ import gc import tracemalloc import os import logging from collections import OrderedDict from copy import copy, deepcopy from typing import Dict, Optional, Tuple, List import modules.scripts as scripts from modules import shared, devices, script_callbacks, processing, masking, images import gradio as gr import time from einops import rearrange # Register all preprocessors. import scripts.preprocessor as preprocessor_init # noqa from annotator.util import HWC3 from internal_controlnet.external_code import ControlNetUnit from scripts import global_state, hook, external_code, batch_hijack, controlnet_version, utils from scripts.controlnet_lora import bind_control_lora, unbind_control_lora from scripts.controlnet_lllite import clear_all_lllite from scripts.ipadapter.plugable_ipadapter import ImageEmbed, clear_all_ip_adapter from scripts.ipadapter.pulid_attn import PULID_SETTING_FIDELITY, PULID_SETTING_STYLE from scripts.utils import load_state_dict, get_unique_axis0, align_dim_latent from scripts.hook import ControlParams, UnetHook, HackedImageRNG from scripts.enums import ( ControlModelType, InputMode, StableDiffusionVersion, HiResFixOption, PuLIDMode, ControlMode, BatchOption, ResizeMode, ) from scripts.controlnet_ui.controlnet_ui_group import ControlNetUiGroup from scripts.controlnet_ui.photopea import Photopea from scripts.logging import logger from scripts.supported_preprocessor import Preprocessor from scripts.animate_diff.batch import add_animate_diff_batch_input from modules.processing import StableDiffusionProcessingImg2Img, StableDiffusionProcessingTxt2Img, StableDiffusionProcessing from modules.images import save_image from scripts.infotext import Infotext import cv2 import numpy as np import torch from PIL import Image, ImageFilter, ImageOps from scripts.lvminthin import lvmin_thin, nake_nms from scripts.controlnet_model_guess import build_model_by_guess, ControlModel from scripts.hook import torch_dfs # Gradio 3.32 bug fix import tempfile gradio_tempfile_path = os.path.join(tempfile.gettempdir(), 'gradio') os.makedirs(gradio_tempfile_path, exist_ok=True) def clear_all_secondary_control_models(m): all_modules = torch_dfs(m) for module in all_modules: _original_inner_forward_cn_hijack = getattr(module, '_original_inner_forward_cn_hijack', None) original_forward_cn_hijack = getattr(module, 'original_forward_cn_hijack', None) if _original_inner_forward_cn_hijack is not None: module._forward = _original_inner_forward_cn_hijack if original_forward_cn_hijack is not None: module.forward = original_forward_cn_hijack clear_all_lllite() clear_all_ip_adapter() def find_closest_lora_model_name(search: str): if not search: return None if search in global_state.cn_models: return search search = search.lower() if search in global_state.cn_models_names: return global_state.cn_models_names.get(search) applicable = [name for name in global_state.cn_models_names.keys() if search in name.lower()] if not applicable: return None applicable = sorted(applicable, key=lambda name: len(name)) return global_state.cn_models_names[applicable[0]] def swap_img2img_pipeline(p: processing.StableDiffusionProcessingImg2Img): p.__class__ = processing.StableDiffusionProcessingTxt2Img dummy = processing.StableDiffusionProcessingTxt2Img() for k,v in dummy.__dict__.items(): if hasattr(p, k): continue setattr(p, k, v) global_state.update_cn_models() logger.info(f"ControlNet {controlnet_version.version_flag}") def prepare_mask( mask: Image.Image, p: processing.StableDiffusionProcessing ) -> Image.Image: """ Prepare an image mask for the inpainting process. This function takes as input a PIL Image object and an instance of the StableDiffusionProcessing class, and performs the following steps to prepare the mask: 1. Convert the mask to grayscale (mode "L"). 2. If the 'inpainting_mask_invert' attribute of the processing instance is True, invert the mask colors. 3. If the 'mask_blur' attribute of the processing instance is greater than 0, apply a Gaussian blur to the mask with a radius equal to 'mask_blur'. Args: mask (Image.Image): The input mask as a PIL Image object. p (processing.StableDiffusionProcessing): An instance of the StableDiffusionProcessing class containing the processing parameters. Returns: mask (Image.Image): The prepared mask as a PIL Image object. """ mask = mask.convert("L") if getattr(p, "inpainting_mask_invert", False): mask = ImageOps.invert(mask) if hasattr(p, 'mask_blur_x'): if getattr(p, "mask_blur_x", 0) > 0: np_mask = np.array(mask) kernel_size = 2 * int(2.5 * p.mask_blur_x + 0.5) + 1 np_mask = cv2.GaussianBlur(np_mask, (kernel_size, 1), p.mask_blur_x) mask = Image.fromarray(np_mask) if getattr(p, "mask_blur_y", 0) > 0: np_mask = np.array(mask) kernel_size = 2 * int(2.5 * p.mask_blur_y + 0.5) + 1 np_mask = cv2.GaussianBlur(np_mask, (1, kernel_size), p.mask_blur_y) mask = Image.fromarray(np_mask) else: if getattr(p, "mask_blur", 0) > 0: mask = mask.filter(ImageFilter.GaussianBlur(p.mask_blur)) return mask def set_numpy_seed(p: processing.StableDiffusionProcessing) -> Optional[int]: """ Set the random seed for NumPy based on the provided parameters. Args: p (processing.StableDiffusionProcessing): The instance of the StableDiffusionProcessing class. Returns: Optional[int]: The computed random seed if successful, or None if an exception occurs. This function sets the random seed for NumPy using the seed and subseed values from the given instance of StableDiffusionProcessing. If either seed or subseed is -1, it uses the first value from `all_seeds`. Otherwise, it takes the maximum of the provided seed value and 0. The final random seed is computed by adding the seed and subseed values, applying a bitwise AND operation with 0xFFFFFFFF to ensure it fits within a 32-bit integer. """ try: tmp_seed = int(p.all_seeds[0] if p.seed == -1 else max(int(p.seed), 0)) tmp_subseed = int(p.all_seeds[0] if p.subseed == -1 else max(int(p.subseed), 0)) seed = (tmp_seed + tmp_subseed) & 0xFFFFFFFF np.random.seed(seed) return seed except Exception as e: logger.warning(e) logger.warning('Warning: Failed to use consistent random seed.') return None def get_pytorch_control(x: np.ndarray) -> torch.Tensor: # A very safe method to make sure that Apple/Mac works y = x # below is very boring but do not change these. If you change these Apple or Mac may fail. y = torch.from_numpy(y) y = y.float() / 255.0 y = rearrange(y, 'h w c -> 1 c h w') y = y.clone() y = y.to(devices.get_device_for("controlnet")) y = y.clone() return y def get_control( p: StableDiffusionProcessing, unit: ControlNetUnit, idx: int, control_model_type: ControlModelType, preprocessor: Preprocessor, ): """Get input for a ControlNet unit.""" if unit.is_animate_diff_batch: unit = add_animate_diff_batch_input(p, unit) high_res_fix = isinstance(p, StableDiffusionProcessingTxt2Img) and getattr(p, 'enable_hr', False) h, w, hr_y, hr_x = Script.get_target_dimensions(p) input_image, resize_mode = Script.choose_input_image(p, unit, idx) if isinstance(input_image, list): assert unit.accepts_multiple_inputs or unit.is_animate_diff_batch input_images = input_image else: # Following operations are only for single input image. input_image = Script.try_crop_image_with_a1111_mask(p, unit, input_image, resize_mode) input_image = np.ascontiguousarray(input_image.copy()).copy() # safe numpy if unit.module == 'inpaint_only+lama' and resize_mode == ResizeMode.OUTER_FIT: # inpaint_only+lama is special and required outpaint fix _, input_image = Script.detectmap_proc(input_image, unit.module, resize_mode, hr_y, hr_x) input_images = [input_image] if unit.pixel_perfect: unit.processor_res = external_code.pixel_perfect_resolution( input_images[0], target_H=h, target_W=w, resize_mode=resize_mode, ) # Preprocessor result may depend on numpy random operations, use the # random seed in `StableDiffusionProcessing` to make the # preprocessor result reproducable. # Currently following preprocessors use numpy random: # - shuffle seed = set_numpy_seed(p) logger.debug(f"Use numpy seed {seed}.") logger.info(f"Using preprocessor: {unit.module}") logger.info(f'preprocessor resolution = {unit.processor_res}') detected_maps = [] def store_detected_map(detected_map, module: str) -> None: if unit.save_detected_map: detected_maps.append((detected_map, module)) def preprocess_input_image(input_image: np.ndarray): """ Preprocess single input image. """ result = preprocessor.cached_call( input_image, resolution=unit.processor_res, slider_1=unit.threshold_a, slider_2=unit.threshold_b, low_vram=( ("clip" in unit.module or unit.module == "ip-adapter_face_id_plus") and shared.opts.data.get("controlnet_clip_detector_on_cpu", False) ), model=unit.model, ) detected_map = result.value is_image = preprocessor.returns_image # TODO: Refactor img control detection logic. if high_res_fix: if is_image: hr_control, hr_detected_map = Script.detectmap_proc(detected_map, unit.module, resize_mode, hr_y, hr_x) store_detected_map(hr_detected_map, unit.module) else: hr_control = detected_map else: hr_control = None if is_image: control, detected_map = Script.detectmap_proc(detected_map, unit.module, resize_mode, h, w) store_detected_map(detected_map, unit.module) else: control = detected_map for image in result.display_images: store_detected_map(image, unit.module) if control_model_type == ControlModelType.T2I_StyleAdapter: control = control['last_hidden_state'] if control_model_type == ControlModelType.ReVision: control = control['image_embeds'] if is_image and unit.is_animate_diff_batch: # AnimateDiff save VRAM control = control.cpu() if hr_control is not None: hr_control = hr_control.cpu() return control, hr_control def optional_tqdm(iterable, use_tqdm=unit.is_animate_diff_batch): from tqdm import tqdm return tqdm(iterable) if use_tqdm else iterable controls, hr_controls = list(zip(*[preprocess_input_image(img) for img in optional_tqdm(input_images)])) assert len(controls) == len(hr_controls) return controls, hr_controls, detected_maps class Script(scripts.Script, metaclass=( utils.TimeMeta if logger.level == logging.DEBUG else type)): model_cache: Dict[str, ControlModel] = OrderedDict() def __init__(self) -> None: super().__init__() self.latest_network = None self.input_image = None self.latest_model_hash = "" self.enabled_units: List[ControlNetUnit] = [] self.detected_map = [] self.post_processors = [] self.noise_modifier = None self.ui_batch_option_state = [BatchOption.DEFAULT.value, False] # `Script` instance is created twice, once for img2img and once for txt2img. # However, A1111 does not pass is_img2img to script constructor, so this field # is initialized in `ui` method. self.is_img2img = False batch_hijack.instance.process_batch_callbacks.append(self.batch_tab_process) batch_hijack.instance.process_batch_each_callbacks.append(self.batch_tab_process_each) batch_hijack.instance.postprocess_batch_each_callbacks.insert(0, self.batch_tab_postprocess_each) batch_hijack.instance.postprocess_batch_callbacks.insert(0, self.batch_tab_postprocess) def title(self): return "ControlNet" def show(self, is_img2img): return scripts.AlwaysVisible def uigroup(self, tabname: str, is_img2img: bool, elem_id_tabname: str, photopea: Optional[Photopea]) -> Tuple[ControlNetUiGroup, gr.State]: group = ControlNetUiGroup(is_img2img, photopea) return group, group.render(tabname, elem_id_tabname) def ui_batch_options(self, is_img2img: bool, elem_id_tabname: str): batch_option = gr.Radio( choices=[e.value for e in BatchOption], value=BatchOption.DEFAULT.value, label="Batch Option", elem_id=f"{elem_id_tabname}_controlnet_batch_option_radio", elem_classes="controlnet_batch_option_radio", ) use_batch_style_align = gr.Checkbox( label='[StyleAlign] Align image style in the batch.' ) unit_args = [batch_option, use_batch_style_align] def update_ui_batch_options(*args): self.ui_batch_option_state = args return for comp in unit_args: event_subscribers = [] if hasattr(comp, "edit"): event_subscribers.append(comp.edit) elif hasattr(comp, "click"): event_subscribers.append(comp.click) elif isinstance(comp, gr.Slider) and hasattr(comp, "release"): event_subscribers.append(comp.release) elif hasattr(comp, "change"): event_subscribers.append(comp.change) if hasattr(comp, "clear"): event_subscribers.append(comp.clear) for event_subscriber in event_subscribers: event_subscriber( fn=update_ui_batch_options, inputs=unit_args ) return def ui(self, is_img2img): """this function should create gradio UI elements. See https://gradio.app/docs/#components The return value should be an array of all components that are used in processing. Values of those returned components will be passed to run() and process() functions. """ self.is_img2img = is_img2img infotext = Infotext() ui_groups = [] controls = [] max_models = shared.opts.data.get("control_net_unit_count", 3) elem_id_tabname = ("img2img" if is_img2img else "txt2img") + "_controlnet" with gr.Group(elem_id=elem_id_tabname): with gr.Accordion(f"ControlNet {controlnet_version.version_flag}", open = False, elem_id="controlnet"): photopea = Photopea() if not shared.opts.data.get("controlnet_disable_photopea_edit", False) else None if max_models > 1: with gr.Tabs(elem_id=f"{elem_id_tabname}_tabs"): for i in range(max_models): with gr.Tab(f"ControlNet Unit {i}", elem_classes=['cnet-unit-tab']): group, state = self.uigroup(f"ControlNet-{i}", is_img2img, elem_id_tabname, photopea) ui_groups.append(group) controls.append(state) else: with gr.Column(): group, state = self.uigroup("ControlNet", is_img2img, elem_id_tabname, photopea) ui_groups.append(group) controls.append(state) with gr.Accordion("Batch Options", open=False, elem_id="controlnet_batch_options"): self.ui_batch_options(is_img2img, elem_id_tabname) for i, ui_group in enumerate(ui_groups): infotext.register_unit(i, ui_group) if shared.opts.data.get("control_net_sync_field_args", True): self.infotext_fields = infotext.infotext_fields self.paste_field_names = infotext.paste_field_names return tuple(controls) @staticmethod def clear_control_model_cache(): Script.model_cache.clear() gc.collect() devices.torch_gc() @staticmethod def load_control_model(p, unet, model) -> ControlModel: if model in Script.model_cache: logger.info(f"Loading model from cache: {model}") control_model = Script.model_cache[model] if control_model.type == ControlModelType.Controlllite: # Falls through to load Controlllite model fresh. # TODO Fix context sharing issue for Controlllite. pass elif not control_model.type.allow_context_sharing: # Creates a shallow-copy of control_model so that configs/inputs # from different units can be bind correctly. While heavy objects # of the underlying nn.Module is not copied. return ControlModel(copy(control_model.model), control_model.type) else: return control_model # Remove model from cache to clear space before building another model if len(Script.model_cache) > 0 and len(Script.model_cache) >= shared.opts.data.get("control_net_model_cache_size", 2): Script.model_cache.popitem(last=False) gc.collect() devices.torch_gc() control_model = Script.build_control_model(p, unet, model) if shared.opts.data.get("control_net_model_cache_size", 2) > 0: Script.model_cache[model] = control_model return control_model @staticmethod def build_control_model(p, unet, model) -> ControlModel: if model is None or model == 'None': raise RuntimeError("You have not selected any ControlNet Model.") model_path = global_state.cn_models.get(model, None) if model_path is None: model = find_closest_lora_model_name(model) model_path = global_state.cn_models.get(model, None) if model_path is None: raise RuntimeError(f"model not found: {model}") # trim '"' at start/end if model_path.startswith("\"") and model_path.endswith("\""): model_path = model_path[1:-1] if not os.path.exists(model_path): raise ValueError(f"file not found: {model_path}") logger.info(f"Loading model: {model}") state_dict = load_state_dict(model_path) control_model = build_model_by_guess(state_dict, unet, model_path) control_model.model.to('cpu', dtype=p.sd_model.dtype) logger.info(f"ControlNet model {model}({control_model.type}) loaded.") return control_model @staticmethod def get_remote_call(p, attribute, default=None, idx=0, strict=False, force=False): if not force and not shared.opts.data.get("control_net_allow_script_control", False): return default def get_element(obj, strict=False): if not isinstance(obj, list): return obj if not strict or idx == 0 else None elif idx < len(obj): return obj[idx] else: return None attribute_value = get_element(getattr(p, attribute, None), strict) return attribute_value if attribute_value is not None else default @staticmethod def parse_remote_call(p, unit: ControlNetUnit, idx): selector = Script.get_remote_call unit.enabled = selector(p, "control_net_enabled", unit.enabled, idx, strict=True) unit.module = selector(p, "control_net_module", unit.module, idx) unit.model = selector(p, "control_net_model", unit.model, idx) unit.weight = selector(p, "control_net_weight", unit.weight, idx) unit.image = selector(p, "control_net_image", unit.image, idx) unit.resize_mode = selector(p, "control_net_resize_mode", unit.resize_mode, idx) unit.low_vram = selector(p, "control_net_lowvram", unit.low_vram, idx) unit.processor_res = selector(p, "control_net_pres", unit.processor_res, idx) unit.threshold_a = selector(p, "control_net_pthr_a", unit.threshold_a, idx) unit.threshold_b = selector(p, "control_net_pthr_b", unit.threshold_b, idx) unit.guidance_start = selector(p, "control_net_guidance_start", unit.guidance_start, idx) unit.guidance_end = selector(p, "control_net_guidance_end", unit.guidance_end, idx) # Backward compatibility. See https://github.com/Mikubill/sd-webui-controlnet/issues/1740 # for more details. unit.guidance_end = selector(p, "control_net_guidance_strength", unit.guidance_end, idx) unit.control_mode = selector(p, "control_net_control_mode", unit.control_mode, idx) unit.pixel_perfect = selector(p, "control_net_pixel_perfect", unit.pixel_perfect, idx) return unit @staticmethod def detectmap_proc(detected_map, module, resize_mode, h, w): if 'inpaint' in module: detected_map = detected_map.astype(np.float32) else: detected_map = HWC3(detected_map) def safe_numpy(x): # A very safe method to make sure that Apple/Mac works y = x # below is very boring but do not change these. If you change these Apple or Mac may fail. y = y.copy() y = np.ascontiguousarray(y) y = y.copy() return y def high_quality_resize(x, size): # Written by lvmin # Super high-quality control map up-scaling, considering binary, seg, and one-pixel edges inpaint_mask = None if x.ndim == 3 and x.shape[2] == 4: inpaint_mask = x[:, :, 3] x = x[:, :, 0:3] if x.shape[0] != size[1] or x.shape[1] != size[0]: new_size_is_smaller = (size[0] * size[1]) < (x.shape[0] * x.shape[1]) new_size_is_bigger = (size[0] * size[1]) > (x.shape[0] * x.shape[1]) unique_color_count = len(get_unique_axis0(x.reshape(-1, x.shape[2]))) is_one_pixel_edge = False is_binary = False if unique_color_count == 2: is_binary = np.min(x) < 16 and np.max(x) > 240 if is_binary: xc = x xc = cv2.erode(xc, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1) xc = cv2.dilate(xc, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1) one_pixel_edge_count = np.where(xc < x)[0].shape[0] all_edge_count = np.where(x > 127)[0].shape[0] is_one_pixel_edge = one_pixel_edge_count * 2 > all_edge_count if 2 < unique_color_count < 200: interpolation = cv2.INTER_NEAREST elif new_size_is_smaller: interpolation = cv2.INTER_AREA else: interpolation = cv2.INTER_CUBIC # Must be CUBIC because we now use nms. NEVER CHANGE THIS y = cv2.resize(x, size, interpolation=interpolation) if inpaint_mask is not None: inpaint_mask = cv2.resize(inpaint_mask, size, interpolation=interpolation) if is_binary: y = np.mean(y.astype(np.float32), axis=2).clip(0, 255).astype(np.uint8) if is_one_pixel_edge: y = nake_nms(y) _, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) y = lvmin_thin(y, prunings=new_size_is_bigger) else: _, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) y = np.stack([y] * 3, axis=2) else: y = x if inpaint_mask is not None: inpaint_mask = (inpaint_mask > 127).astype(np.float32) * 255.0 inpaint_mask = inpaint_mask[:, :, None].clip(0, 255).astype(np.uint8) y = np.concatenate([y, inpaint_mask], axis=2) return y if resize_mode == ResizeMode.RESIZE: detected_map = high_quality_resize(detected_map, (w, h)) detected_map = safe_numpy(detected_map) return get_pytorch_control(detected_map), detected_map old_h, old_w, _ = detected_map.shape old_w = float(old_w) old_h = float(old_h) k0 = float(h) / old_h k1 = float(w) / old_w safeint = lambda x: int(np.round(x)) if resize_mode == ResizeMode.OUTER_FIT: k = min(k0, k1) borders = np.concatenate([detected_map[0, :, :], detected_map[-1, :, :], detected_map[:, 0, :], detected_map[:, -1, :]], axis=0) high_quality_border_color = np.median(borders, axis=0).astype(detected_map.dtype) if len(high_quality_border_color) == 4: # Inpaint hijack high_quality_border_color[3] = 255 high_quality_background = np.tile(high_quality_border_color[None, None], [h, w, 1]) detected_map = high_quality_resize(detected_map, (safeint(old_w * k), safeint(old_h * k))) new_h, new_w, _ = detected_map.shape pad_h = max(0, (h - new_h) // 2) pad_w = max(0, (w - new_w) // 2) high_quality_background[pad_h:pad_h + new_h, pad_w:pad_w + new_w] = detected_map detected_map = high_quality_background detected_map = safe_numpy(detected_map) return get_pytorch_control(detected_map), detected_map else: k = max(k0, k1) detected_map = high_quality_resize(detected_map, (safeint(old_w * k), safeint(old_h * k))) new_h, new_w, _ = detected_map.shape pad_h = max(0, (new_h - h) // 2) pad_w = max(0, (new_w - w) // 2) detected_map = detected_map[pad_h:pad_h+h, pad_w:pad_w+w] detected_map = safe_numpy(detected_map) return get_pytorch_control(detected_map), detected_map @staticmethod def get_enabled_units(p): def unfold_merged(unit: ControlNetUnit) -> List[ControlNetUnit]: """Unfolds a merged unit to multiple units. Keeps the unit merged for preprocessors that can accept multiple input images. """ if unit.input_mode != InputMode.MERGE: return [unit] if unit.accepts_multiple_inputs: unit.input_mode = InputMode.SIMPLE return [unit] assert isinstance(unit.image, list) result = [] for image in unit.image: u = unit.copy() u.image = [image] u.input_mode = InputMode.SIMPLE u.weight = unit.weight / len(unit.image) result.append(u) return result units = external_code.get_all_units_in_processing(p) if len(units) == 0: # fill a null group remote_unit = Script.parse_remote_call(p, ControlNetUnit(), 0) if remote_unit.enabled: units.append(remote_unit) enabled_units = [] for idx, unit in enumerate(units): local_unit = Script.parse_remote_call(p, unit, idx) if not local_unit.enabled: continue enabled_units.extend(unfold_merged(local_unit)) Infotext.write_infotext(enabled_units, p) return enabled_units @staticmethod def choose_input_image( p: processing.StableDiffusionProcessing, unit: ControlNetUnit, idx: int ) -> Tuple[np.ndarray, ResizeMode]: """ Choose input image from following sources with descending priority: - p.image_control: [Deprecated] Lagacy way to pass image to controlnet. - p.control_net_input_image: [Deprecated] Lagacy way to pass image to controlnet. - unit.image: ControlNet unit input image. - p.init_images: A1111 img2img input image. Returns: - The input image in ndarray form. - The resize mode. """ def from_rgba_to_input(img: np.ndarray) -> np.ndarray: if ( shared.opts.data.get("controlnet_ignore_noninpaint_mask", False) or (img[:, :, 3] <= 5).all() or (img[:, :, 3] >= 250).all() ): # Take RGB return img[:, :, :3] logger.info("Canvas scribble mode. Using mask scribble as input.") return HWC3(img[:, :, 3]) # 4 input image sources. p_image_control = getattr(p, "image_control", None) p_input_image = Script.get_remote_call(p, "control_net_input_image", None, idx) image = unit.get_input_images_rgba() a1111_image = getattr(p, "init_images", [None])[0] resize_mode = unit.resize_mode if batch_hijack.instance.is_batch and p_image_control is not None: logger.warning("Warn: Using legacy field 'p.image_control'.") input_image = HWC3(np.asarray(p_image_control)) elif p_input_image is not None: logger.warning("Warn: Using legacy field 'p.controlnet_input_image'") if isinstance(p_input_image, dict) and "mask" in p_input_image and "image" in p_input_image: color = HWC3(np.asarray(p_input_image['image'])) alpha = np.asarray(p_input_image['mask'])[..., None] input_image = np.concatenate([color, alpha], axis=2) else: input_image = HWC3(np.asarray(p_input_image)) elif image is not None: assert isinstance(image, list) # Inpaint mask or CLIP mask. if unit.is_inpaint or unit.uses_clip: # RGBA input_image = image else: # RGB input_image = [from_rgba_to_input(img) for img in image] if len(input_image) == 1: input_image = input_image[0] elif a1111_image is not None: input_image = HWC3(np.asarray(a1111_image)) a1111_i2i_resize_mode = getattr(p, "resize_mode", None) assert a1111_i2i_resize_mode is not None resize_mode = external_code.resize_mode_from_value(a1111_i2i_resize_mode) a1111_mask_image : Optional[Image.Image] = getattr(p, "image_mask", None) if unit.is_inpaint: if a1111_mask_image is not None: a1111_mask = np.array(prepare_mask(a1111_mask_image, p)) assert a1111_mask.ndim == 2 assert a1111_mask.shape[0] == input_image.shape[0] assert a1111_mask.shape[1] == input_image.shape[1] input_image = np.concatenate([input_image[:, :, 0:3], a1111_mask[:, :, None]], axis=2) else: input_image = np.concatenate([ input_image[:, :, 0:3], np.zeros_like(input_image, dtype=np.uint8)[:, :, 0:1], ], axis=2) else: # No input image detected. if batch_hijack.instance.is_batch: shared.state.interrupted = True raise ValueError("controlnet is enabled but no input image is given") assert isinstance(input_image, (np.ndarray, list)) return input_image, resize_mode @staticmethod def try_crop_image_with_a1111_mask( p: StableDiffusionProcessing, unit: ControlNetUnit, input_image: np.ndarray, resize_mode: ResizeMode, ) -> np.ndarray: """ Crop ControlNet input image based on A1111 inpaint mask given. This logic is crutial in upscale scripts, as they use A1111 mask + inpaint_full_res to crop tiles. """ # Note: The method determining whether the active script is an upscale script is purely # based on `extra_generation_params` these scripts attach on `p`, and subject to change # in the future. # TODO: Change this to a more robust condition once A1111 offers a way to verify script name. is_upscale_script = any("upscale" in k.lower() and "Hires" not in k for k in getattr(p, "extra_generation_params", {}).keys()) logger.debug(f"is_upscale_script={is_upscale_script}") # Note: `inpaint_full_res` is "inpaint area" on UI. The flag is `True` when "Only masked" # option is selected. a1111_mask_image : Optional[Image.Image] = getattr(p, "image_mask", None) is_only_masked_inpaint = ( issubclass(type(p), StableDiffusionProcessingImg2Img) and p.inpaint_full_res and a1111_mask_image is not None ) if ( 'reference' not in unit.module and is_only_masked_inpaint and (is_upscale_script or unit.inpaint_crop_input_image) ): logger.debug("Crop input image based on A1111 mask.") input_image = [input_image[:, :, i] for i in range(input_image.shape[2])] input_image = [Image.fromarray(x) for x in input_image] mask = prepare_mask(a1111_mask_image, p) crop_region = masking.get_crop_region(np.array(mask), p.inpaint_full_res_padding) crop_region = masking.expand_crop_region(crop_region, p.width, p.height, mask.width, mask.height) input_image = [ images.resize_image(resize_mode.int_value(), i, mask.width, mask.height) for i in input_image ] input_image = [x.crop(crop_region) for x in input_image] input_image = [ images.resize_image(ResizeMode.OUTER_FIT.int_value(), x, p.width, p.height) for x in input_image ] input_image = [np.asarray(x)[:, :, 0] for x in input_image] input_image = np.stack(input_image, axis=2) return input_image @staticmethod def check_sd_version_compatible(unit: ControlNetUnit) -> None: """ Checks whether the given ControlNet unit has model compatible with the currently active sd model. An exception is thrown if ControlNet unit is detected to be incompatible. """ sd_version = global_state.get_sd_version() assert sd_version != StableDiffusionVersion.UNKNOWN if "revision" in unit.module.lower() and sd_version != StableDiffusionVersion.SDXL: raise Exception(f"Preprocessor 'revision' only supports SDXL. Current SD base model is {sd_version}.") # No need to check if the ControlModelType does not require model to be present. if unit.model is None or unit.model.lower() == "none": return cnet_sd_version = StableDiffusionVersion.detect_from_model_name(unit.model) if cnet_sd_version == StableDiffusionVersion.UNKNOWN: logger.warn(f"Unable to determine version for ControlNet model '{unit.model}'.") return if not sd_version.is_compatible_with(cnet_sd_version): raise Exception(f"ControlNet model {unit.model}({cnet_sd_version}) is not compatible with sd model({sd_version})") @staticmethod def get_target_dimensions(p: StableDiffusionProcessing) -> Tuple[int, int, int, int]: """Returns (h, w, hr_h, hr_w).""" h = align_dim_latent(p.height) w = align_dim_latent(p.width) high_res_fix = ( isinstance(p, StableDiffusionProcessingTxt2Img) and getattr(p, 'enable_hr', False) ) if high_res_fix: if p.hr_resize_x == 0 and p.hr_resize_y == 0: hr_y = int(p.height * p.hr_scale) hr_x = int(p.width * p.hr_scale) else: hr_y, hr_x = p.hr_resize_y, p.hr_resize_x hr_y = align_dim_latent(hr_y) hr_x = align_dim_latent(hr_x) else: hr_y = h hr_x = w return h, w, hr_y, hr_x def controlnet_main_entry(self, p): sd_ldm = p.sd_model unet = sd_ldm.model.diffusion_model self.noise_modifier = None setattr(p, 'controlnet_control_loras', []) if self.latest_network is not None: # always restore (~0.05s) self.latest_network.restore() # always clear (~0.05s) clear_all_secondary_control_models(unet) if not batch_hijack.instance.is_batch: self.enabled_units = Script.get_enabled_units(p) batch_option_uint_separate = self.ui_batch_option_state[0] == BatchOption.SEPARATE.value batch_option_style_align = self.ui_batch_option_state[1] if len(self.enabled_units) == 0 and not batch_option_style_align: self.latest_network = None return logger.info(f"unit_separate = {batch_option_uint_separate}, style_align = {batch_option_style_align}") detected_maps = [] forward_params: List[ControlParams] = [] post_processors = [] # cache stuff if self.latest_model_hash != p.sd_model.sd_model_hash: Script.clear_control_model_cache() self.latest_model_hash = p.sd_model.sd_model_hash # Unload unused preprocessors Preprocessor.unload_unused(active_processors={ p for unit in self.enabled_units for p in unit.get_actual_preprocessors() }) high_res_fix = isinstance(p, StableDiffusionProcessingTxt2Img) and getattr(p, 'enable_hr', False) for idx, unit in enumerate(self.enabled_units): Script.check_sd_version_compatible(unit) if ( 'inpaint_only' == unit.module and issubclass(type(p), StableDiffusionProcessingImg2Img) and p.image_mask is not None ): logger.warning('A1111 inpaint and ControlNet inpaint duplicated. Falls back to inpaint_global_harmonious.') unit.module = 'inpaint' preprocessor = Preprocessor.get_preprocessor(unit.module) assert preprocessor is not None if preprocessor.do_not_need_model: model_net = None if 'reference' in unit.module: control_model_type = ControlModelType.AttentionInjection elif 'revision' in unit.module: control_model_type = ControlModelType.ReVision else: raise Exception("Unable to determine control_model_type.") else: model_net, control_model_type = Script.load_control_model(p, unet, unit.model) model_net.reset() if model_net is not None and getattr(devices, "fp8", False) and control_model_type == ControlModelType.ControlNet: for _module in model_net.modules(): # FIXME: let's only apply fp8 to ControlNet for now if isinstance(_module, (torch.nn.Conv2d, torch.nn.Linear)): _module.to(torch.float8_e4m3fn) if control_model_type == ControlModelType.ControlLoRA: control_lora = model_net.control_model bind_control_lora(unet, control_lora) p.controlnet_control_loras.append(control_lora) if unit.effective_region_mask is not None: assert control_model_type.supports_effective_region_mask, f"`effective_region_mask` not supported for {control_model_type}" if unit.ipadapter_input is not None: # Use ipadapter_input from API call. assert control_model_type == ControlModelType.IPAdapter controls = unit.ipadapter_input hr_controls = unit.ipadapter_input else: controls, hr_controls, additional_maps = get_control( p, unit, idx, control_model_type, preprocessor) detected_maps.extend(additional_maps) if len(controls) == len(hr_controls) == 1 and control_model_type not in [ControlModelType.SparseCtrl]: control = controls[0] hr_control = hr_controls[0] elif unit.is_animate_diff_batch or control_model_type in [ControlModelType.SparseCtrl]: cn_ad_keyframe_idx = getattr(unit, "batch_keyframe_idx", None) def ad_process_control(cc: List[torch.Tensor], cn_ad_keyframe_idx=cn_ad_keyframe_idx): if unit.is_ipadapter: ip_adapter_image_emb_cond = [] model_net.ipadapter.image_proj_model.to(torch.float32) # noqa for c in cc: c = model_net.ipadapter.get_image_emb(c) # noqa ip_adapter_image_emb_cond.append(c.cond_emb) c_cond = torch.cat(ip_adapter_image_emb_cond, dim=0) c = ImageEmbed(c_cond, c.uncond_emb, True) else: c = torch.cat(cc, dim=0) # SparseCtrl keyframe need to encode control image with VAE if control_model_type == ControlModelType.SparseCtrl and \ model_net.control_model.use_simplified_condition_embedding: # noqa c = UnetHook.call_vae_using_process(p, c) # handle key frame control for different control methods if cn_ad_keyframe_idx is not None or control_model_type in [ControlModelType.SparseCtrl]: if control_model_type == ControlModelType.SparseCtrl: # sparsectrl has its own embed generator from scripts.controlnet_sparsectrl import SparseCtrl if cn_ad_keyframe_idx is None: cn_ad_keyframe_idx = [0] logger.info(f"SparseCtrl: control images will be applied to frames: {cn_ad_keyframe_idx}") else: logger.info(f"SparseCtrl: control images will be applied to frames: {cn_ad_keyframe_idx}") for frame_idx, frame_path in zip(unit.batch_keyframe_idx, unit.batch_image_files): logger.info(f"\t{frame_idx}: {frame_path}") c = SparseCtrl.create_cond_mask(cn_ad_keyframe_idx, c, p.batch_size).cpu() elif unit.is_ipadapter: # ip-adapter should do prompt travel logger.info("IP-Adapter: control prompts will be traveled in the following way:") for frame_idx, frame_path in zip(unit.batch_keyframe_idx, unit.batch_image_files): logger.info(f"\t{frame_idx}: {frame_path}") from scripts.animatediff_utils import get_animatediff_arg ip_adapter_emb = c c = c.cond_emb c_full = torch.zeros((p.batch_size, *c.shape[1:]), dtype=c.dtype, device=c.device) for i, idx in enumerate(cn_ad_keyframe_idx[:-1]): c_full[idx:cn_ad_keyframe_idx[i + 1]] = c[i] c_full[cn_ad_keyframe_idx[-1]:] = c[-1] ad_params = get_animatediff_arg(p) prompt_scheduler = deepcopy(ad_params.prompt_scheduler) prompt_scheduler.prompt_map = {i: "" for i in cn_ad_keyframe_idx} prompt_closed_loop = (ad_params.video_length > ad_params.batch_size) and (ad_params.closed_loop in ['R+P', 'A']) c_full = prompt_scheduler.multi_cond(c_full, prompt_closed_loop) if shared.opts.batch_cond_uncond: c_full = torch.cat([c_full, c_full], dim=0) c = ImageEmbed(c_full, ip_adapter_emb.uncond_emb, True) else: # normal CN should insert empty frames logger.info(f"ControlNet: control images will be applied to frames: {cn_ad_keyframe_idx} where") for frame_idx, frame_path in zip(unit.batch_keyframe_idx, unit.batch_image_files): logger.info(f"\t{frame_idx}: {frame_path}") c_full = torch.zeros((p.batch_size, *c.shape[1:]), dtype=c.dtype, device=c.device) c_full[cn_ad_keyframe_idx] = c c = c_full # handle batch condition and unconditional if shared.opts.batch_cond_uncond and not unit.accepts_multiple_inputs: c = torch.cat([c, c], dim=0) return c control = ad_process_control(controls) hr_control = ad_process_control(hr_controls) if hr_controls[0] is not None else None if control_model_type == ControlModelType.SparseCtrl: control_model_type = ControlModelType.ControlNet else: control = controls hr_control = hr_controls preprocessor_dict = dict( name=unit.module, preprocessor_resolution=unit.processor_res, threshold_a=unit.threshold_a, threshold_b=unit.threshold_b ) global_average_pooling = ( control_model_type.is_controlnet and model_net.control_model.global_average_pooling ) if control_model_type == ControlModelType.ControlNetUnion: logger.info(f"ControlNetUnion control type: {unit.union_control_type}") forward_param = ControlParams( control_model=model_net, preprocessor=preprocessor_dict, hint_cond=control, weight=unit.weight, guidance_stopped=False, start_guidance_percent=unit.guidance_start, stop_guidance_percent=unit.guidance_end, advanced_weighting=unit.advanced_weighting, control_model_type=control_model_type, global_average_pooling=global_average_pooling, hr_hint_cond=hr_control, hr_option=unit.hr_option if high_res_fix else HiResFixOption.BOTH, soft_injection=unit.control_mode != ControlMode.BALANCED, cfg_injection=unit.control_mode == ControlMode.CONTROL, effective_region_mask=( get_pytorch_control(unit.effective_region_mask)[:, 0:1, :, :] if unit.effective_region_mask is not None else None ), # TODO: Implement merge of units with the same union model. union_control_types=[unit.union_control_type], ) forward_params.append(forward_param) if 'inpaint_only' in unit.module: final_inpaint_feed = hr_control if hr_control is not None else control final_inpaint_feed = final_inpaint_feed.detach().cpu().numpy() final_inpaint_feed = np.ascontiguousarray(final_inpaint_feed).copy() final_inpaint_mask = final_inpaint_feed[:, 3, :, :].astype(np.float32) final_inpaint_raw = final_inpaint_feed[:, :3].astype(np.float32) sigma = shared.opts.data.get("control_net_inpaint_blur_sigma", 7) final_inpaint_mask_post_cv = [] for m in final_inpaint_mask: m = cv2.dilate(m, np.ones((sigma, sigma), dtype=np.uint8)) m = cv2.blur(m, (sigma, sigma))[None, None] final_inpaint_mask_post_cv.append(m) final_inpaint_mask = np.concatenate(final_inpaint_mask_post_cv, axis=0) _, _, Hmask, Wmask = final_inpaint_mask.shape final_inpaint_raw = torch.from_numpy(np.ascontiguousarray(final_inpaint_raw).copy()) final_inpaint_mask = torch.from_numpy(np.ascontiguousarray(final_inpaint_mask).copy()) def inpaint_only_post_processing(x, i): if i >= final_inpaint_raw.shape[0]: i = 0 _, H, W = x.shape if Hmask != H or Wmask != W: logger.error('Error: ControlNet find post-processing resolution mismatch. This could be related to other extensions hacked processing.') return x r = final_inpaint_raw[i].to(x.dtype).to(x.device) m = final_inpaint_mask[i].to(x.dtype).to(x.device) y = m * x.clip(0, 1) + (1 - m) * r y = y.clip(0, 1) return y post_processors.append(inpaint_only_post_processing) if 'recolor' in unit.module: final_feed = hr_control if hr_control is not None else control final_feed = final_feed.detach().cpu().numpy() final_feed = np.ascontiguousarray(final_feed).copy() final_feed = final_feed[:, 0, :, :].astype(np.float32) final_feed = (final_feed * 255).clip(0, 255).astype(np.uint8) _, Hfeed, Wfeed = final_feed.shape if 'luminance' in unit.module: def recolor_luminance_post_processing(x, i): if i >= final_feed.shape[0]: i = 0 C, H, W = x.shape if Hfeed != H or Wfeed != W or C != 3: logger.error('Error: ControlNet find post-processing resolution mismatch. This could be related to other extensions hacked processing.') return x h = x.detach().cpu().numpy().transpose((1, 2, 0)) h = (h * 255).clip(0, 255).astype(np.uint8) h = cv2.cvtColor(h, cv2.COLOR_RGB2LAB) h[:, :, 0] = final_feed[i] h = cv2.cvtColor(h, cv2.COLOR_LAB2RGB) h = (h.astype(np.float32) / 255.0).transpose((2, 0, 1)) y = torch.from_numpy(h).clip(0, 1).to(x) return y post_processors.append(recolor_luminance_post_processing) if 'intensity' in unit.module: def recolor_intensity_post_processing(x, i): if i >= final_feed.shape[0]: i = 0 C, H, W = x.shape if Hfeed != H or Wfeed != W or C != 3: logger.error('Error: ControlNet find post-processing resolution mismatch. This could be related to other extensions hacked processing.') return x h = x.detach().cpu().numpy().transpose((1, 2, 0)) h = (h * 255).clip(0, 255).astype(np.uint8) h = cv2.cvtColor(h, cv2.COLOR_RGB2HSV) h[:, :, 2] = final_feed[i] h = cv2.cvtColor(h, cv2.COLOR_HSV2RGB) h = (h.astype(np.float32) / 255.0).transpose((2, 0, 1)) y = torch.from_numpy(h).clip(0, 1).to(x) return y post_processors.append(recolor_intensity_post_processing) if '+lama' in unit.module: forward_param.used_hint_cond_latent = hook.UnetHook.call_vae_using_process(p, control) self.noise_modifier = forward_param.used_hint_cond_latent del model_net is_low_vram = any(unit.low_vram for unit in self.enabled_units) for i, (param, unit) in enumerate(zip(forward_params, self.enabled_units)): if param.control_model_type == ControlModelType.IPAdapter: if param.advanced_weighting is not None: logger.info(f"IP-Adapter using advanced weighting {param.advanced_weighting}") assert len(param.advanced_weighting) == global_state.get_sd_version().transformer_block_num # Convert advanced weighting list to dict weight = { i: w for i, w in enumerate(param.advanced_weighting) if w > 0 } else: weight = param.weight h, w, hr_y, hr_x = Script.get_target_dimensions(p) if unit.pulid_mode == PuLIDMode.STYLE: pulid_attn_setting = PULID_SETTING_STYLE else: assert unit.pulid_mode == PuLIDMode.FIDELITY pulid_attn_setting = PULID_SETTING_FIDELITY param.control_model.hook( model=unet, preprocessor_outputs=param.hint_cond, weight=weight, dtype=torch.float32, start=param.start_guidance_percent, end=param.stop_guidance_percent, latent_width=w // 8, latent_height=h // 8, effective_region_mask=param.effective_region_mask, pulid_attn_setting=pulid_attn_setting, ) if param.control_model_type == ControlModelType.Controlllite: param.control_model.hook( model=unet, cond=param.hint_cond, weight=param.weight, start=param.start_guidance_percent, end=param.stop_guidance_percent ) if param.control_model_type == ControlModelType.InstantID: # For instant_id we always expect ip-adapter model followed # by ControlNet model. assert i > 0, "InstantID control model should follow ipadapter model." ip_adapter_param = forward_params[i - 1] assert ip_adapter_param.control_model_type == ControlModelType.IPAdapter, \ "InstantID control model should follow ipadapter model." control_model = ip_adapter_param.control_model assert hasattr(control_model, "image_emb") param.control_context_override = control_model.image_emb self.latest_network = UnetHook(lowvram=is_low_vram) self.latest_network.hook(model=unet, sd_ldm=sd_ldm, control_params=forward_params, process=p, batch_option_uint_separate=batch_option_uint_separate, batch_option_style_align=batch_option_style_align) self.detected_map = detected_maps self.post_processors = post_processors def controlnet_hack(self, p): t = time.time() if getattr(shared.cmd_opts, 'controlnet_tracemalloc', False): tracemalloc.start() setattr(self, "malloc_begin", tracemalloc.take_snapshot()) self.controlnet_main_entry(p) if getattr(shared.cmd_opts, 'controlnet_tracemalloc', False): logger.info("After hook malloc:") for stat in tracemalloc.take_snapshot().compare_to(self.malloc_begin, "lineno")[:10]: logger.info(stat) if len(self.enabled_units) > 0: logger.info(f'ControlNet Hooked - Time = {time.time() - t}') @staticmethod def process_has_sdxl_refiner(p): return getattr(p, 'refiner_checkpoint', None) is not None def process(self, p, *args, **kwargs): if not Script.process_has_sdxl_refiner(p): self.controlnet_hack(p) return def before_process_batch(self, p, *args, **kwargs): if self.noise_modifier is not None: p.rng = HackedImageRNG(rng=p.rng, noise_modifier=self.noise_modifier, sd_model=p.sd_model) self.noise_modifier = None if Script.process_has_sdxl_refiner(p): self.controlnet_hack(p) return def postprocess_batch(self, p, *args, **kwargs): images = kwargs.get('images', []) for post_processor in self.post_processors: for i in range(len(images)): images[i] = post_processor(images[i], i) return def postprocess(self, p, processed, *args): sd_ldm = p.sd_model unet = sd_ldm.model.diffusion_model clear_all_secondary_control_models(unet) self.noise_modifier = None for control_lora in getattr(p, 'controlnet_control_loras', []): unbind_control_lora(control_lora) p.controlnet_control_loras = [] self.post_processors = [] setattr(p, 'controlnet_vae_cache', None) processor_params_flag = (', '.join(getattr(processed, 'extra_generation_params', []))).lower() self.post_processors = [] if not batch_hijack.instance.is_batch: self.enabled_units.clear() if shared.opts.data.get("control_net_detectmap_autosaving", False) and self.latest_network is not None: for detect_map, module in self.detected_map: detectmap_dir = os.path.join(shared.opts.data.get("control_net_detectedmap_dir", ""), module) if not os.path.isabs(detectmap_dir): detectmap_dir = os.path.join(p.outpath_samples, detectmap_dir) if module != "none": os.makedirs(detectmap_dir, exist_ok=True) img = Image.fromarray(np.ascontiguousarray(detect_map.clip(0, 255).astype(np.uint8)).copy()) save_image(img, detectmap_dir, module) if self.latest_network is None: return if not batch_hijack.instance.is_batch: if not shared.opts.data.get("control_net_no_detectmap", False): if 'sd upscale' not in processor_params_flag: if self.detected_map is not None: for detect_map, module in self.detected_map: if detect_map is None: continue detect_map = np.ascontiguousarray(detect_map.copy()).copy() detect_map = external_code.visualize_inpaint_mask(detect_map) processed.images.extend([ Image.fromarray( detect_map.clip(0, 255).astype(np.uint8) ) ]) self.input_image = None self.latest_network.restore() self.latest_network = None self.detected_map.clear() gc.collect() devices.torch_gc() if getattr(shared.cmd_opts, 'controlnet_tracemalloc', False): logger.info("After generation:") for stat in tracemalloc.take_snapshot().compare_to(self.malloc_begin, "lineno")[:10]: logger.info(stat) tracemalloc.stop() def batch_tab_process(self, p, batches, *args, **kwargs): is_img2img = isinstance(p, StableDiffusionProcessingImg2Img) if is_img2img != self.is_img2img: return self.enabled_units = Script.get_enabled_units(p) for unit_i, unit in enumerate(self.enabled_units): unit.batch_images = iter([batch[unit_i] for batch in batches]) def batch_tab_process_each(self, p, *args, **kwargs): is_img2img = isinstance(p, StableDiffusionProcessingImg2Img) if is_img2img != self.is_img2img: return for unit in self.enabled_units: if getattr(unit, 'loopback', False) and batch_hijack.instance.batch_index > 0: continue unit.image = next(unit.batch_images) def batch_tab_postprocess_each(self, p, processed, *args, **kwargs): is_img2img = isinstance(p, StableDiffusionProcessingImg2Img) if is_img2img != self.is_img2img: return for unit_i, unit in enumerate(self.enabled_units): if getattr(unit, 'loopback', False): output_images = getattr(processed, 'images', [])[processed.index_of_first_image:] if output_images: unit.image = np.array(output_images[0]) else: logger.warning(f'Warning: No loopback image found for controlnet unit {unit_i}. Using control map from last batch iteration instead') def batch_tab_postprocess(self, p, *args, **kwargs): is_img2img = isinstance(p, StableDiffusionProcessingImg2Img) if is_img2img != self.is_img2img: return self.enabled_units.clear() self.input_image = None if self.latest_network is None: return self.latest_network.restore() self.latest_network = None self.detected_map.clear() def on_ui_settings(): section = ('control_net', "ControlNet") shared.opts.add_option("control_net_detectedmap_dir", shared.OptionInfo( global_state.default_detectedmap_dir, "Directory for detected maps auto saving", section=section)) shared.opts.add_option("control_net_models_path", shared.OptionInfo( "", "Extra path to scan for ControlNet models (e.g. training output directory)", section=section)) shared.opts.add_option("control_net_modules_path", shared.OptionInfo( "", "Path to directory containing annotator model directories (overrides corresponding command line flag)", section=section).needs_reload_ui()) shared.opts.add_option("control_net_unit_count", shared.OptionInfo( 3, "Multi-ControlNet: ControlNet unit number", gr.Slider, {"minimum": 1, "maximum": 10, "step": 1}, section=section).needs_reload_ui()) shared.opts.add_option("control_net_model_cache_size", shared.OptionInfo( 2, "Model cache size", gr.Slider, {"minimum": 1, "maximum": 10, "step": 1}, section=section).needs_reload_ui()) shared.opts.add_option("control_net_inpaint_blur_sigma", shared.OptionInfo( 7, "ControlNet inpainting Gaussian blur sigma", gr.Slider, {"minimum": 0, "maximum": 64, "step": 1}, section=section)) shared.opts.add_option("control_net_no_detectmap", shared.OptionInfo( False, "Do not append detectmap to output", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("control_net_detectmap_autosaving", shared.OptionInfo( False, "Allow detectmap auto saving", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("control_net_allow_script_control", shared.OptionInfo( False, "Allow other script to control this extension", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("control_net_sync_field_args", shared.OptionInfo( True, "Paste ControlNet parameters in infotext", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("controlnet_show_batch_images_in_ui", shared.OptionInfo( False, "Show batch images in gradio gallery output", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("controlnet_increment_seed_during_batch", shared.OptionInfo( False, "Increment seed after each controlnet batch iteration", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("controlnet_disable_openpose_edit", shared.OptionInfo( False, "Disable openpose edit", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("controlnet_disable_photopea_edit", shared.OptionInfo( False, "Disable photopea edit", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("controlnet_photopea_warning", shared.OptionInfo( True, "Photopea popup warning", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("controlnet_ignore_noninpaint_mask", shared.OptionInfo( False, "Ignore mask on ControlNet input image if control type is not inpaint", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("controlnet_clip_detector_on_cpu", shared.OptionInfo( False, "Load CLIP preprocessor model on CPU", gr.Checkbox, {"interactive": True}, section=section)) shared.opts.add_option("controlnet_control_type_dropdown", shared.OptionInfo( False, "Display control type as dropdown", gr.Checkbox, {"interactive": True}, section=section).needs_reload_ui()) batch_hijack.instance.do_hijack() script_callbacks.on_ui_settings(on_ui_settings) script_callbacks.on_infotext_pasted(Infotext.on_infotext_pasted) script_callbacks.on_after_component(ControlNetUiGroup.on_after_component) script_callbacks.on_before_reload(ControlNetUiGroup.reset) ================================================ FILE: scripts/controlnet_core/controlnet_union.py ================================================ from collections import OrderedDict import torch import torch.nn as nn try: from sgm.modules.diffusionmodules.openaimodel import ( timestep_embedding, ) using_sgm = True except ImportError: from ldm.modules.diffusionmodules.openaimodel import ( timestep_embedding, ) using_sgm = False def attention_pytorch( q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False ): if skip_reshape: b, _, _, dim_head = q.shape else: b, _, dim_head = q.shape dim_head //= heads q, k, v = map( lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2), (q, k, v), ) out = torch.nn.functional.scaled_dot_product_attention( q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False ) out = out.transpose(1, 2).reshape(b, -1, heads * dim_head) return out class ControlAddEmbedding(nn.Module): def __init__( self, in_dim, out_dim, num_control_type, dtype=None, device=None, ): super().__init__() self.num_control_type = num_control_type self.in_dim = in_dim self.linear_1 = nn.Linear( in_dim * num_control_type, out_dim, dtype=dtype, device=device ) self.linear_2 = nn.Linear(out_dim, out_dim, dtype=dtype, device=device) def forward(self, control_type, dtype, device): c_type = torch.zeros((self.num_control_type,), device=device) c_type[control_type] = 1.0 c_type = ( timestep_embedding(c_type.flatten(), self.in_dim, repeat_only=False) .to(dtype) .reshape((-1, self.num_control_type * self.in_dim)) ) return self.linear_2(torch.nn.functional.silu(self.linear_1(c_type))) class OptimizedAttention(nn.Module): def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None): super().__init__() self.heads = nhead self.c = c self.in_proj = nn.Linear(c, c * 3, bias=True, dtype=dtype, device=device) self.out_proj = nn.Linear(c, c, bias=True, dtype=dtype, device=device) def forward(self, x): x = self.in_proj(x) q, k, v = x.split(self.c, dim=2) out = attention_pytorch(q, k, v, self.heads) return self.out_proj(out) class QuickGELU(nn.Module): def forward(self, x: torch.Tensor): return x * torch.sigmoid(1.702 * x) class ResBlockUnionControlnet(nn.Module): def __init__(self, dim, nhead, dtype=None, device=None, operations=None): super().__init__() self.attn = OptimizedAttention( dim, nhead, dtype=dtype, device=device, operations=operations ) self.ln_1 = nn.LayerNorm(dim, dtype=dtype, device=device) self.mlp = nn.Sequential( OrderedDict( [ ( "c_fc", nn.Linear(dim, dim * 4, dtype=dtype, device=device), ), ("gelu", QuickGELU()), ( "c_proj", nn.Linear(dim * 4, dim, dtype=dtype, device=device), ), ] ) ) self.ln_2 = nn.LayerNorm(dim, dtype=dtype, device=device) def attention(self, x: torch.Tensor): return self.attn(x) def forward(self, x: torch.Tensor): x = x + self.attention(self.ln_1(x)) x = x + self.mlp(self.ln_2(x)) return x ================================================ FILE: scripts/controlnet_diffusers.py ================================================ # https://gist.github.com/takuma104/4adfb3d968d80bea1d18a30c06439242 # 2nd editing by lllyasviel # =================# # UNet Conversion # # =================# unet_conversion_map = [ # (stable-diffusion, HF Diffusers) ("time_embed.0.weight", "time_embedding.linear_1.weight"), ("time_embed.0.bias", "time_embedding.linear_1.bias"), ("time_embed.2.weight", "time_embedding.linear_2.weight"), ("time_embed.2.bias", "time_embedding.linear_2.bias"), ("label_emb.0.0.weight", "add_embedding.linear_1.weight"), ("label_emb.0.0.bias", "add_embedding.linear_1.bias"), ("label_emb.0.2.weight", "add_embedding.linear_2.weight"), ("label_emb.0.2.bias", "add_embedding.linear_2.bias"), ("input_blocks.0.0.weight", "conv_in.weight"), ("input_blocks.0.0.bias", "conv_in.bias"), ("middle_block_out.0.weight", "controlnet_mid_block.weight"), ("middle_block_out.0.bias", "controlnet_mid_block.bias"), ] unet_conversion_map_resnet = [ # (stable-diffusion, HF Diffusers) ("in_layers.0", "norm1"), ("in_layers.2", "conv1"), ("out_layers.0", "norm2"), ("out_layers.3", "conv2"), ("emb_layers.1", "time_emb_proj"), ("skip_connection", "conv_shortcut"), ] unet_conversion_map_layer = [] # hardcoded number of downblocks and resnets/attentions... # would need smarter logic for other networks. for i in range(4): # loop over downblocks/upblocks for j in range(10): # loop over resnets/attentions for downblocks hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}." sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0." unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix)) hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}." sd_down_atn_prefix = f"input_blocks.{3 * i + j + 1}.1." unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix)) hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv." sd_downsample_prefix = f"input_blocks.{3 * (i + 1)}.0.op." unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix)) hf_mid_atn_prefix = "mid_block.attentions.0." sd_mid_atn_prefix = "middle_block.1." unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix)) for j in range(2): hf_mid_res_prefix = f"mid_block.resnets.{j}." sd_mid_res_prefix = f"middle_block.{2*j}." unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix)) # controlnet specific controlnet_cond_embedding_names = ['conv_in'] + [f'blocks.{i}' for i in range(6)] + ['conv_out'] for i, hf_prefix in enumerate(controlnet_cond_embedding_names): hf_prefix = f"controlnet_cond_embedding.{hf_prefix}." sd_prefix = f"input_hint_block.{i*2}." unet_conversion_map_layer.append((sd_prefix, hf_prefix)) for i in range(12): hf_prefix = f"controlnet_down_blocks.{i}." sd_prefix = f"zero_convs.{i}.0." unet_conversion_map_layer.append((sd_prefix, hf_prefix)) def convert_from_diffuser_state_dict(unet_state_dict): mapping = {k: k for k in unet_state_dict.keys()} for sd_name, hf_name in unet_conversion_map: mapping[hf_name] = sd_name for k, v in mapping.items(): if "resnets" in k: for sd_part, hf_part in unet_conversion_map_resnet: v = v.replace(hf_part, sd_part) mapping[k] = v for k, v in mapping.items(): for sd_part, hf_part in unet_conversion_map_layer: v = v.replace(hf_part, sd_part) mapping[k] = v new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items() if k in unet_state_dict} return new_state_dict ================================================ FILE: scripts/controlnet_lllite.py ================================================ # https://github.com/kohya-ss/ControlNet-LLLite-ComfyUI/blob/main/node_control_net_lllite.py import re import torch class LLLiteModule(torch.nn.Module): def __init__( self, name: str, is_conv2d: bool, in_dim: int, depth: int, cond_emb_dim: int, mlp_dim: int, ): super().__init__() self.name = name self.is_conv2d = is_conv2d self.is_first = False modules = [] modules.append(torch.nn.Conv2d(3, cond_emb_dim // 2, kernel_size=4, stride=4, padding=0)) # to latent (from VAE) size*2 if depth == 1: modules.append(torch.nn.ReLU(inplace=True)) modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=2, stride=2, padding=0)) elif depth == 2: modules.append(torch.nn.ReLU(inplace=True)) modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=4, stride=4, padding=0)) elif depth == 3: # kernel size 8は大きすぎるので、4にする / kernel size 8 is too large, so set it to 4 modules.append(torch.nn.ReLU(inplace=True)) modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim // 2, kernel_size=4, stride=4, padding=0)) modules.append(torch.nn.ReLU(inplace=True)) modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=2, stride=2, padding=0)) self.conditioning1 = torch.nn.Sequential(*modules) if self.is_conv2d: self.down = torch.nn.Sequential( torch.nn.Conv2d(in_dim, mlp_dim, kernel_size=1, stride=1, padding=0), torch.nn.ReLU(inplace=True), ) self.mid = torch.nn.Sequential( torch.nn.Conv2d(mlp_dim + cond_emb_dim, mlp_dim, kernel_size=1, stride=1, padding=0), torch.nn.ReLU(inplace=True), ) self.up = torch.nn.Sequential( torch.nn.Conv2d(mlp_dim, in_dim, kernel_size=1, stride=1, padding=0), ) else: self.down = torch.nn.Sequential( torch.nn.Linear(in_dim, mlp_dim), torch.nn.ReLU(inplace=True), ) self.mid = torch.nn.Sequential( torch.nn.Linear(mlp_dim + cond_emb_dim, mlp_dim), torch.nn.ReLU(inplace=True), ) self.up = torch.nn.Sequential( torch.nn.Linear(mlp_dim, in_dim), ) self.depth = depth self.cond_image = None self.cond_emb = None def set_cond_image(self, cond_image): self.cond_image = cond_image self.cond_emb = None def forward(self, x, blk_shape): if self.cond_emb is None: # print(f"cond_emb is None, {self.name}") cx = self.conditioning1(self.cond_image.to(x.device, dtype=x.dtype)) if blk_shape is not None: b, c, h, w = blk_shape cx = torch.nn.functional.interpolate(cx, (h, w), mode="nearest-exact") if not self.is_conv2d: # reshape / b,c,h,w -> b,h*w,c n, c, h, w = cx.shape cx = cx.view(n, c, h * w).permute(0, 2, 1) self.cond_emb = cx cx = self.cond_emb # uncond/condでxはバッチサイズが2倍 if x.shape[0] != cx.shape[0]: if self.is_conv2d: cx = cx.repeat(x.shape[0] // cx.shape[0], 1, 1, 1) else: # print("x.shape[0] != cx.shape[0]", x.shape[0], cx.shape[0]) cx = cx.repeat(x.shape[0] // cx.shape[0], 1, 1) cx = torch.cat([cx, self.down(x)], dim=1 if self.is_conv2d else 2) cx = self.mid(cx) cx = self.up(cx) return cx all_hack = {} def clear_all_lllite(): global all_hack for k, v in all_hack.items(): k.forward = v k.lllite_list = [] all_hack = {} return class PlugableControlLLLite(torch.nn.Module): def __init__(self, state_dict): super().__init__() self.cache = {} module_weights = {} for key, value in state_dict.items(): fragments = key.split(".") module_name = fragments[0] weight_name = ".".join(fragments[1:]) if module_name not in module_weights: module_weights[module_name] = {} module_weights[module_name][weight_name] = value modules = {} for module_name, weights in module_weights.items(): if "conditioning1.4.weight" in weights: depth = 3 elif weights["conditioning1.2.weight"].shape[-1] == 4: depth = 2 else: depth = 1 module = LLLiteModule( name=module_name, is_conv2d=weights["down.0.weight"].ndim == 4, in_dim=weights["down.0.weight"].shape[1], depth=depth, cond_emb_dim=weights["conditioning1.0.weight"].shape[0] * 2, mlp_dim=weights["down.0.weight"].shape[0], ) module.load_state_dict(weights) modules[module_name] = module setattr(self, module_name, module) if len(modules) == 1: module.is_first = True self.modules = modules return def reset(self): self.cache = {} return @torch.no_grad() def hook(self, model, cond, weight, start, end): global all_hack cond_image = cond * 2.0 - 1.0 for module in self.modules.values(): module.set_cond_image(cond_image) for k, v in self.modules.items(): k = k.replace('middle_block', 'middle_blocks_0') match = re.match("lllite_unet_(.*)_blocks_(.*)_1_transformer_blocks_(.*)_(.*)_to_(.*)", k, re.M | re.I) assert match, 'Failed to load ControlLLLite!' root = match.group(1) block = match.group(2) block_number = match.group(3) attn_name = match.group(4) proj_name = match.group(5) if root == 'input': b = model.input_blocks[int(block)][1].transformer_blocks[int(block_number)] elif root == 'output': b = model.output_blocks[int(block)][1].transformer_blocks[int(block_number)] else: b = model.middle_block[1].transformer_blocks[int(block_number)] b = getattr(b, attn_name, None) assert b is not None, 'Failed to load ControlLLLite!' b = getattr(b, 'to_' + proj_name, None) assert b is not None, 'Failed to load ControlLLLite!' if not hasattr(b, 'lllite_list'): b.lllite_list = [] if len(b.lllite_list) == 0: all_hack[b] = b.forward b.forward = self.get_hacked_forward(original_forward=b.forward, model=model, blk=b) b.lllite_list.append((weight, start, end, v)) return def get_hacked_forward(self, original_forward, model, blk): @torch.no_grad() def forward(x, **kwargs): current_sampling_percent = getattr(model, 'current_sampling_percent', 0.5) current_h_shape = getattr(model, 'current_h_shape', None) is_in_high_res_fix = getattr(model, 'is_in_high_res_fix', False) if not is_in_high_res_fix: hack = 0 for weight, start, end, module in blk.lllite_list: module.to(x.device) if current_sampling_percent < start or current_sampling_percent > end: hack = hack + 0 else: hack = hack + module(x, current_h_shape) * weight x = x + hack return original_forward(x, **kwargs) return forward ================================================ FILE: scripts/controlnet_lora.py ================================================ import torch from contextlib import contextmanager from typing import Union, Tuple _size_2_t = Union[int, Tuple[int, int]] class LinearWithLoRA(torch.nn.Module): def __init__( self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None: super().__init__() self.weight_module = None self.up = None self.down = None self.bias = None self.in_features = in_features self.out_features = out_features self.device = device self.dtype = dtype self.weight = None def bind_lora(self, weight_module): self.weight_module = [weight_module] def unbind_lora(self): if self.up is not None and self.down is not None: # SAI's model is weird and needs this self.weight_module = None def get_original_weight(self): if self.weight_module is None: return None return self.weight_module[0].weight def forward(self, x): if self.weight is not None: return torch.nn.functional.linear(x, self.weight.to(x), self.bias.to(x) if self.bias is not None else None) original_weight = self.get_original_weight() if original_weight is None: return None # A1111 needs first_time_calculation if self.up is not None and self.down is not None: weight = original_weight.to(x) + torch.mm(self.up, self.down).to(x) else: weight = original_weight.to(x) return torch.nn.functional.linear(x, weight, self.bias.to(x) if self.bias is not None else None) class Conv2dWithLoRA(torch.nn.Module): def __init__( self, in_channels: int, out_channels: int, kernel_size: _size_2_t, stride: _size_2_t = 1, padding: Union[str, _size_2_t] = 0, dilation: _size_2_t = 1, groups: int = 1, bias: bool = True, padding_mode: str = 'zeros', device=None, dtype=None ) -> None: super().__init__() self.stride = stride self.padding = padding self.dilation = dilation self.groups = groups self.weight_module = None self.bias = None self.up = None self.down = None self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.padding_mode = padding_mode self.device = device self.dtype = dtype self.weight = None def bind_lora(self, weight_module): self.weight_module = [weight_module] def unbind_lora(self): if self.up is not None and self.down is not None: # SAI's model is weird and needs this self.weight_module = None def get_original_weight(self): if self.weight_module is None: return None return self.weight_module[0].weight def forward(self, x): if self.weight is not None: return torch.nn.functional.conv2d(x, self.weight.to(x), self.bias.to(x) if self.bias is not None else None, self.stride, self.padding, self.dilation, self.groups) original_weight = self.get_original_weight() if original_weight is None: return None # A1111 needs first_time_calculation if self.up is not None and self.down is not None: weight = original_weight.to(x) + torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1)).reshape(original_weight.shape).to(x) else: weight = original_weight.to(x) return torch.nn.functional.conv2d(x, weight, self.bias.to(x) if self.bias is not None else None, self.stride, self.padding, self.dilation, self.groups) @contextmanager def controlnet_lora_hijack(): linear, conv2d = torch.nn.Linear, torch.nn.Conv2d torch.nn.Linear, torch.nn.Conv2d = LinearWithLoRA, Conv2dWithLoRA try: yield finally: torch.nn.Linear, torch.nn.Conv2d = linear, conv2d def recursive_set(obj, key, value): if obj is None: return if '.' in key: k1, k2 = key.split('.', 1) recursive_set(getattr(obj, k1, None), k2, value) else: setattr(obj, key, value) def force_load_state_dict(model, state_dict): for k in list(state_dict.keys()): recursive_set(model, k, torch.nn.Parameter(state_dict[k])) del state_dict[k] return def recursive_bind_lora(obj, key, value): if obj is None: return if '.' in key: k1, k2 = key.split('.', 1) recursive_bind_lora(getattr(obj, k1, None), k2, value) else: target = getattr(obj, key, None) if target is not None and hasattr(target, 'bind_lora'): target.bind_lora(value) def recursive_get(obj, key): if obj is None: return if '.' in key: k1, k2 = key.split('.', 1) return recursive_get(getattr(obj, k1, None), k2) else: return getattr(obj, key, None) def bind_control_lora(base_model, control_lora_model): sd = base_model.state_dict() keys = list(sd.keys()) keys = list(set([k.rsplit('.', 1)[0] for k in keys])) module_dict = {k: recursive_get(base_model, k) for k in keys} for k, v in module_dict.items(): recursive_bind_lora(control_lora_model, k, v) def torch_dfs(model: torch.nn.Module): result = [model] for child in model.children(): result += torch_dfs(child) return result def unbind_control_lora(control_lora_model): for m in torch_dfs(control_lora_model): if hasattr(m, 'unbind_lora'): m.unbind_lora() return ================================================ FILE: scripts/controlnet_model_guess.py ================================================ import copy import os import torch from pathlib import Path from typing import NamedTuple from modules import devices from scripts.adapter import PlugableAdapter, Adapter, StyleAdapter, Adapter_light from scripts.controlnet_lllite import PlugableControlLLLite from scripts.cldm import PlugableControlModel from scripts.controlnet_sparsectrl import PlugableSparseCtrlModel from scripts.ipadapter.ipadapter_model import IPAdapterModel from scripts.ipadapter.plugable_ipadapter import PlugableIPAdapter from scripts.logging import logger from scripts.controlnet_diffusers import convert_from_diffuser_state_dict from scripts.controlnet_lora import controlnet_lora_hijack, force_load_state_dict from scripts.enums import ControlModelType controlnet_default_config = {'adm_in_channels': None, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': 2, 'attention_resolutions': [1, 2, 4], 'transformer_depth': [1, 1, 1, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, "num_heads": 8, "global_average_pooling": False} controlnet_sdxl_config = {'num_classes': 'sequential', 'adm_in_channels': 2816, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': 2, 'attention_resolutions': [2, 4], 'transformer_depth': [0, 2, 10], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 10, 'use_linear_in_transformer': True, 'context_dim': 2048, "num_head_channels": 64, "global_average_pooling": False} controlnet_sdxl_mid_config = {'num_classes': 'sequential', 'adm_in_channels': 2816, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': 2, 'attention_resolutions': [4], 'transformer_depth': [0, 0, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True, 'context_dim': 2048, "num_head_channels": 64, "global_average_pooling": False} controlnet_sdxl_small_config = {'num_classes': 'sequential', 'adm_in_channels': 2816, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': 2, 'attention_resolutions': [], 'transformer_depth': [0, 0, 0], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 0, 'use_linear_in_transformer': True, "num_head_channels": 64, 'context_dim': 1, "global_average_pooling": False} t2i_adapter_config = { 'channels': [320, 640, 1280, 1280], 'nums_rb': 2, 'ksize': 1, 'sk': True, 'cin': 192, 'use_conv': False } t2i_adapter_light_config = { 'channels': [320, 640, 1280, 1280], 'nums_rb': 4, 'cin': 192, } t2i_adapter_style_config = { 'width': 1024, 'context_dim': 768, 'num_head': 8, 'n_layes': 3, 'num_token': 8, } # Stolen from https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/utils.py def state_dict_key_replace(state_dict, keys_to_replace): for x in keys_to_replace: if x in state_dict: state_dict[keys_to_replace[x]] = state_dict.pop(x) return state_dict # # Stolen from https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/utils.py def state_dict_prefix_replace(state_dict, replace_prefix): for rp in replace_prefix: replace = list(map(lambda a: (a, "{}{}".format(replace_prefix[rp], a[len(rp):])), filter(lambda a: a.startswith(rp), state_dict.keys()))) for x in replace: state_dict[x[1]] = state_dict.pop(x[0]) return state_dict class ControlModel(NamedTuple): model: torch.nn.Module type: ControlModelType def build_model_by_guess(state_dict, unet, model_path: str) -> ControlModel: if "lora_controlnet" in state_dict: is_sdxl = "input_blocks.11.0.in_layers.0.weight" not in state_dict logger.info(f"Using ControlNet lora ({'SDXL' if is_sdxl else 'SD15'})") del state_dict['lora_controlnet'] config = copy.deepcopy(controlnet_sdxl_config if is_sdxl else controlnet_default_config) config['global_average_pooling'] = False config['hint_channels'] = int(state_dict['input_hint_block.0.weight'].shape[1]) config['use_fp16'] = devices.dtype_unet == torch.float16 with controlnet_lora_hijack(): network = PlugableControlModel(config, state_dict=None) force_load_state_dict(network.control_model, state_dict) network.is_control_lora = True network.to(devices.dtype_unet) return ControlModel(network, ControlModelType.ControlLoRA) if "down_blocks.0.motion_modules.0.temporal_transformer.norm.weight" in state_dict: # sparsectrl config = copy.deepcopy(controlnet_default_config) if "input_hint_block.0.weight" in state_dict: # rgb config['use_simplified_condition_embedding'] = True config['conditioning_channels'] = 5 else: # scribble config['use_simplified_condition_embedding'] = False config['conditioning_channels'] = 4 config['use_fp16'] = devices.dtype_unet == torch.float16 network = PlugableSparseCtrlModel(config, state_dict) network.to(devices.dtype_unet) return ControlModel(network, ControlModelType.SparseCtrl) if "controlnet_cond_embedding.conv_in.weight" in state_dict: # diffusers state_dict = convert_from_diffuser_state_dict(state_dict) if 'adapter.body.0.resnets.0.block1.weight' in state_dict: # diffusers prefix_replace = {} for i in range(4): for j in range(2): prefix_replace["adapter.body.{}.resnets.{}.".format(i, j)] = "body.{}.".format(i * 2 + j) prefix_replace["adapter.body.{}.".format(i)] = "body.{}.".format(i * 2) prefix_replace["adapter."] = "" state_dict = state_dict_prefix_replace(state_dict, prefix_replace) if any('image_proj.' in x for x in state_dict.keys()) and any('ip_adapter.' in x for x in state_dict.keys()): # safetensor ipadapters st_model = {"image_proj": {}, "ip_adapter": {}} for key in state_dict.keys(): if key.startswith("image_proj."): st_model["image_proj"][key.replace("image_proj.", "")] = state_dict[key] elif key.startswith("ip_adapter."): st_model["ip_adapter"][key.replace("ip_adapter.", "")] = state_dict[key] # sort keys model = {"image_proj": st_model["image_proj"], "ip_adapter": {}} sorted_keys = sorted(st_model["ip_adapter"].keys(), key=lambda x: int(x.split(".")[0])) for key in sorted_keys: model["ip_adapter"][key] = st_model["ip_adapter"][key] state_dict = model del st_model model_has_shuffle_in_filename = 'shuffle' in Path(os.path.abspath(model_path)).stem.lower() state_dict = {k.replace("control_model.", ""): v for k, v in state_dict.items()} state_dict = {k.replace("adapter.", ""): v for k, v in state_dict.items()} if 'input_hint_block.0.weight' in state_dict: if 'label_emb.0.0.bias' not in state_dict: config = copy.deepcopy(controlnet_default_config) logger.info('controlnet_default_config') config['global_average_pooling'] = model_has_shuffle_in_filename config['hint_channels'] = int(state_dict['input_hint_block.0.weight'].shape[1]) config['context_dim'] = int(state_dict['input_blocks.5.1.transformer_blocks.0.attn2.to_k.weight'].shape[1]) for key in state_dict.keys(): p = state_dict[key] if 'proj_in.weight' in key or 'proj_out.weight' in key: if len(p.shape) == 2: p = p[..., None, None] state_dict[key] = p else: has_full_layers = 'input_blocks.8.1.transformer_blocks.9.norm3.weight' in state_dict has_mid_layers = 'input_blocks.8.1.transformer_blocks.0.norm3.weight' in state_dict if has_full_layers: config = copy.deepcopy(controlnet_sdxl_config) logger.info('controlnet_sdxl_config') elif has_mid_layers: config = copy.deepcopy(controlnet_sdxl_mid_config) logger.info('controlnet_sdxl_mid_config') else: config = copy.deepcopy(controlnet_sdxl_small_config) logger.info('controlnet_sdxl_small_config') config['global_average_pooling'] = False config['hint_channels'] = int(state_dict['input_hint_block.0.weight'].shape[1]) if 'difference' in state_dict and unet is not None: unet_state_dict = unet.state_dict() unet_state_dict_keys = unet_state_dict.keys() final_state_dict = {} for key in state_dict.keys(): p = state_dict[key] if key in unet_state_dict_keys: p_new = p + unet_state_dict[key].clone().cpu() else: p_new = p final_state_dict[key] = p_new state_dict = final_state_dict if "control_add_embedding.linear_1.bias" in state_dict: # Controlnet Union config["union_controlnet_num_control_type"] = state_dict["task_embedding"].shape[0] final_state_dict = {} for k in list(state_dict.keys()): new_k = k.replace('.attn.in_proj_', '.attn.in_proj.') final_state_dict[new_k] = state_dict.pop(k) state_dict = final_state_dict control_model_type = ControlModelType.ControlNetUnion elif "instant_id" in model_path.lower(): control_model_type = ControlModelType.InstantID else: control_model_type = ControlModelType.ControlNet config['use_fp16'] = devices.dtype_unet == torch.float16 network = PlugableControlModel(config, state_dict) network.to(devices.dtype_unet) return ControlModel(network, control_model_type) if 'conv_in.weight' in state_dict: logger.info('t2i_adapter_config') cin = int(state_dict['conv_in.weight'].shape[1]) channel = int(state_dict['conv_in.weight'].shape[0]) ksize = int(state_dict['body.0.block2.weight'].shape[2]) down_opts = tuple(filter(lambda item: item.endswith("down_opt.op.weight"), state_dict)) use_conv = len(down_opts) > 0 is_sdxl = cin == 256 or cin == 768 adapter = Adapter( cin=cin, channels=[channel, channel*2, channel*4, channel*4], nums_rb=2, ksize=ksize, sk=True, use_conv=use_conv, is_sdxl=is_sdxl ).cpu() adapter.load_state_dict(state_dict, strict=False) network = PlugableAdapter(adapter) return ControlModel(network, ControlModelType.T2I_Adapter) if 'style_embedding' in state_dict: config = copy.deepcopy(t2i_adapter_style_config) logger.info('t2i_adapter_style_config') adapter = StyleAdapter(**config).cpu() adapter.load_state_dict(state_dict, strict=False) network = PlugableAdapter(adapter) return ControlModel(network, ControlModelType.T2I_StyleAdapter) if 'body.0.in_conv.weight' in state_dict: config = copy.deepcopy(t2i_adapter_light_config) logger.info('t2i_adapter_light_config') config['cin'] = int(state_dict['body.0.in_conv.weight'].shape[1]) adapter = Adapter_light(**config).cpu() adapter.load_state_dict(state_dict, strict=False) network = PlugableAdapter(adapter) return ControlModel(network, ControlModelType.T2I_Adapter) if 'ip_adapter' in state_dict: network = PlugableIPAdapter(IPAdapterModel.load(state_dict, model_path)) return ControlModel(network, ControlModelType.IPAdapter) if any('lllite' in k for k in state_dict.keys()): network = PlugableControlLLLite(state_dict) network.to('cpu') return ControlModel(network, ControlModelType.Controlllite) raise Exception('[ControlNet Error] Cannot recognize the ControlModel!') ================================================ FILE: scripts/controlnet_sparsectrl.py ================================================ from typing import Tuple, List import torch from torch import nn from torch.nn import functional as F from scripts.cldm import PlugableControlModel, ControlNet, zero_module, conv_nd, TimestepEmbedSequential class PlugableSparseCtrlModel(PlugableControlModel): def __init__(self, config, state_dict=None): nn.Module.__init__(self) self.config = config self.control_model = SparseCtrl(**self.config).cpu() if state_dict is not None: self.control_model.load_state_dict(state_dict, strict=False) self.gpu_component = None class CondEmbed(nn.Module): def __init__( self, dims: int, conditioning_embedding_channels: int, conditioning_channels: int = 3, block_out_channels: Tuple[int] = (16, 32, 96, 256), ): super().__init__() self.conv_in = conv_nd(dims, conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) self.blocks = nn.ModuleList([]) for i in range(len(block_out_channels) - 1): channel_in = block_out_channels[i] channel_out = block_out_channels[i + 1] self.blocks.append(conv_nd(dims, channel_in, channel_in, kernel_size=3, padding=1)) self.blocks.append(conv_nd(dims, channel_in, channel_out, kernel_size=3, padding=1, stride=2)) self.conv_out = zero_module(conv_nd(dims, block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)) def forward(self, conditioning): embedding = self.conv_in(conditioning) embedding = F.silu(embedding) for block in self.blocks: embedding = block(embedding) embedding = F.silu(embedding) embedding = self.conv_out(embedding) return embedding class SparseCtrl(ControlNet): def __init__(self, use_simplified_condition_embedding=True, conditioning_channels=4, **kwargs): super().__init__(hint_channels=1, **kwargs) # we don't need hint_channels, but we need to set it to 1 to avoid errors self.use_simplified_condition_embedding = use_simplified_condition_embedding if use_simplified_condition_embedding: self.input_hint_block = TimestepEmbedSequential( zero_module(conv_nd(self.dims, conditioning_channels, kwargs.get("model_channels", 320), kernel_size=3, padding=1))) else: self.input_hint_block = TimestepEmbedSequential( CondEmbed( self.dims, kwargs.get("model_channels", 320), conditioning_channels=conditioning_channels,)) def load_state_dict(self, state_dict, strict=False): mm_dict = {} cn_dict = {} for k, v in state_dict.items(): if "motion_modules" in k: mm_dict[k] = v else: cn_dict[k] = v super().load_state_dict(cn_dict, strict=True) from scripts.animatediff_mm import MotionWrapper, MotionModuleType sparsectrl_mm = MotionWrapper("", "", MotionModuleType.SparseCtrl) sparsectrl_mm.load_state_dict(mm_dict, strict=True) for mm_idx, unet_idx in enumerate([1, 2, 4, 5, 7, 8, 10, 11]): mm_idx0, mm_idx1 = mm_idx // 2, mm_idx % 2 mm_inject = getattr(sparsectrl_mm.down_blocks[mm_idx0], "motion_modules")[mm_idx1] self.input_blocks[unet_idx].append(mm_inject) @staticmethod def create_cond_mask(control_image_index: List[int], control_image_latents: torch.Tensor, video_length: int): hint_cond = torch.zeros((video_length, *control_image_latents.shape[1:]), device=control_image_latents.device, dtype=control_image_latents.dtype) hint_cond[control_image_index] = control_image_latents[:len(control_image_index)] hint_cond_mask = torch.zeros((hint_cond.shape[0], 1, *hint_cond.shape[2:]), device=control_image_latents.device, dtype=control_image_latents.dtype) hint_cond_mask[control_image_index] = 1.0 return torch.cat([hint_cond, hint_cond_mask], dim=1) def forward(self, x, hint, timesteps, context, y=None, **kwargs): return super().forward(torch.zeros_like(x, device=x.device), hint, timesteps, context, y=y, **kwargs) ================================================ FILE: scripts/controlnet_ui/advanced_weight_control.py ================================================ import gradio as gr import matplotlib.pyplot as plt from matplotlib.patches import Patch import io import json from PIL import Image from typing import List from scripts.enums import StableDiffusionVersion from scripts.global_state import get_sd_version from scripts.ipadapter.weight import calc_weights INPUT_BLOCK_COLOR = "#61bdee" MIDDLE_BLOCK_COLOR = "#e2e2e2" OUTPUT_BLOCK_COLOR = "#dc6e55" def get_bar_colors( sd_version: StableDiffusionVersion, input_color, middle_color, output_color ): middle_block_idx = 4 if sd_version == StableDiffusionVersion.SDXL else 6 def get_color(idx): if idx < middle_block_idx: return input_color elif idx == middle_block_idx: return middle_color else: return output_color return [get_color(i) for i in range(sd_version.transformer_block_num)] def plot_weights( numbers: List[float], colors: List[str], ): # Create a bar chart plt.figure(figsize=(8, 4)) plt.bar(range(len(numbers)), numbers, color=colors) plt.xlabel("Transformer Index") plt.ylabel("Weight") plt.legend( handles=[ Patch(color=color, label=label) for color, label in ( (INPUT_BLOCK_COLOR, "Input Block"), (MIDDLE_BLOCK_COLOR, "Middle Block"), (OUTPUT_BLOCK_COLOR, "Output Block"), ) ], loc="best", ) # Save the plot to a BytesIO buffer buffer = io.BytesIO() plt.savefig(buffer, format="png") plt.close() buffer.seek(0) # Convert the buffer to a PIL image and return it image = Image.open(buffer) return image class AdvancedWeightControl: def __init__(self): self.group = None self.weight_type = None self.weight_plot = None self.weight_editor = None self.weight_composition = None def render(self): with gr.Group(visible=False) as self.group: with gr.Row(): self.weight_type = gr.Dropdown( choices=[ "normal", "ease in", "ease out", "ease in-out", "reverse in-out", "weak input", "weak output", "weak middle", "strong middle", "style transfer", "composition", "strong style transfer", "style and composition", "strong style and composition", ], label="Weight Type", value="normal", ) self.weight_composition = gr.Slider( label="Composition Weight", minimum=0, maximum=2.0, value=1.0, step=0.01, visible=False, ) self.weight_editor = gr.Textbox(label="Weights", visible=False) self.weight_plot = gr.Image( value=None, label="Weight Plot", interactive=False, visible=False, ) def register_callbacks( self, weight_input: gr.Slider, advanced_weighting: gr.State, control_type: gr.Radio, update_unit_counter: gr.Number, ): def advanced_weighting_supported(control_type: str) -> bool: return control_type in ("IP-Adapter", "Instant-ID") self.weight_type.change( fn=lambda weight_type: gr.update( visible=weight_type in ("style and composition", "strong style and composition") ), inputs=[self.weight_type], outputs=[self.weight_composition], ) def update_weight_textbox( control_type: str, weight_type: str, weight: float, weight_composition: float, ): if not advanced_weighting_supported(control_type): return gr.update() sd_version = get_sd_version() weights = calc_weights(weight_type, weight, sd_version, weight_composition) return gr.update(value=str([round(w, 2) for w in weights]), visible=True) trigger_inputs = [self.weight_type, weight_input, self.weight_composition] for trigger_input in trigger_inputs: trigger_input.change( fn=update_weight_textbox, inputs=[ control_type, self.weight_type, weight_input, self.weight_composition, ], outputs=[self.weight_editor], ) def update_plot(weights_string: str): try: weights = json.loads(weights_string) assert isinstance(weights, list) except Exception: return gr.update(visible=False) sd_version = get_sd_version() weight_plot = plot_weights( weights, get_bar_colors( sd_version, input_color=INPUT_BLOCK_COLOR, middle_color=MIDDLE_BLOCK_COLOR, output_color=OUTPUT_BLOCK_COLOR, ), ) return gr.update(value=weight_plot, visible=True) def update_advanced_weighting(weights_string: str): try: weights = json.loads(weights_string) assert isinstance(weights, list) except Exception: return None return weights self.weight_editor.change( fn=update_plot, inputs=[self.weight_editor], outputs=[self.weight_plot], ) self.weight_editor.change( fn=update_advanced_weighting, inputs=[self.weight_editor], outputs=[advanced_weighting], ).then( fn=lambda x: gr.update(value=x + 1), inputs=[update_unit_counter], outputs=[update_unit_counter], ) # Necessary to flush gr.State change to unit state. # TODO: Expose advanced weighting control for other control types. def control_type_change(control_type: str, old_weights): supported = advanced_weighting_supported(control_type) if supported: return ( gr.update(visible=supported), old_weights, gr.update(), gr.update(), ) else: return ( gr.update(visible=supported), None, gr.update(visible=False), gr.update(visible=False), ) control_type.change( fn=control_type_change, inputs=[control_type, advanced_weighting], outputs=[ self.group, advanced_weighting, self.weight_editor, self.weight_plot, ], ) ================================================ FILE: scripts/controlnet_ui/controlnet_ui_group.py ================================================ import json import gradio as gr import functools import itertools from typing import List, Optional, Union, Dict, Tuple, Literal, Any from dataclasses import dataclass import numpy as np from scripts.supported_preprocessor import Preprocessor from scripts.utils import svg_preprocess, read_image from scripts import ( global_state, external_code, ) from annotator.util import HWC3 from internal_controlnet.external_code import ControlNetUnit from scripts.logging import logger from scripts.controlnet_ui.openpose_editor import OpenposeEditor from scripts.controlnet_ui.photopea import Photopea from scripts.controlnet_ui.advanced_weight_control import AdvancedWeightControl from scripts.enums import ( InputMode, HiResFixOption, PuLIDMode, ControlMode, ResizeMode, ControlNetUnionControlType, ) from modules import shared from modules.ui_components import FormRow, FormHTML, ToolButton @dataclass class A1111Context: """Contains all components from A1111.""" img2img_batch_input_dir: Optional[gr.components.Component] = None img2img_batch_output_dir: Optional[gr.components.Component] = None txt2img_submit_button: Optional[gr.components.Component] = None img2img_submit_button: Optional[gr.components.Component] = None # Slider controls from A1111 WebUI. txt2img_w_slider: Optional[gr.components.Component] = None txt2img_h_slider: Optional[gr.components.Component] = None img2img_w_slider: Optional[gr.components.Component] = None img2img_h_slider: Optional[gr.components.Component] = None img2img_img2img_tab: Optional[gr.components.Component] = None img2img_img2img_sketch_tab: Optional[gr.components.Component] = None img2img_batch_tab: Optional[gr.components.Component] = None img2img_inpaint_tab: Optional[gr.components.Component] = None img2img_inpaint_sketch_tab: Optional[gr.components.Component] = None img2img_inpaint_upload_tab: Optional[gr.components.Component] = None img2img_inpaint_area: Optional[gr.components.Component] = None # txt2img_enable_hr is only available for A1111 > 1.7.0. txt2img_enable_hr: Optional[gr.components.Component] = None setting_sd_model_checkpoint: Optional[gr.components.Component] = None @property def img2img_inpaint_tabs(self) -> Tuple[gr.components.Component]: return ( self.img2img_inpaint_tab, self.img2img_inpaint_sketch_tab, self.img2img_inpaint_upload_tab, ) @property def img2img_non_inpaint_tabs(self) -> List[gr.components.Component]: return ( self.img2img_img2img_tab, self.img2img_img2img_sketch_tab, self.img2img_batch_tab, ) @property def ui_initialized(self) -> bool: optional_components = { # Optional components are only available after A1111 v1.7.0. "img2img_img2img_tab": "img2img_img2img_tab", "img2img_img2img_sketch_tab": "img2img_img2img_sketch_tab", "img2img_batch_tab": "img2img_batch_tab", "img2img_inpaint_tab": "img2img_inpaint_tab", "img2img_inpaint_sketch_tab": "img2img_inpaint_sketch_tab", "img2img_inpaint_upload_tab": "img2img_inpaint_upload_tab", # SDNext does not have this field. Temporarily disable the callback on # the checkpoint change until we find a way to register an event when # all A1111 UI components are ready. "setting_sd_model_checkpoint": "setting_sd_model_checkpoint", } return all( c for name, c in vars(self).items() if name not in optional_components.values() ) def set_component(self, component: gr.components.Component): id_mapping = { "img2img_batch_input_dir": "img2img_batch_input_dir", "img2img_batch_output_dir": "img2img_batch_output_dir", "txt2img_generate": "txt2img_submit_button", "img2img_generate": "img2img_submit_button", "txt2img_width": "txt2img_w_slider", "txt2img_height": "txt2img_h_slider", "img2img_width": "img2img_w_slider", "img2img_height": "img2img_h_slider", "img2img_img2img_tab": "img2img_img2img_tab", "img2img_img2img_sketch_tab": "img2img_img2img_sketch_tab", "img2img_batch_tab": "img2img_batch_tab", "img2img_inpaint_tab": "img2img_inpaint_tab", "img2img_inpaint_sketch_tab": "img2img_inpaint_sketch_tab", "img2img_inpaint_upload_tab": "img2img_inpaint_upload_tab", "img2img_inpaint_full_res": "img2img_inpaint_area", "txt2img_hr-checkbox": "txt2img_enable_hr", # backward compatibility for webui < 1.6.0 "txt2img_enable_hr": "txt2img_enable_hr", # setting_sd_model_checkpoint is expected to be initialized last. # "setting_sd_model_checkpoint": "setting_sd_model_checkpoint", } elem_id = getattr(component, "elem_id", None) # Do not set component if it has already been set. # https://github.com/Mikubill/sd-webui-controlnet/issues/2587 if elem_id in id_mapping and getattr(self, id_mapping[elem_id]) is None: setattr(self, id_mapping[elem_id], component) logger.debug(f"Setting {elem_id}.") logger.debug( f"A1111 initialized {sum(c is not None for c in vars(self).values())}/{len(vars(self).keys())}." ) def create_ui_unit( input_mode: InputMode = InputMode.SIMPLE, batch_images: Optional[Any] = None, output_dir: str = "", loopback: bool = False, merge_gallery_files: List[Dict[Union[Literal["name"], Literal["data"]], str]] = [], use_preview_as_input: bool = False, generated_image: Optional[np.ndarray] = None, *args, ) -> ControlNetUnit: unit_dict = { k: v for k, v in zip( vars(ControlNetUnit()).keys(), itertools.chain( [True, input_mode, batch_images, output_dir, loopback], args ), ) } if use_preview_as_input and generated_image is not None: input_image = generated_image unit_dict["module"] = "none" else: input_image = unit_dict["image"] if merge_gallery_files and input_mode == InputMode.MERGE: input_image = [ {"image": read_image(file["name"])} for file in merge_gallery_files ] unit_dict["image"] = input_image return ControlNetUnit.from_dict(unit_dict) class ControlNetUiGroup(object): refresh_symbol = "\U0001f504" # 🔄 switch_values_symbol = "\U000021C5" # ⇅ camera_symbol = "\U0001F4F7" # 📷 reverse_symbol = "\U000021C4" # ⇄ tossup_symbol = "\u2934" trigger_symbol = "\U0001F4A5" # 💥 open_symbol = "\U0001F4DD" # 📝 tooltips = { "🔄": "Refresh", "\u2934": "Send dimensions to stable diffusion", "💥": "Run preprocessor", "📝": "Open new canvas", "📷": "Enable webcam", "⇄": "Mirror webcam", } global_batch_input_dir = gr.Textbox( label="Controlnet input directory", placeholder="Leave empty to use input directory", **shared.hide_dirs, elem_id="controlnet_batch_input_dir", ) a1111_context = A1111Context() # All ControlNetUiGroup instances created. all_ui_groups: List["ControlNetUiGroup"] = [] def __init__( self, is_img2img: bool, photopea: Optional[Photopea], ): # Whether callbacks have been registered. self.callbacks_registered: bool = False # Whether the render method on this object has been called. self.ui_initialized: bool = False self.is_img2img = is_img2img self.default_unit = ControlNetUnit() self.photopea = photopea self.webcam_enabled = False self.webcam_mirrored = False # Note: All gradio elements declared in `render` will be defined as member variable. # Update counter to trigger a force update of ControlNetUnit. # This is useful when a field with no event subscriber available changes. # e.g. gr.Gallery, gr.State, etc. self.update_unit_counter = None self.upload_tab = None self.image = None self.generated_image_group = None self.generated_image = None self.mask_image_group = None self.effective_region_mask = None self.batch_tab = None self.batch_image_dir = None self.merge_tab = None self.merge_gallery = None self.merge_upload_button = None self.merge_clear_button = None self.create_canvas = None self.canvas_width = None self.canvas_height = None self.canvas_create_button = None self.canvas_cancel_button = None self.open_new_canvas_button = None self.webcam_enable = None self.webcam_mirror = None self.send_dimen_button = None self.enabled = None self.low_vram = None self.pixel_perfect = None self.preprocessor_preview = None self.mask_upload = None self.type_filter = None self.module = None self.trigger_preprocessor = None self.model = None self.refresh_models = None self.weight = None self.guidance_start = None self.guidance_end = None self.advanced = None self.processor_res = None self.threshold_a = None self.threshold_b = None self.control_mode = None self.resize_mode = None self.loopback = None self.use_preview_as_input = None self.openpose_editor = None self.upload_independent_img_in_img2img = None self.image_upload_panel = None self.save_detected_map = None self.input_mode = gr.State(InputMode.SIMPLE) self.inpaint_crop_input_image = None self.hr_option = None self.advanced_weight_control = AdvancedWeightControl() self.batch_image_dir_state = None self.output_dir_state = None self.advanced_weighting = gr.State(None) self.pulid_mode = None self.union_control_type = None # API-only fields self.ipadapter_input = gr.State(None) ControlNetUiGroup.all_ui_groups.append(self) def render(self, tabname: str, elem_id_tabname: str) -> None: """The pure HTML structure of a single ControlNetUnit. Calling this function will populate `self` with all gradio element declared in local scope. Args: tabname: elem_id_tabname: Returns: None """ self.update_unit_counter = gr.Number(value=0, visible=False) self.openpose_editor = OpenposeEditor() with gr.Group(visible=not self.is_img2img) as self.image_upload_panel: self.save_detected_map = gr.Checkbox(value=True, visible=False) with gr.Tabs(): with gr.Tab(label="Single Image") as self.upload_tab: with gr.Row(elem_classes=["cnet-image-row"], equal_height=True): with gr.Group(elem_classes=["cnet-input-image-group"]): self.image = gr.Image( source="upload", brush_radius=20, mirror_webcam=False, type="numpy", tool="sketch", elem_id=f"{elem_id_tabname}_{tabname}_input_image", elem_classes=["cnet-image"], brush_color=( shared.opts.img2img_inpaint_mask_brush_color if hasattr( shared.opts, "img2img_inpaint_mask_brush_color" ) else None ), ) self.image.preprocess = functools.partial( svg_preprocess, preprocess=self.image.preprocess ) self.openpose_editor.render_upload() with gr.Group( visible=False, elem_classes=["cnet-generated-image-group"] ) as self.generated_image_group: self.generated_image = gr.Image( value=None, label="Preprocessor Preview", elem_id=f"{elem_id_tabname}_{tabname}_generated_image", elem_classes=["cnet-image"], interactive=True, height=242, ) # Gradio's magic number. Only 242 works. with gr.Group( elem_classes=["cnet-generated-image-control-group"] ): if self.photopea: self.photopea.render_child_trigger() self.openpose_editor.render_edit() preview_check_elem_id = f"{elem_id_tabname}_{tabname}_controlnet_preprocessor_preview_checkbox" preview_close_button_js = f"document.querySelector('#{preview_check_elem_id} input[type=\\'checkbox\\']').click();" gr.HTML( value=f"""Close""", visible=True, elem_classes=["cnet-close-preview"], ) with gr.Tab(label="Batch") as self.batch_tab: self.batch_image_dir = gr.Textbox( label="Input Directory", placeholder="Leave empty to use img2img batch controlnet input directory", elem_id=f"{elem_id_tabname}_{tabname}_batch_image_dir", ) with gr.Tab(label="Multi-Inputs") as self.merge_tab: self.merge_gallery = gr.Gallery( columns=[4], rows=[2], object_fit="contain", height="auto" ) with gr.Row(): self.merge_upload_button = gr.UploadButton( "Upload Images", file_types=["image"], file_count="multiple", ) self.merge_clear_button = gr.Button("Clear Images") # Note: effective region mask works with all 3 input types. with gr.Group( visible=False, elem_classes=["cnet-mask-image-group"] ) as self.mask_image_group: self.effective_region_mask = gr.Image( value=None, label="Effective Region Mask", elem_id=f"{elem_id_tabname}_{tabname}_mask_image", elem_classes=["cnet-effective-region-mask-image"], interactive=True, ) if self.photopea: self.photopea.attach_photopea_output(self.generated_image) with gr.Accordion( label="Open New Canvas", visible=False ) as self.create_canvas: self.canvas_width = gr.Slider( label="New Canvas Width", minimum=256, maximum=1024, value=512, step=64, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_canvas_width", ) self.canvas_height = gr.Slider( label="New Canvas Height", minimum=256, maximum=1024, value=512, step=64, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_canvas_height", ) with gr.Row(): self.canvas_create_button = gr.Button( value="Create New Canvas", elem_id=f"{elem_id_tabname}_{tabname}_controlnet_canvas_create_button", ) self.canvas_cancel_button = gr.Button( value="Cancel", elem_id=f"{elem_id_tabname}_{tabname}_controlnet_canvas_cancel_button", ) with gr.Row(elem_classes="controlnet_image_controls"): FormHTML( value="

Set the preprocessor to [invert] If your image has white background and black lines.

", elem_classes="controlnet_invert_warning", ) self.open_new_canvas_button = ToolButton( value=ControlNetUiGroup.open_symbol, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_open_new_canvas_button", tooltip=ControlNetUiGroup.tooltips[ControlNetUiGroup.open_symbol], ) self.webcam_enable = ToolButton( value=ControlNetUiGroup.camera_symbol, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_webcam_enable", tooltip=ControlNetUiGroup.tooltips[ControlNetUiGroup.camera_symbol], ) self.webcam_mirror = ToolButton( value=ControlNetUiGroup.reverse_symbol, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_webcam_mirror", tooltip=ControlNetUiGroup.tooltips[ ControlNetUiGroup.reverse_symbol ], ) self.send_dimen_button = ToolButton( value=ControlNetUiGroup.tossup_symbol, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_send_dimen_button", tooltip=ControlNetUiGroup.tooltips[ControlNetUiGroup.tossup_symbol], ) with FormRow(elem_classes=["controlnet_main_options"]): self.enabled = gr.Checkbox( label="Enable", value=self.default_unit.enabled, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_enable_checkbox", elem_classes=["cnet-unit-enabled"], ) self.low_vram = gr.Checkbox( label="Low VRAM", value=self.default_unit.low_vram, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_low_vram_checkbox", ) self.pixel_perfect = gr.Checkbox( label="Pixel Perfect", value=self.default_unit.pixel_perfect, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_pixel_perfect_checkbox", ) self.preprocessor_preview = gr.Checkbox( label="Allow Preview", value=False, elem_classes=["cnet-allow-preview"], elem_id=preview_check_elem_id, visible=not self.is_img2img, ) self.mask_upload = gr.Checkbox( label="Effective Region Mask", value=False, elem_classes=["cnet-mask-upload"], elem_id=f"{elem_id_tabname}_{tabname}_controlnet_mask_upload_checkbox", ) self.use_preview_as_input = gr.Checkbox( label="Preview as Input", value=False, elem_classes=["cnet-preview-as-input"], visible=False, ) with gr.Row(elem_classes="controlnet_img2img_options"): if self.is_img2img: self.upload_independent_img_in_img2img = gr.Checkbox( label="Upload independent control image", value=False, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_same_img2img_checkbox", elem_classes=["cnet-unit-same_img2img"], ) else: self.upload_independent_img_in_img2img = None # Note: The checkbox needs to exist for both img2img and txt2img as infotext # needs the checkbox value. self.inpaint_crop_input_image = gr.Checkbox( label="Crop input image based on A1111 mask", value=False, elem_classes=["cnet-crop-input-image"], visible=False, ) with gr.Row(): self.union_control_type = gr.Textbox( label="Union Control Type", value=ControlNetUnionControlType.UNKNOWN.value, visible=False, ) with gr.Row(elem_classes=["controlnet_control_type", "controlnet_row"]): self.type_filter = ( gr.Dropdown if shared.opts.data.get("controlnet_control_type_dropdown", False) else gr.Radio )( Preprocessor.get_all_preprocessor_tags(), label="Control Type", value="All", elem_id=f"{elem_id_tabname}_{tabname}_controlnet_type_filter_radio", elem_classes="controlnet_control_type_filter_group", ) with gr.Row(elem_classes=["controlnet_preprocessor_model", "controlnet_row"]): self.module = gr.Dropdown( [p.label for p in Preprocessor.get_sorted_preprocessors()], label="Preprocessor", value=self.default_unit.module, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_preprocessor_dropdown", ) self.trigger_preprocessor = ToolButton( value=ControlNetUiGroup.trigger_symbol, visible=not self.is_img2img, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_trigger_preprocessor", elem_classes=["cnet-run-preprocessor"], tooltip=ControlNetUiGroup.tooltips[ControlNetUiGroup.trigger_symbol], ) self.model = gr.Dropdown( list(global_state.cn_models.keys()), label="Model", value=self.default_unit.model, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_model_dropdown", ) self.refresh_models = ToolButton( value=ControlNetUiGroup.refresh_symbol, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_refresh_models", tooltip=ControlNetUiGroup.tooltips[ControlNetUiGroup.refresh_symbol], ) with gr.Row(elem_classes=["controlnet_weight_steps", "controlnet_row"]): self.weight = gr.Slider( label="Control Weight", value=self.default_unit.weight, minimum=0.0, maximum=2.0, step=0.05, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_control_weight_slider", elem_classes="controlnet_control_weight_slider", ) self.guidance_start = gr.Slider( label="Starting Control Step", value=self.default_unit.guidance_start, minimum=0.0, maximum=1.0, interactive=True, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_start_control_step_slider", elem_classes="controlnet_start_control_step_slider", ) self.guidance_end = gr.Slider( label="Ending Control Step", value=self.default_unit.guidance_end, minimum=0.0, maximum=1.0, interactive=True, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_ending_control_step_slider", elem_classes="controlnet_ending_control_step_slider", ) # advanced options with gr.Column(visible=False) as self.advanced: self.processor_res = gr.Slider( label="Preprocessor resolution", value=self.default_unit.processor_res, minimum=64, maximum=2048, visible=False, interactive=True, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_preprocessor_resolution_slider", ) self.threshold_a = gr.Slider( label="Threshold A", value=self.default_unit.threshold_a, minimum=64, maximum=1024, visible=False, interactive=True, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_threshold_A_slider", ) self.threshold_b = gr.Slider( label="Threshold B", value=self.default_unit.threshold_b, minimum=64, maximum=1024, visible=False, interactive=True, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_threshold_B_slider", ) self.control_mode = gr.Radio( choices=[e.value for e in ControlMode], value=self.default_unit.control_mode.value, label="Control Mode", elem_id=f"{elem_id_tabname}_{tabname}_controlnet_control_mode_radio", elem_classes="controlnet_control_mode_radio", ) self.resize_mode = gr.Radio( choices=[e.value for e in ResizeMode], value=self.default_unit.resize_mode.value, label="Resize Mode", elem_id=f"{elem_id_tabname}_{tabname}_controlnet_resize_mode_radio", elem_classes="controlnet_resize_mode_radio", visible=not self.is_img2img, ) self.hr_option = gr.Radio( choices=[e.value for e in HiResFixOption], value=self.default_unit.hr_option.value, label="Hires-Fix Option", elem_id=f"{elem_id_tabname}_{tabname}_controlnet_hr_option_radio", elem_classes="controlnet_hr_option_radio", visible=False, ) self.pulid_mode = gr.Radio( choices=[e.value for e in PuLIDMode], value=self.default_unit.pulid_mode.value, label="PuLID Mode", elem_id=f"{elem_id_tabname}_{tabname}_controlnet_pulid_mode_radio", elem_classes="controlnet_pulid_mode_radio", visible=False, ) self.loopback = gr.Checkbox( label="[Batch Loopback] Automatically send generated images to this ControlNet unit in batch generation", value=False, elem_id=f"{elem_id_tabname}_{tabname}_controlnet_automatically_send_generated_images_checkbox", elem_classes="controlnet_loopback_checkbox", visible=False, ) self.advanced_weight_control.render() self.batch_image_dir_state = gr.State("") self.output_dir_state = gr.State("") unit_args = ( self.input_mode, self.batch_image_dir_state, self.output_dir_state, self.loopback, # Non-persistent fields. # Following inputs will not be persistent on `ControlNetUnit`. # They are only used during object construction. self.merge_gallery, self.use_preview_as_input, self.generated_image, # End of Non-persistent fields. self.enabled, self.module, self.model, self.weight, self.image, self.resize_mode, self.low_vram, self.processor_res, self.threshold_a, self.threshold_b, self.guidance_start, self.guidance_end, self.pixel_perfect, self.control_mode, self.inpaint_crop_input_image, self.hr_option, self.save_detected_map, self.advanced_weighting, self.effective_region_mask, self.pulid_mode, self.union_control_type, ) unit = gr.State(ControlNetUnit()) # It is necessary to update unit state actively to avoid potential # flaky racing issue. # https://github.com/Mikubill/sd-webui-controlnet/issues/2875 for comp in unit_args + (self.update_unit_counter,): event_subscribers = [] if hasattr(comp, "edit"): event_subscribers.append(comp.edit) elif hasattr(comp, "click"): event_subscribers.append(comp.click) elif isinstance(comp, gr.Slider) and hasattr(comp, "release"): event_subscribers.append(comp.release) elif hasattr(comp, "change"): event_subscribers.append(comp.change) if hasattr(comp, "clear"): event_subscribers.append(comp.clear) for event_subscriber in event_subscribers: event_subscriber( fn=create_ui_unit, inputs=list(unit_args), outputs=unit ) ( ControlNetUiGroup.a1111_context.img2img_submit_button if self.is_img2img else ControlNetUiGroup.a1111_context.txt2img_submit_button ).click( fn=create_ui_unit, inputs=list(unit_args), outputs=unit, queue=False, ) self.register_core_callbacks() self.ui_initialized = True return unit def register_send_dimensions(self): """Register event handler for send dimension button.""" def send_dimensions(image): def closesteight(num): rem = num % 8 if rem <= 4: return round(num - rem) else: return round(num + (8 - rem)) if image: interm = np.asarray(image.get("image")) return closesteight(interm.shape[1]), closesteight(interm.shape[0]) else: return gr.Slider.update(), gr.Slider.update() outputs = ( [ ControlNetUiGroup.a1111_context.img2img_w_slider, ControlNetUiGroup.a1111_context.img2img_h_slider, ] if self.is_img2img else [ ControlNetUiGroup.a1111_context.txt2img_w_slider, ControlNetUiGroup.a1111_context.txt2img_h_slider, ] ) self.send_dimen_button.click( fn=send_dimensions, inputs=[self.image], outputs=outputs, show_progress=False, ) def register_webcam_toggle(self): def webcam_toggle(): self.webcam_enabled = not self.webcam_enabled return { "value": None, "source": "webcam" if self.webcam_enabled else "upload", "__type__": "update", } self.webcam_enable.click( webcam_toggle, inputs=None, outputs=self.image, show_progress=False ) def register_webcam_mirror_toggle(self): def webcam_mirror_toggle(): self.webcam_mirrored = not self.webcam_mirrored return {"mirror_webcam": self.webcam_mirrored, "__type__": "update"} self.webcam_mirror.click( webcam_mirror_toggle, inputs=None, outputs=self.image, show_progress=False ) def register_refresh_all_models(self): def refresh_all_models(model: str): global_state.update_cn_models() choices = list(global_state.cn_models.keys()) return gr.Dropdown.update( value=model if model in global_state.cn_models else "None", choices=choices, ) self.refresh_models.click( refresh_all_models, inputs=[self.model], outputs=[self.model], show_progress=False, ) def register_build_sliders(self): def build_sliders(module: str, pp: bool): preprocessor = Preprocessor.get_preprocessor(module) slider_resolution_kwargs = ( preprocessor.slider_resolution.gradio_update_kwargs.copy() ) if pp: slider_resolution_kwargs["visible"] = False grs = [ gr.update(**slider_resolution_kwargs), gr.update(**preprocessor.slider_1.gradio_update_kwargs.copy()), gr.update(**preprocessor.slider_2.gradio_update_kwargs.copy()), gr.update(visible=True), gr.update(visible=not preprocessor.do_not_need_model), gr.update(visible=not preprocessor.do_not_need_model), gr.update(visible=preprocessor.show_control_mode), ] return grs inputs = [ self.module, self.pixel_perfect, ] outputs = [ self.processor_res, self.threshold_a, self.threshold_b, self.advanced, self.model, self.refresh_models, self.control_mode, ] self.module.change( build_sliders, inputs=inputs, outputs=outputs, show_progress=False ) self.pixel_perfect.change( build_sliders, inputs=inputs, outputs=outputs, show_progress=False ) def filter_selected(k: str): logger.debug(f"Switch to control type {k}") ( filtered_preprocessor_list, filtered_model_list, default_option, default_model, ) = global_state.select_control_type(k, global_state.get_sd_version()) return [ gr.Dropdown.update( value=default_option, choices=filtered_preprocessor_list ), gr.Dropdown.update(value=default_model, choices=filtered_model_list), ] self.type_filter.change( fn=filter_selected, inputs=[self.type_filter], outputs=[self.module, self.model], show_progress=False, ) def register_union_control_type(self): def filter_selected(k: str): control_type = ControlNetUnionControlType.from_str(k) logger.debug(f"Switch to union control type {control_type}") return gr.update(value=control_type.value) self.type_filter.change( fn=filter_selected, inputs=[self.type_filter], outputs=[self.union_control_type], show_progress=False, ) def register_sd_version_changed(self): def sd_version_changed(type_filter: str, current_model: str): """When SD version changes, update model dropdown choices.""" ( filtered_preprocessor_list, filtered_model_list, default_option, default_model, ) = global_state.select_control_type( type_filter, global_state.get_sd_version() ) if current_model in filtered_model_list: return gr.update() return gr.Dropdown.update( value=default_model, choices=filtered_model_list, ) if ControlNetUiGroup.a1111_context.setting_sd_model_checkpoint: ControlNetUiGroup.a1111_context.setting_sd_model_checkpoint.change( fn=sd_version_changed, inputs=[self.type_filter, self.model], outputs=[self.model], show_progress=False, ) def register_run_annotator(self): def run_annotator( image, module, pres, pthr_a, pthr_b, t2i_w, t2i_h, pp, rm, model: str ): if image is None: return ( gr.update(value=None, visible=True), gr.update(), *self.openpose_editor.update(""), ) img = HWC3(image["image"]) has_mask = not ( (image["mask"][:, :, 0] <= 5).all() or (image["mask"][:, :, 0] >= 250).all() ) if "inpaint" in module: color = HWC3(image["image"]) alpha = image["mask"][:, :, 0:1] img = np.concatenate([color, alpha], axis=2) elif has_mask and not shared.opts.data.get( "controlnet_ignore_noninpaint_mask", False ): img = HWC3(image["mask"][:, :, 0]) preprocessor = Preprocessor.get_preprocessor(module) if pp: pres = external_code.pixel_perfect_resolution( img, target_H=t2i_h, target_W=t2i_w, resize_mode=external_code.resize_mode_from_value(rm), ) class JsonAcceptor: def __init__(self) -> None: self.value = "" def accept(self, json_dict: dict) -> None: self.value = json.dumps(json_dict) json_acceptor = JsonAcceptor() logger.info(f"Preview Resolution = {pres}") def is_openpose(module: str): return "openpose" in module # Only openpose preprocessor returns a JSON output, pass json_acceptor # only when a JSON output is expected. This will make preprocessor cache # work for all other preprocessors other than openpose ones. JSON acceptor # instance are different every call, which means cache will never take # effect. # TODO: Maybe we should let `preprocessor` return a Dict to alleviate this issue? # This requires changing all callsites though. result = preprocessor.cached_call( img, resolution=pres, slider_1=pthr_a, slider_2=pthr_b, low_vram=( ("clip" in module or module == "ip-adapter_face_id_plus") and shared.opts.data.get("controlnet_clip_detector_on_cpu", False) ), json_pose_callback=( json_acceptor.accept if is_openpose(module) else None ), model=model, ) return ( # Update to `generated_image` gr.update( value=result.display_images[0], visible=True, interactive=False ), # preprocessor_preview gr.update(value=True), # openpose editor *self.openpose_editor.update(json_acceptor.value), ) self.trigger_preprocessor.click( fn=run_annotator, inputs=[ self.image, self.module, self.processor_res, self.threshold_a, self.threshold_b, ( ControlNetUiGroup.a1111_context.img2img_w_slider if self.is_img2img else ControlNetUiGroup.a1111_context.txt2img_w_slider ), ( ControlNetUiGroup.a1111_context.img2img_h_slider if self.is_img2img else ControlNetUiGroup.a1111_context.txt2img_h_slider ), self.pixel_perfect, self.resize_mode, self.model, ], outputs=[ self.generated_image, self.preprocessor_preview, *self.openpose_editor.outputs(), ], ) def register_shift_preview(self): def shift_preview(is_on): return ( # generated_image gr.update() if is_on else gr.update(value=None), # generated_image_group gr.update(visible=is_on), # use_preview_as_input, gr.update(visible=False), # Now this is automatically managed # download_pose_link gr.update() if is_on else gr.update(value=None), # modal edit button gr.update() if is_on else gr.update(visible=False), ) self.preprocessor_preview.change( fn=shift_preview, inputs=[self.preprocessor_preview], outputs=[ self.generated_image, self.generated_image_group, self.use_preview_as_input, self.openpose_editor.download_link, self.openpose_editor.modal, ], show_progress=False, ) def register_create_canvas(self): self.open_new_canvas_button.click( lambda: gr.Accordion.update(visible=True), inputs=None, outputs=self.create_canvas, show_progress=False, ) self.canvas_cancel_button.click( lambda: gr.Accordion.update(visible=False), inputs=None, outputs=self.create_canvas, show_progress=False, ) def fn_canvas(h, w): return np.zeros(shape=(h, w, 3), dtype=np.uint8) + 255, gr.Accordion.update( visible=False ) self.canvas_create_button.click( fn=fn_canvas, inputs=[self.canvas_height, self.canvas_width], outputs=[self.image, self.create_canvas], show_progress=False, ) def register_img2img_same_input(self): def fn_same_checked(x): return [ gr.update(value=None), gr.update(value=None), gr.update(value=False, visible=x), ] + [gr.update(visible=x)] * 4 self.upload_independent_img_in_img2img.change( fn_same_checked, inputs=self.upload_independent_img_in_img2img, outputs=[ self.image, self.batch_image_dir, self.preprocessor_preview, self.image_upload_panel, self.trigger_preprocessor, self.loopback, self.resize_mode, ], show_progress=False, ) def register_shift_crop_input_image(self): # A1111 < 1.7.0 compatibility. if any(c is None for c in ControlNetUiGroup.a1111_context.img2img_inpaint_tabs): self.inpaint_crop_input_image.visible = True self.inpaint_crop_input_image.value = True return is_inpaint_tab = gr.State(False) def shift_crop_input_image(is_inpaint: bool, inpaint_area: int): # Note: inpaint_area (0: Whole picture, 1: Only masked) # By default set value to True, as most preprocessors need cropped result. return gr.update(value=True, visible=is_inpaint and inpaint_area == 1) gradio_kwargs = dict( fn=shift_crop_input_image, inputs=[ is_inpaint_tab, ControlNetUiGroup.a1111_context.img2img_inpaint_area, ], outputs=[self.inpaint_crop_input_image], show_progress=False, ) for elem in ControlNetUiGroup.a1111_context.img2img_inpaint_tabs: elem.select(fn=lambda: True, inputs=[], outputs=[is_inpaint_tab]).then( **gradio_kwargs ) for elem in ControlNetUiGroup.a1111_context.img2img_non_inpaint_tabs: elem.select(fn=lambda: False, inputs=[], outputs=[is_inpaint_tab]).then( **gradio_kwargs ) ControlNetUiGroup.a1111_context.img2img_inpaint_area.change(**gradio_kwargs) def register_shift_hr_options(self): # A1111 version < 1.6.0. if not ControlNetUiGroup.a1111_context.txt2img_enable_hr: return ControlNetUiGroup.a1111_context.txt2img_enable_hr.change( fn=lambda checked: gr.update(visible=checked), inputs=[ControlNetUiGroup.a1111_context.txt2img_enable_hr], outputs=[self.hr_option], show_progress=False, ) def register_shift_upload_mask(self): """Controls whether the upload mask input should be visible.""" self.mask_upload.change( fn=lambda checked: ( # Clear mask_image if unchecked. (gr.update(visible=False), gr.update(value=None)) if not checked else (gr.update(visible=True), gr.update()) ), inputs=[self.mask_upload], outputs=[self.mask_image_group, self.effective_region_mask], show_progress=False, ) def register_shift_pulid_mode(self): self.model.change( fn=lambda model: gr.update(visible="pulid" in model.lower()), inputs=[self.model], outputs=[self.pulid_mode], show_progress=False, ) def register_sync_batch_dir(self): def determine_batch_dir(batch_dir, fallback_dir, fallback_fallback_dir): if batch_dir: return batch_dir elif fallback_dir: return fallback_dir else: return fallback_fallback_dir batch_dirs = [ self.batch_image_dir, ControlNetUiGroup.global_batch_input_dir, ControlNetUiGroup.a1111_context.img2img_batch_input_dir, ] for batch_dir_comp in batch_dirs: subscriber = getattr(batch_dir_comp, "blur", None) if subscriber is None: continue subscriber( fn=determine_batch_dir, inputs=batch_dirs, outputs=[self.batch_image_dir_state], queue=False, ) ControlNetUiGroup.a1111_context.img2img_batch_output_dir.blur( fn=lambda a: a, inputs=[ControlNetUiGroup.a1111_context.img2img_batch_output_dir], outputs=[self.output_dir_state], queue=False, ) def register_clear_preview(self): def clear_preview(x): if x: logger.info("Preview as input is cancelled.") return gr.update(value=False), gr.update(value=None) for comp in ( self.pixel_perfect, self.module, self.image, self.processor_res, self.threshold_a, self.threshold_b, self.upload_independent_img_in_img2img, ): event_subscribers = [] if hasattr(comp, "edit"): event_subscribers.append(comp.edit) elif hasattr(comp, "click"): event_subscribers.append(comp.click) elif isinstance(comp, gr.Slider) and hasattr(comp, "release"): event_subscribers.append(comp.release) elif hasattr(comp, "change"): event_subscribers.append(comp.change) if hasattr(comp, "clear"): event_subscribers.append(comp.clear) for event_subscriber in event_subscribers: event_subscriber( fn=clear_preview, inputs=self.use_preview_as_input, outputs=[self.use_preview_as_input, self.generated_image], ) def register_multi_images_upload(self): """Register callbacks on merge tab multiple images upload.""" self.merge_clear_button.click( fn=lambda: [], inputs=[], outputs=[self.merge_gallery], ).then( fn=lambda x: gr.update(value=x + 1), inputs=[self.update_unit_counter], outputs=[self.update_unit_counter], ) def upload_file(files, current_files): return {file_d["name"] for file_d in current_files} | { file.name for file in files } self.merge_upload_button.upload( upload_file, inputs=[self.merge_upload_button, self.merge_gallery], outputs=[self.merge_gallery], queue=False, ).then( fn=lambda x: gr.update(value=x + 1), inputs=[self.update_unit_counter], outputs=[self.update_unit_counter], ) def register_core_callbacks(self): """Register core callbacks that only involves gradio components defined within this ui group.""" self.register_webcam_toggle() self.register_webcam_mirror_toggle() self.register_refresh_all_models() self.register_build_sliders() self.register_union_control_type() self.register_shift_preview() self.register_shift_upload_mask() self.register_shift_pulid_mode() self.register_create_canvas() self.register_clear_preview() self.register_multi_images_upload() self.openpose_editor.register_callbacks( self.generated_image, self.use_preview_as_input, self.model, ) assert self.type_filter is not None self.advanced_weight_control.register_callbacks( self.weight, self.advanced_weighting, self.type_filter, self.update_unit_counter, ) if self.is_img2img: self.register_img2img_same_input() def register_callbacks(self): """Register callbacks that involves A1111 context gradio components.""" # Prevent infinite recursion. if self.callbacks_registered: return self.callbacks_registered = True self.register_sd_version_changed() self.register_send_dimensions() self.register_run_annotator() self.register_sync_batch_dir() if self.is_img2img: self.register_shift_crop_input_image() else: self.register_shift_hr_options() @staticmethod def register_input_mode_sync(ui_groups: List["ControlNetUiGroup"]): """ - ui_group.input_mode should be updated when user switch tabs. - Loopback checkbox should only be visible if at least one ControlNet unit is set to batch mode. Argument: ui_groups: All ControlNetUiGroup instances defined in current Script context. Returns: None """ if not ui_groups: return for ui_group in ui_groups: batch_fn = lambda: InputMode.BATCH simple_fn = lambda: InputMode.SIMPLE merge_fn = lambda: InputMode.MERGE for input_tab, fn in ( (ui_group.upload_tab, simple_fn), (ui_group.batch_tab, batch_fn), (ui_group.merge_tab, merge_fn), ): # Sync input_mode. input_tab.select( fn=fn, inputs=[], outputs=[ui_group.input_mode], show_progress=False, ).then( # Update visibility of loopback checkbox. fn=lambda *mode_values: ( ( gr.update( visible=any(m == InputMode.BATCH for m in mode_values) ), ) * len(ui_groups) ), inputs=[g.input_mode for g in ui_groups], outputs=[g.loopback for g in ui_groups], show_progress=False, ) @staticmethod def reset(): ControlNetUiGroup.a1111_context = A1111Context() ControlNetUiGroup.all_ui_groups = [] @staticmethod def try_register_all_callbacks(): unit_count = shared.opts.data.get("control_net_unit_count", 3) all_unit_count = unit_count * 2 # txt2img + img2img. if ( # All A1111 components ControlNet units care about are all registered. ControlNetUiGroup.a1111_context.ui_initialized and all_unit_count == len(ControlNetUiGroup.all_ui_groups) and all( g.ui_initialized and (not g.callbacks_registered) for g in ControlNetUiGroup.all_ui_groups ) ): for ui_group in ControlNetUiGroup.all_ui_groups: ui_group.register_callbacks() ControlNetUiGroup.register_input_mode_sync( [g for g in ControlNetUiGroup.all_ui_groups if g.is_img2img] ) ControlNetUiGroup.register_input_mode_sync( [g for g in ControlNetUiGroup.all_ui_groups if not g.is_img2img] ) logger.info("ControlNet UI callback registered.") @staticmethod def on_after_component(component, **_kwargs): """Register the A1111 component.""" if getattr(component, "elem_id", None) == "img2img_batch_inpaint_mask_dir": ControlNetUiGroup.global_batch_input_dir.render() return ControlNetUiGroup.a1111_context.set_component(component) ControlNetUiGroup.try_register_all_callbacks() ================================================ FILE: scripts/controlnet_ui/modal.py ================================================ import gradio as gr from typing import List class ModalInterface(gr.Interface): modal_id_counter = 0 def __init__( self, html_content: str, open_button_text: str, open_button_classes: List[str] = [], open_button_extra_attrs: str = '' ): self.html_content = html_content self.open_button_text = open_button_text self.open_button_classes = open_button_classes self.open_button_extra_attrs = open_button_extra_attrs self.modal_id = ModalInterface.modal_id_counter ModalInterface.modal_id_counter += 1 def __call__(self): return self.create_modal() def create_modal(self, visible=True): html_code = f"""
×
{self.html_content}
{self.open_button_text}
""" return gr.HTML(value=html_code, visible=visible) ================================================ FILE: scripts/controlnet_ui/openpose_editor.py ================================================ import base64 import gradio as gr import json from typing import List, Dict, Any, Tuple from annotator.openpose import decode_json_as_poses, draw_poses from annotator.openpose.animalpose import draw_animalposes from scripts.controlnet_ui.modal import ModalInterface from modules import shared from scripts.logging import logger def parse_data_url(data_url: str): # Split the URL at the comma media_type, data = data_url.split(",", 1) # Check if the data is base64-encoded assert ";base64" in media_type # Decode the base64 data return base64.b64decode(data) def encode_data_url(json_string: str) -> str: base64_encoded_json = base64.b64encode(json_string.encode("utf-8")).decode("utf-8") return f"data:application/json;base64,{base64_encoded_json}" class OpenposeEditor(object): # Filename used when user click the download link. download_file = "pose.json" # URL the openpose editor is mounted on. editor_url = "/openpose_editor_index" def __init__(self) -> None: self.render_button = None self.pose_input = None self.download_link = None self.upload_link = None self.modal = None def render_edit(self): """Renders the buttons in preview image control button group.""" # The hidden button to trigger a re-render of generated image. self.render_button = gr.Button(visible=False, elem_classes=["cnet-render-pose"]) # The hidden element that stores the pose json for backend retrieval. # The front-end javascript will write the edited JSON data to the element. self.pose_input = gr.Textbox(visible=False, elem_classes=["cnet-pose-json"]) self.modal = ModalInterface( # Use about:blank here as placeholder so that the iframe does not # immediately navigate. Most of controlnet units do not need # openpose editor active. Only navigate when the user first click # 'Edit'. The navigation logic is in `openpose_editor.js`. '', open_button_text="Edit", open_button_classes=["cnet-edit-pose"], open_button_extra_attrs=f'title="Send pose to {OpenposeEditor.editor_url} for edit."', ).create_modal(visible=False) self.download_link = gr.HTML( value=f"""JSON""", visible=False, elem_classes=["cnet-download-pose"], ) def render_upload(self): """Renders the button in input image control button group.""" self.upload_link = gr.HTML( value=""" """, visible=False, elem_classes=["cnet-upload-pose"], ) def register_callbacks( self, generated_image: gr.Image, use_preview_as_input: gr.Checkbox, model: gr.Dropdown, ): def render_pose(pose_url: str) -> Tuple[Dict, Dict]: json_string = parse_data_url(pose_url).decode("utf-8") poses, animals, height, width = decode_json_as_poses( json.loads(json_string) ) logger.info("Preview as input is enabled.") return ( # Generated image. gr.update( value=( draw_poses( poses, height, width, draw_body=True, draw_hand=True, draw_face=True, ) if poses else draw_animalposes(animals, height, width) ), visible=True, ), # Use preview as input. gr.update(value=True), # Self content. *self.update(json_string), ) self.render_button.click( fn=render_pose, inputs=[self.pose_input], outputs=[generated_image, use_preview_as_input, *self.outputs()], ) def update_upload_link(model: str) -> Dict: return gr.update(visible="openpose" in model.lower()) model.change(fn=update_upload_link, inputs=[model], outputs=[self.upload_link]) def outputs(self) -> List[Any]: return [ self.download_link, self.modal, ] def update(self, json_string: str) -> List[Dict]: """ Called when there is a new JSON pose value generated by running preprocessor. Args: json_string: The new JSON string generated by preprocessor. Returns: An gr.update event. """ hint = "Download the pose as .json file" html = f""" JSON""" visible = json_string != "" return [ # Download link update. gr.update(value=html, visible=visible), # Modal update. gr.update( visible=visible and not shared.opts.data.get("controlnet_disable_openpose_edit", False) ), ] ================================================ FILE: scripts/controlnet_ui/photopea.py ================================================ import gradio as gr from scripts.controlnet_ui.modal import ModalInterface PHOTOPEA_LOGO = """ """ class Photopea(object): def __init__(self) -> None: self.modal = None self.triggers = [] self.render_editor() def render_editor(self): """Render the editor modal.""" with gr.Group(elem_classes=["cnet-photopea-edit"]): self.modal = ModalInterface( # Use about:blank here as placeholder so that the iframe does not # immediately navigate. Only navigate when the user first click # 'Edit'. The navigation logic is in `photopea.js`. """
""", open_button_text="Edit", open_button_classes=["cnet-photopea-main-trigger"], open_button_extra_attrs="hidden", ).create_modal(visible=True) def render_child_trigger(self): self.triggers.append( gr.HTML( f"""
Edit {PHOTOPEA_LOGO}
""" ) ) def attach_photopea_output(self, generated_image: gr.Image): """Called in ControlNetUiGroup to attach preprocessor preview image Gradio element as the photopea output. If the front-end directly change the img HTML element's src to reflect the edited image result from photopea, the backend won't be notified. In this method we let the front-end upload the result image an invisible gr.Image instance and mirrors the value to preprocessor preview gr.Image. This is because the generated image gr.Image instance is inferred to be an output image by Gradio and has no ability to accept image upload directly. Arguments: generated_image: preprocessor result Gradio Image output element. Returns: None """ output = gr.Image( visible=False, source="upload", type="numpy", elem_classes=["cnet-photopea-output"], ) output.upload( fn=lambda img: img, inputs=[output], outputs=[generated_image], ) ================================================ FILE: scripts/controlnet_version.py ================================================ version_flag = 'v1.1.455' # A smart trick to know if user has updated as well as if user has restarted terminal. # Note that in "controlnet.py" we do NOT use "importlib.reload" to reload this "controlnet_version.py" # This means if user did not completely restart terminal, the "version_flag" will be the previous version. # Then, if we get a screenshot from user, we will know that if that user has restarted the terminal. # And we will also know what version the user is using so that bug track becomes easier. ================================================ FILE: scripts/enums.py ================================================ from __future__ import annotations from enum import Enum from typing import List, NamedTuple from functools import lru_cache class UnetBlockType(Enum): INPUT = "input" OUTPUT = "output" MIDDLE = "middle" class TransformerID(NamedTuple): block_type: UnetBlockType # The id of the block the transformer is in. Not all blocks have cross attn. block_id: int # The index of transformer within the block. # A block can have multiple transformers in SDXL. block_index: int # The call index of transformer if in a single step of diffusion. transformer_index: int class TransformerIDResult(NamedTuple): input_ids: List[TransformerID] output_ids: List[TransformerID] middle_ids: List[TransformerID] def get(self, idx: int) -> TransformerID: return self.to_list()[idx] def to_list(self) -> List[TransformerID]: return sorted( self.input_ids + self.output_ids + self.middle_ids, key=lambda i: i.transformer_index, ) class StableDiffusionVersion(Enum): """The version family of stable diffusion model.""" UNKNOWN = 0 SD1x = 1 SD2x = 2 SDXL = 3 @staticmethod def detect_from_model_name(model_name: str) -> "StableDiffusionVersion": """Based on the model name provided, guess what stable diffusion version it is. This might not be accurate without actually inspect the file content. """ if any(f"sd{v}" in model_name.lower() for v in ("14", "15", "16")): return StableDiffusionVersion.SD1x if "sd21" in model_name or "2.1" in model_name: return StableDiffusionVersion.SD2x if "xl" in model_name.lower(): return StableDiffusionVersion.SDXL return StableDiffusionVersion.UNKNOWN def encoder_block_num(self) -> int: if self in ( StableDiffusionVersion.SD1x, StableDiffusionVersion.SD2x, StableDiffusionVersion.UNKNOWN, ): return 12 else: return 9 # SDXL def controlnet_layer_num(self) -> int: return self.encoder_block_num() + 1 @property def transformer_block_num(self) -> int: """Number of blocks that has cross attn transformers in unet.""" if self in ( StableDiffusionVersion.SD1x, StableDiffusionVersion.SD2x, StableDiffusionVersion.UNKNOWN, ): return 16 else: return 11 # SDXL @property @lru_cache(maxsize=None) def transformer_ids(self) -> List[TransformerID]: """id of blocks that have cross attention""" if self in ( StableDiffusionVersion.SD1x, StableDiffusionVersion.SD2x, StableDiffusionVersion.UNKNOWN, ): transformer_index = 0 input_ids = [] for block_id in [1, 2, 4, 5, 7, 8]: input_ids.append( TransformerID(UnetBlockType.INPUT, block_id, 0, transformer_index) ) transformer_index += 1 middle_id = TransformerID(UnetBlockType.MIDDLE, 0, 0, transformer_index) transformer_index += 1 output_ids = [] for block_id in [3, 4, 5, 6, 7, 8, 9, 10, 11]: input_ids.append( TransformerID(UnetBlockType.OUTPUT, block_id, 0, transformer_index) ) transformer_index += 1 return TransformerIDResult(input_ids, output_ids, [middle_id]) else: # SDXL transformer_index = 0 input_ids = [] for block_id in [4, 5, 7, 8]: block_indices = ( range(2) if block_id in [4, 5] else range(10) ) # transformer_depth for index in block_indices: input_ids.append( TransformerID( UnetBlockType.INPUT, block_id, index, transformer_index ) ) transformer_index += 1 middle_ids = [ TransformerID(UnetBlockType.MIDDLE, 0, index, transformer_index) for index in range(10) ] transformer_index += 1 output_ids = [] for block_id in range(6): block_indices = ( range(2) if block_id in [3, 4, 5] else range(10) ) # transformer_depth for index in block_indices: output_ids.append( TransformerID( UnetBlockType.OUTPUT, block_id, index, transformer_index ) ) transformer_index += 1 return TransformerIDResult(input_ids, output_ids, middle_ids) def is_compatible_with(self, other: "StableDiffusionVersion") -> bool: """Incompatible only when one of version is SDXL and other is not.""" return ( any(v == StableDiffusionVersion.UNKNOWN for v in [self, other]) or sum(v == StableDiffusionVersion.SDXL for v in [self, other]) != 1 ) class ControlModelType(Enum): """ The type of Control Models (supported or not). """ ControlNet = "ControlNet, Lvmin Zhang" T2I_Adapter = "T2I_Adapter, Chong Mou" T2I_StyleAdapter = "T2I_StyleAdapter, Chong Mou" T2I_CoAdapter = "T2I_CoAdapter, Chong Mou" MasaCtrl = "MasaCtrl, Mingdeng Cao" GLIGEN = "GLIGEN, Yuheng Li" AttentionInjection = "AttentionInjection, Lvmin Zhang" # A simple attention injection written by Lvmin StableSR = "StableSR, Jianyi Wang" PromptDiffusion = "PromptDiffusion, Zhendong Wang" ControlLoRA = "ControlLoRA, Wu Hecong" ReVision = "ReVision, Stability" IPAdapter = "IPAdapter, Hu Ye" Controlllite = "Controlllite, Kohya" InstantID = "InstantID, Qixun Wang" SparseCtrl = "SparseCtrl, Yuwei Guo" ControlNetUnion = "ControlNetUnion, xinsir6" @property def is_controlnet(self) -> bool: """Returns whether the control model should be treated as ControlNet.""" return self in ( ControlModelType.ControlNet, ControlModelType.ControlLoRA, ControlModelType.InstantID, ControlModelType.ControlNetUnion, ) @property def allow_context_sharing(self) -> bool: """Returns whether this control model type allows the same PlugableControlModel object map to multiple ControlNetUnit. Both IPAdapter and Controlllite have unit specific input (clip/image) stored on the model object during inference. Sharing the context means that the input set earlier gets lost. """ return self not in ( ControlModelType.IPAdapter, ControlModelType.Controlllite, ) @property def supports_effective_region_mask(self) -> bool: return ( self in { ControlModelType.IPAdapter, ControlModelType.T2I_Adapter, } or self.is_controlnet ) # Written by Lvmin class AutoMachine(Enum): """ Lvmin's algorithm for Attention/AdaIn AutoMachine States. """ Read = "Read" Write = "Write" StyleAlign = "StyleAlign" class HiResFixOption(Enum): BOTH = "Both" LOW_RES_ONLY = "Low res only" HIGH_RES_ONLY = "High res only" class InputMode(Enum): # Single image to a single ControlNet unit. SIMPLE = "simple" # Input is a directory. N generations. Each generation takes 1 input image # from the directory. BATCH = "batch" # Input is a directory. 1 generation. Each generation takes N input image # from the directory. MERGE = "merge" class PuLIDMode(Enum): FIDELITY = "Fidelity" STYLE = "Extremely style" class ControlMode(Enum): """ The improved guess mode. """ BALANCED = "Balanced" PROMPT = "My prompt is more important" CONTROL = "ControlNet is more important" class BatchOption(Enum): DEFAULT = "All ControlNet units for all images in a batch" SEPARATE = "Each ControlNet unit for each image in a batch" class ResizeMode(Enum): """ Resize modes for ControlNet input images. """ RESIZE = "Just Resize" INNER_FIT = "Crop and Resize" OUTER_FIT = "Resize and Fill" def int_value(self): if self == ResizeMode.RESIZE: return 0 elif self == ResizeMode.INNER_FIT: return 1 elif self == ResizeMode.OUTER_FIT: return 2 assert False, "NOTREACHED" class ControlNetUnionControlType(Enum): """ ControlNet control type for ControlNet union model. https://github.com/xinsir6/ControlNetPlus/tree/main """ OPENPOSE = "OpenPose" DEPTH = "Depth" # hed/pidi/scribble/ted SOFT_EDGE = "Soft Edge" # canny/lineart/anime_lineart/mlsd HARD_EDGE = "Hard Edge" NORMAL_MAP = "Normal Map" SEGMENTATION = "Segmentation" TILE = "Tile" INPAINT = "Inpaint" UNKNOWN = "Unknown" @staticmethod def all_tags() -> List[str]: """ Tags can be handled by union ControlNet """ return [ "openpose", "depth", "softedge", "scribble", "canny", "lineart", "mlsd", "normalmap", "segmentation", "inpaint", "tile", ] @staticmethod def from_str(s: str) -> ControlNetUnionControlType: s = s.lower() if s == "openpose": return ControlNetUnionControlType.OPENPOSE elif s == "depth": return ControlNetUnionControlType.DEPTH elif s in ["scribble", "softedge"]: return ControlNetUnionControlType.SOFT_EDGE elif s in ["canny", "lineart", "mlsd"]: return ControlNetUnionControlType.HARD_EDGE elif s == "normalmap": return ControlNetUnionControlType.NORMAL_MAP elif s == "segmentation": return ControlNetUnionControlType.SEGMENTATION elif s in ["tile", "blur"]: return ControlNetUnionControlType.TILE elif s == "inpaint": return ControlNetUnionControlType.INPAINT return ControlNetUnionControlType.UNKNOWN def int_value(self) -> int: if self == ControlNetUnionControlType.UNKNOWN: raise ValueError("Unknown control type cannot be encoded.") return list(ControlNetUnionControlType).index(self) ================================================ FILE: scripts/external_code.py ================================================ from internal_controlnet.external_code import * # noqa: F403 ================================================ FILE: scripts/global_state.py ================================================ import os.path import stat from collections import OrderedDict from modules import shared, scripts, sd_models from modules.paths import models_path from scripts.enums import StableDiffusionVersion from scripts.supported_preprocessor import Preprocessor from typing import Dict, Tuple, List CN_MODEL_EXTS = [".pt", ".pth", ".ckpt", ".safetensors", ".bin"] cn_models_dir = os.path.join(models_path, "ControlNet") cn_models_dir_old = os.path.join(scripts.basedir(), "models") cn_models = OrderedDict() # "My_Lora(abcd1234)" -> C:/path/to/model.safetensors cn_models_names = {} # "my_lora" -> "My_Lora(abcd1234)" default_detectedmap_dir = os.path.join("detected_maps") script_dir = scripts.basedir() os.makedirs(cn_models_dir, exist_ok=True) def traverse_all_files(curr_path, model_list): f_list = [ (os.path.join(curr_path, entry.name), entry.stat()) for entry in os.scandir(curr_path) if os.path.isdir(curr_path) ] for f_info in f_list: fname, fstat = f_info if os.path.splitext(fname)[1] in CN_MODEL_EXTS: model_list.append(f_info) elif stat.S_ISDIR(fstat.st_mode): model_list = traverse_all_files(fname, model_list) return model_list def get_all_models(sort_by, filter_by, path): res = OrderedDict() fileinfos = traverse_all_files(path, []) filter_by = filter_by.strip(" ") if len(filter_by) != 0: fileinfos = [x for x in fileinfos if filter_by.lower() in os.path.basename(x[0]).lower()] if sort_by == "name": fileinfos = sorted(fileinfos, key=lambda x: os.path.basename(x[0])) elif sort_by == "date": fileinfos = sorted(fileinfos, key=lambda x: -x[1].st_mtime) elif sort_by == "path name": fileinfos = sorted(fileinfos) for finfo in fileinfos: filename = finfo[0] name = os.path.splitext(os.path.basename(filename))[0] # Prevent a hypothetical "None.pt" from being listed. if name != "None": res[name + f" [{sd_models.model_hash(filename)}]"] = filename return res def update_cn_models(): cn_models.clear() ext_dirs = (shared.opts.data.get("control_net_models_path", None), getattr(shared.cmd_opts, 'controlnet_dir', None)) extra_lora_paths = (extra_lora_path for extra_lora_path in ext_dirs if extra_lora_path is not None and os.path.exists(extra_lora_path)) paths = [cn_models_dir, cn_models_dir_old, *extra_lora_paths] for path in paths: sort_by = shared.opts.data.get( "control_net_models_sort_models_by", "name") filter_by = shared.opts.data.get("control_net_models_name_filter", "") found = get_all_models(sort_by, filter_by, path) cn_models.update({**found, **cn_models}) # insert "None" at the beginning of `cn_models` in-place cn_models_copy = OrderedDict(cn_models) cn_models.clear() cn_models.update({**{"None": None}, **cn_models_copy}) cn_models_names.clear() for name_and_hash, filename in cn_models.items(): if filename is None: continue name = os.path.splitext(os.path.basename(filename))[0].lower() cn_models_names[name] = name_and_hash def get_sd_version() -> StableDiffusionVersion: if hasattr(shared.sd_model, 'is_sdxl'): if shared.sd_model.is_sdxl: return StableDiffusionVersion.SDXL elif shared.sd_model.is_sd2: return StableDiffusionVersion.SD2x elif shared.sd_model.is_sd1: return StableDiffusionVersion.SD1x else: return StableDiffusionVersion.UNKNOWN # backward compability for webui < 1.5.0 else: if hasattr(shared.sd_model, 'conditioner'): return StableDiffusionVersion.SDXL elif hasattr(shared.sd_model.cond_stage_model, 'model'): return StableDiffusionVersion.SD2x else: return StableDiffusionVersion.SD1x def select_control_type( control_type: str, sd_version: StableDiffusionVersion = StableDiffusionVersion.UNKNOWN, cn_models: Dict = cn_models, # Override or testing ) -> Tuple[List[str], List[str], str, str]: pattern = control_type.lower() all_models = list(cn_models.keys()) if pattern == "all": return [ [p.label for p in Preprocessor.get_sorted_preprocessors()], all_models, 'none', #default option "None" #default model ] filtered_model_list = [ model for model in all_models if model.lower() == "none" or (( pattern in model.lower() or any(a in model.lower() for a in Preprocessor.tag_to_filters(control_type)) ) and ( sd_version.is_compatible_with(StableDiffusionVersion.detect_from_model_name(model)) )) ] assert len(filtered_model_list) > 0, "'None' model should always be available." if len(filtered_model_list) == 1: default_model = "None" else: default_model = filtered_model_list[1] for x in filtered_model_list: if "11" in x.split("[")[0]: default_model = x break return ( [p.label for p in Preprocessor.get_filtered_preprocessors(control_type)], filtered_model_list, Preprocessor.get_default_preprocessor(control_type).label, default_model ) ================================================ FILE: scripts/hook.py ================================================ import torch import hashlib import numpy as np import torch.nn as nn from functools import partial from typing import Optional, Any, List from scripts.logging import logger from scripts.enums import ( ControlModelType, AutoMachine, HiResFixOption, ControlNetUnionControlType, ) from scripts.ipadapter.ipadapter_model import ImageEmbed from scripts.controlnet_sparsectrl import SparseCtrl from modules import devices, lowvram, shared, scripts from ldm.modules.diffusionmodules.util import timestep_embedding, make_beta_schedule from ldm.modules.diffusionmodules.openaimodel import UNetModel from ldm.modules.attention import BasicTransformerBlock from ldm.models.diffusion.ddpm import extract_into_tensor from modules.prompt_parser import MulticondLearnedConditioning, ComposableScheduledPromptConditioning, ScheduledPromptConditioning from modules.processing import StableDiffusionProcessing try: from sgm.modules.attention import BasicTransformerBlock as BasicTransformerBlockSGM except ImportError: print('Warning: ControlNet failed to load SGM - will use LDM instead.') BasicTransformerBlockSGM = BasicTransformerBlock cond_cast_unet = getattr(devices, 'cond_cast_unet', lambda x: x) POSITIVE_MARK_TOKEN = 1024 NEGATIVE_MARK_TOKEN = - POSITIVE_MARK_TOKEN MARK_EPS = 1e-3 def prompt_context_is_marked(x): t = x[..., 0, :] m = torch.abs(t) - POSITIVE_MARK_TOKEN m = torch.mean(torch.abs(m)).detach().cpu().float().numpy() return float(m) < MARK_EPS def mark_prompt_context(x, positive): if isinstance(x, list): for i in range(len(x)): x[i] = mark_prompt_context(x[i], positive) return x if isinstance(x, MulticondLearnedConditioning): x.batch = mark_prompt_context(x.batch, positive) return x if isinstance(x, ComposableScheduledPromptConditioning): x.schedules = mark_prompt_context(x.schedules, positive) return x if isinstance(x, ScheduledPromptConditioning): if isinstance(x.cond, dict): cond = x.cond['crossattn'] if prompt_context_is_marked(cond): return x mark = POSITIVE_MARK_TOKEN if positive else NEGATIVE_MARK_TOKEN cond = torch.cat([torch.zeros_like(cond)[:1] + mark, cond], dim=0) return ScheduledPromptConditioning(end_at_step=x.end_at_step, cond=dict(crossattn=cond, vector=x.cond['vector'])) else: cond = x.cond if prompt_context_is_marked(cond): return x mark = POSITIVE_MARK_TOKEN if positive else NEGATIVE_MARK_TOKEN cond = torch.cat([torch.zeros_like(cond)[:1] + mark, cond], dim=0) return ScheduledPromptConditioning(end_at_step=x.end_at_step, cond=cond) return x disable_controlnet_prompt_warning = True # You can disable this warning using disable_controlnet_prompt_warning. def unmark_prompt_context(x): if not prompt_context_is_marked(x): # ControlNet must know whether a prompt is conditional prompt (positive prompt) or unconditional conditioning prompt (negative prompt). # You can use the hook.py's `mark_prompt_context` to mark the prompts that will be seen by ControlNet. # Let us say XXX is a MulticondLearnedConditioning or a ComposableScheduledPromptConditioning or a ScheduledPromptConditioning or a list of these components, # if XXX is a positive prompt, you should call mark_prompt_context(XXX, positive=True) # if XXX is a negative prompt, you should call mark_prompt_context(XXX, positive=False) # After you mark the prompts, the ControlNet will know which prompt is cond/uncond and works as expected. # After you mark the prompts, the mismatch errors will disappear. if not disable_controlnet_prompt_warning: logger.warning('ControlNet Error: Failed to detect whether an instance is cond or uncond!') logger.warning('ControlNet Error: This is mainly because other extension(s) blocked A1111\'s \"process.sample()\" and deleted ControlNet\'s sample function.') logger.warning('ControlNet Error: ControlNet will shift to a backup backend but the results will be worse than expectation.') logger.warning('Solution (For extension developers): Take a look at ControlNet\' hook.py ' 'UnetHook.hook.process_sample and manually call mark_prompt_context to mark cond/uncond prompts.') mark_batch = torch.ones(size=(x.shape[0], 1, 1, 1), dtype=x.dtype, device=x.device) context = x return mark_batch, [], [], context mark = x[:, 0, :] context = x[:, 1:, :] mark = torch.mean(torch.abs(mark - NEGATIVE_MARK_TOKEN), dim=1) mark = (mark > MARK_EPS).float() mark_batch = mark[:, None, None, None].to(x.dtype).to(x.device) mark = mark.detach().cpu().numpy().tolist() uc_indices = [i for i, item in enumerate(mark) if item < 0.5] c_indices = [i for i, item in enumerate(mark) if not item < 0.5] StableDiffusionProcessing.cached_c = [None, None] StableDiffusionProcessing.cached_uc = [None, None] return mark_batch, uc_indices, c_indices, context class HackedImageRNG: def __init__(self, rng, noise_modifier, sd_model): self.rng = rng self.noise_modifier = noise_modifier self.sd_model = sd_model def next(self): result = self.rng.next() x0 = self.noise_modifier if result.shape[2] != x0.shape[2] or result.shape[3] != x0.shape[3]: return result x0 = x0.to(result.dtype).to(result.device) ts = torch.tensor([999] * result.shape[0]).long().to(result.device) result = predict_q_sample(self.sd_model, x0, ts, result) logger.info(f'[ControlNet] Initial noise hack applied to {result.shape}.') return result class TorchHijackForUnet: """ This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match; this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64 """ def __getattr__(self, item): if item == 'cat': return self.cat if hasattr(torch, item): return getattr(torch, item) raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item)) def cat(self, tensors, *args, **kwargs): if len(tensors) == 2: a, b = tensors if a.shape[-2:] != b.shape[-2:]: a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest") tensors = (a, b) return torch.cat(tensors, *args, **kwargs) th = TorchHijackForUnet() class ControlParams: def __init__( self, control_model, preprocessor, hint_cond, weight, guidance_stopped, start_guidance_percent, stop_guidance_percent, advanced_weighting, control_model_type, hr_hint_cond, global_average_pooling, soft_injection, cfg_injection, hr_option: HiResFixOption = HiResFixOption.BOTH, control_context_override: Optional[Any] = None, effective_region_mask: Optional[torch.Tensor] = None, union_control_types: List[ControlNetUnionControlType] = None, **kwargs # To avoid errors ): self.control_model = control_model self.preprocessor = preprocessor self._hint_cond = hint_cond self.weight = weight self.guidance_stopped = guidance_stopped self.start_guidance_percent = start_guidance_percent self.stop_guidance_percent = stop_guidance_percent self.advanced_weighting = advanced_weighting self.control_model_type = control_model_type self.global_average_pooling = global_average_pooling self.hr_hint_cond = hr_hint_cond self.hr_option = hr_option self.control_context_override = control_context_override self.effective_region_mask = effective_region_mask self.union_control_types = union_control_types or [] self.used_hint_cond = None self.used_hint_cond_latent = None self.used_hint_inpaint_hijack = None self.soft_injection = soft_injection self.cfg_injection = cfg_injection self.vision_hint_count = None @property def hint_cond(self): return self._hint_cond # fix for all the extensions that modify hint_cond, # by forcing used_hint_cond to update on the next timestep # hr_hint_cond can stay the same, since most extensions dont modify the hires pass # but if they do, it will cause problems @hint_cond.setter def hint_cond(self, new_hint_cond): self._hint_cond = new_hint_cond self.used_hint_cond = None self.used_hint_cond_latent = None self.used_hint_inpaint_hijack = None def disabled_by_hr_option(self, is_in_high_res_fix: bool) -> bool: if self.hr_option == HiResFixOption.BOTH: control_disabled = False elif self.hr_option == HiResFixOption.LOW_RES_ONLY: control_disabled = is_in_high_res_fix elif self.hr_option == HiResFixOption.HIGH_RES_ONLY: control_disabled = not is_in_high_res_fix else: assert False, "NOTREACHED" return control_disabled def apply_effective_region_mask(self, out: torch.Tensor) -> torch.Tensor: if self.effective_region_mask is None: return out B, C, H, W = out.shape mask = torch.nn.functional.interpolate( self.effective_region_mask.to(out.device), size=(H, W), mode="bilinear", ) return out * mask def aligned_adding(base, x, require_channel_alignment): if isinstance(x, float): if x == 0.0: return base return base + x if require_channel_alignment: zeros = torch.zeros_like(base) zeros[:, :x.shape[1], ...] = x x = zeros # resize to sample resolution base_h, base_w = base.shape[-2:] xh, xw = x.shape[-2:] if xh > 1 or xw > 1: if base_h != xh or base_w != xw: # logger.info('[Warning] ControlNet finds unexpected mis-alignment in tensor shape.') x = th.nn.functional.interpolate(x, size=(base_h, base_w), mode="nearest") return base + x # DFS Search for Torch.nn.Module, Written by Lvmin def torch_dfs(model: torch.nn.Module): result = [model] for child in model.children(): result += torch_dfs(child) return result class AbstractLowScaleModel(nn.Module): def __init__(self): super(AbstractLowScaleModel, self).__init__() self.register_schedule() def register_schedule(self, beta_schedule="linear", timesteps=1000, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s) alphas = 1. - betas alphas_cumprod = np.cumprod(alphas, axis=0) alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) timesteps, = betas.shape self.num_timesteps = int(timesteps) self.linear_start = linear_start self.linear_end = linear_end assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep' to_torch = partial(torch.tensor, dtype=torch.float32) self.register_buffer('betas', to_torch(betas)) self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) # calculations for diffusion q(x_t | x_{t-1}) and others self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) def q_sample(self, x_start, t, noise=None): if noise is None: noise = torch.randn_like(x_start) return (extract_into_tensor(self.sqrt_alphas_cumprod.to(x_start), t, x_start.shape) * x_start + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod.to(x_start), t, x_start.shape) * noise) def register_schedule(self): linear_start = 0.00085 linear_end = 0.0120 num_timesteps = 1000 betas = (torch.linspace(linear_start ** 0.5, linear_end ** 0.5, num_timesteps, dtype=torch.float64) ** 2.0).numpy() alphas = 1. - betas alphas_cumprod = np.cumprod(alphas, axis=0) alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) to_torch = partial(torch.tensor, dtype=torch.float32) setattr(self, 'betas', to_torch(betas)) # setattr(self, 'alphas_cumprod', to_torch(alphas_cumprod)) # a1111 already has this setattr(self, 'alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) setattr(self, 'sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) setattr(self, 'sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) setattr(self, 'log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) setattr(self, 'sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) setattr(self, 'sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) def predict_q_sample(ldm, x_start, t, noise=None): if noise is None: noise = torch.randn_like(x_start) return extract_into_tensor(ldm.sqrt_alphas_cumprod.to(x_start), t, x_start.shape) * x_start + extract_into_tensor(ldm.sqrt_one_minus_alphas_cumprod.to(x_start), t, x_start.shape) * noise def predict_start_from_noise(ldm, x_t, t, noise): return extract_into_tensor(ldm.sqrt_recip_alphas_cumprod.to(x_t), t, x_t.shape) * x_t - extract_into_tensor(ldm.sqrt_recipm1_alphas_cumprod.to(x_t), t, x_t.shape) * noise def predict_noise_from_start(ldm, x_t, t, x0): return (extract_into_tensor(ldm.sqrt_recip_alphas_cumprod.to(x_t), t, x_t.shape) * x_t - x0) / extract_into_tensor(ldm.sqrt_recipm1_alphas_cumprod.to(x_t), t, x_t.shape) def blur(x, k): y = torch.nn.functional.pad(x, (k, k, k, k), mode='replicate') y = torch.nn.functional.avg_pool2d(y, (k*2+1, k*2+1), stride=(1, 1)) return y class TorchCache: def __init__(self): self.cache = {} def hash(self, key): v = key.detach().cpu().numpy().astype(np.float32) v = (v * 1000.0).astype(np.int32) v = np.ascontiguousarray(v.copy()) sha = hashlib.sha1(v).hexdigest() return sha def get(self, key): key = self.hash(key) return self.cache.get(key, None) def set(self, key, value): self.cache[self.hash(key)] = value class UnetHook(nn.Module): def __init__(self, lowvram=False) -> None: super().__init__() self.lowvram = lowvram self.model = None self.sd_ldm = None self.control_params = None self.attention_auto_machine = AutoMachine.Read self.attention_auto_machine_weight = 1.0 self.gn_auto_machine = AutoMachine.Read self.gn_auto_machine_weight = 1.0 self.current_style_fidelity = 0.0 self.current_uc_indices = [] self.current_c_indices = [] self.is_in_high_res_fix = False @staticmethod def call_vae_using_process(p, x, batch_size=None, mask=None): vae_cache = getattr(p, 'controlnet_vae_cache', None) if vae_cache is None: vae_cache = TorchCache() setattr(p, 'controlnet_vae_cache', vae_cache) try: if x.shape[1] > 3: x = x[:, 0:3, :, :] x = x * 2.0 - 1.0 if mask is not None: x = x * (1.0 - mask) x = x.type(devices.dtype_vae) vae_output = vae_cache.get(x) if vae_output is None: with devices.autocast(): vae_output = torch.stack([ p.sd_model.get_first_stage_encoding( p.sd_model.encode_first_stage(torch.unsqueeze(img, 0).to(device=devices.device)) )[0].to(img.device) for img in x ]) if torch.all(torch.isnan(vae_output)).item(): logger.info('ControlNet find Nans in the VAE encoding. \n ' 'Now ControlNet will automatically retry.\n ' 'To always start with 32-bit VAE, use --no-half-vae commandline flag.') devices.dtype_vae = torch.float32 x = x.to(devices.dtype_vae) p.sd_model.first_stage_model.to(devices.dtype_vae) vae_output = torch.stack([ p.sd_model.get_first_stage_encoding( p.sd_model.encode_first_stage(torch.unsqueeze(img, 0).to(device=devices.device)) )[0].to(img.device) for img in x ]) vae_cache.set(x, vae_output) logger.info(f'ControlNet used {str(devices.dtype_vae)} VAE to encode {vae_output.shape}.') latent = vae_output if batch_size is not None and latent.shape[0] != batch_size: latent = torch.cat([latent.clone() for _ in range(batch_size)], dim=0) latent = latent.type(devices.dtype_unet) return latent except Exception as e: logger.error(e) raise ValueError('ControlNet failed to use VAE. Please try to add `--no-half-vae`, `--no-half` and remove `--precision full` in launch cmd.') def guidance_schedule_handler(self, x): if not self.control_params: return for param in self.control_params: current_sampling_percent = (x.sampling_step / x.total_sampling_steps) param.guidance_stopped = current_sampling_percent < param.start_guidance_percent or current_sampling_percent > param.stop_guidance_percent if self.model is not None: self.model.current_sampling_percent = current_sampling_percent def hook(self, model, sd_ldm, control_params: List[ControlParams], process, batch_option_uint_separate=False, batch_option_style_align=False): self.model = model self.sd_ldm = sd_ldm self.control_params = control_params model_is_sdxl = getattr(self.sd_ldm, 'is_sdxl', False) outer = self def process_sample(*args, **kwargs): # ControlNet must know whether a prompt is conditional prompt (positive prompt) or unconditional conditioning prompt (negative prompt). # You can use the hook.py's `mark_prompt_context` to mark the prompts that will be seen by ControlNet. # Let us say XXX is a MulticondLearnedConditioning or a ComposableScheduledPromptConditioning or a ScheduledPromptConditioning or a list of these components, # if XXX is a positive prompt, you should call mark_prompt_context(XXX, positive=True) # if XXX is a negative prompt, you should call mark_prompt_context(XXX, positive=False) # After you mark the prompts, the ControlNet will know which prompt is cond/uncond and works as expected. # After you mark the prompts, the mismatch errors will disappear. mark_prompt_context(kwargs.get('conditioning', []), positive=True) mark_prompt_context(kwargs.get('unconditional_conditioning', []), positive=False) mark_prompt_context(getattr(process, 'hr_c', []), positive=True) mark_prompt_context(getattr(process, 'hr_uc', []), positive=False) return process.sample_before_CN_hack(*args, **kwargs) def forward(self, x, timesteps=None, context=None, y=None, **kwargs): is_sdxl = y is not None and model_is_sdxl total_t2i_adapter_embedding = [0.0] * 4 if is_sdxl: total_controlnet_embedding = [0.0] * 10 else: total_controlnet_embedding = [0.0] * 13 require_inpaint_hijack = False is_in_high_res_fix = False batch_size = int(x.shape[0]) # Handle cond-uncond marker cond_mark, outer.current_uc_indices, outer.current_c_indices, context = unmark_prompt_context(context) outer.model.cond_mark = cond_mark # logger.info(str(cond_mark[:, 0, 0, 0].detach().cpu().numpy().tolist()) + ' - ' + str(outer.current_uc_indices)) # Revision if is_sdxl: revision_y1280 = 0 for param in outer.control_params: if param.guidance_stopped: continue if param.control_model_type == ControlModelType.ReVision: if param.vision_hint_count is None: k = torch.Tensor([int(param.preprocessor['threshold_a'] * 1000)]).to(param.hint_cond).long().clip(0, 999) param.vision_hint_count = outer.revision_q_sampler.q_sample(param.hint_cond, k) revision_emb = param.vision_hint_count if isinstance(revision_emb, torch.Tensor): revision_y1280 += revision_emb * param.weight if isinstance(revision_y1280, torch.Tensor): y[:, :1280] = revision_y1280 * cond_mark[:, :, 0, 0] if any('ignore_prompt' in param.preprocessor['name'] for param in outer.control_params) \ or (getattr(process, 'prompt', '') == '' and getattr(process, 'negative_prompt', '') == ''): context = torch.zeros_like(context) # High-res fix for param in outer.control_params: # select which hint_cond to use if param.used_hint_cond is None: param.used_hint_cond = param.hint_cond param.used_hint_cond_latent = None param.used_hint_inpaint_hijack = None # has high-res fix if isinstance(param.hr_hint_cond, torch.Tensor) and x.ndim == 4 and param.hint_cond.ndim == 4 and param.hr_hint_cond.ndim == 4: _, _, h_lr, w_lr = param.hint_cond.shape _, _, h_hr, w_hr = param.hr_hint_cond.shape _, _, h, w = x.shape h, w = h * 8, w * 8 if abs(h - h_lr) < abs(h - h_hr): is_in_high_res_fix = False if param.used_hint_cond is not param.hint_cond: param.used_hint_cond = param.hint_cond param.used_hint_cond_latent = None param.used_hint_inpaint_hijack = None else: is_in_high_res_fix = True if param.used_hint_cond is not param.hr_hint_cond: param.used_hint_cond = param.hr_hint_cond param.used_hint_cond_latent = None param.used_hint_inpaint_hijack = None self.is_in_high_res_fix = is_in_high_res_fix outer.is_in_high_res_fix = is_in_high_res_fix # Convert control image to latent for param in outer.control_params: if param.used_hint_cond_latent is not None: continue if param.control_model_type not in [ControlModelType.AttentionInjection] \ and 'colorfix' not in param.preprocessor['name'] \ and 'inpaint_only' not in param.preprocessor['name']: continue param.used_hint_cond_latent = outer.call_vae_using_process(process, param.used_hint_cond, batch_size=batch_size) # vram for param in outer.control_params: if getattr(param.control_model, 'disable_memory_management', False): continue if param.control_model is not None: if outer.lowvram and is_sdxl and hasattr(param.control_model, 'aggressive_lowvram'): param.control_model.aggressive_lowvram() elif hasattr(param.control_model, 'fullvram'): param.control_model.fullvram() elif hasattr(param.control_model, 'to'): param.control_model.to(devices.get_device_for("controlnet")) # handle prompt token control for param in outer.control_params: if param.guidance_stopped or param.disabled_by_hr_option(self.is_in_high_res_fix): continue if param.control_model_type not in [ControlModelType.T2I_StyleAdapter]: continue control = param.control_model(x=x, hint=param.used_hint_cond, timesteps=timesteps, context=context) control = torch.cat([control.clone() for _ in range(batch_size)], dim=0) control *= param.weight control *= cond_mark[:, :, :, 0] context = torch.cat([context, control.clone()], dim=1) # handle ControlNet / T2I_Adapter for param_index, param in enumerate(outer.control_params): if param.guidance_stopped or param.disabled_by_hr_option(self.is_in_high_res_fix): continue if not ( param.control_model_type.is_controlnet or param.control_model_type == ControlModelType.T2I_Adapter ): continue # inpaint model workaround x_in = x control_model = param.control_model.control_model if param.control_model_type.is_controlnet: if x.shape[1] != control_model.input_blocks[0][0].in_channels and x.shape[1] == 9: # inpaint_model: 4 data + 4 downscaled image + 1 mask x_in = x[:, :4, ...] require_inpaint_hijack = True assert param.used_hint_cond is not None, "Controlnet is enabled but no input image is given" hint = param.used_hint_cond if param.control_model_type == ControlModelType.InstantID: assert isinstance(param.control_context_override, ImageEmbed) controlnet_context = param.control_context_override.eval(cond_mark).to(x.device, dtype=x.dtype) else: controlnet_context = context # ControlNet inpaint protocol if hint.shape[1] == 4 and not isinstance(control_model, SparseCtrl): c = hint[:, 0:3, :, :] m = hint[:, 3:4, :, :] m = (m > 0.5).float() hint = c * (1 - m) - m control = param.control_model( x=x_in, hint=hint, timesteps=timesteps, context=controlnet_context, y=y, control_type=( [ t.int_value() for t in param.union_control_types if t != ControlNetUnionControlType.UNKNOWN ] if param.control_model_type == ControlModelType.ControlNetUnion else None ), ) if is_sdxl: control_scales = [param.weight] * 10 else: control_scales = [param.weight] * 13 if param.cfg_injection or param.global_average_pooling: if param.control_model_type == ControlModelType.T2I_Adapter: control = [torch.cat([c.clone() for _ in range(batch_size)], dim=0) for c in control] control = [c * cond_mark for c in control] high_res_fix_forced_soft_injection = False if is_in_high_res_fix: if 'canny' in param.preprocessor['name']: high_res_fix_forced_soft_injection = True if 'mlsd' in param.preprocessor['name']: high_res_fix_forced_soft_injection = True if param.soft_injection or high_res_fix_forced_soft_injection: # important! use the soft weights with high-res fix can significantly reduce artifacts. if param.control_model_type == ControlModelType.T2I_Adapter: control_scales = [param.weight * x for x in (0.25, 0.62, 0.825, 1.0)] elif param.control_model_type.is_controlnet: control_scales = [param.weight * (0.825 ** float(12 - i)) for i in range(13)] if is_sdxl and param.control_model_type.is_controlnet: control_scales = control_scales[:10] if param.advanced_weighting is not None: logger.info(f"Advanced weighting enabled. {param.advanced_weighting}") if param.soft_injection or high_res_fix_forced_soft_injection: logger.warn("Advanced weighting overwrites soft_injection effect.") control_scales = param.advanced_weighting control = [ param.apply_effective_region_mask(c * scale) for c, scale in zip(control, control_scales) ] if param.global_average_pooling: control = [torch.mean(c, dim=(2, 3), keepdim=True) for c in control] for idx, item in enumerate(control): target = None if param.control_model_type.is_controlnet: target = total_controlnet_embedding if param.control_model_type == ControlModelType.T2I_Adapter: target = total_t2i_adapter_embedding if target is not None: if batch_option_uint_separate: for pi, ci in enumerate(outer.current_c_indices): if pi % len(outer.control_params) != param_index: item[ci] = 0 for pi, ci in enumerate(outer.current_uc_indices): if pi % len(outer.control_params) != param_index: item[ci] = 0 target[idx] = item + target[idx] else: target[idx] = item + target[idx] # Replace x_t to support inpaint models for param in outer.control_params: if not isinstance(param.used_hint_cond, torch.Tensor): continue if param.used_hint_cond.ndim < 2 or param.used_hint_cond.shape[1] != 4: continue if x.shape[1] != 9: continue if param.used_hint_inpaint_hijack is None: mask_pixel = param.used_hint_cond[:, 3:4, :, :] image_pixel = param.used_hint_cond[:, 0:3, :, :] mask_pixel = (mask_pixel > 0.5).to(mask_pixel.dtype) masked_latent = outer.call_vae_using_process(process, image_pixel, batch_size, mask=mask_pixel) mask_latent = torch.nn.functional.max_pool2d(mask_pixel, (8, 8)) if mask_latent.shape[0] != batch_size: mask_latent = torch.cat([mask_latent.clone() for _ in range(batch_size)], dim=0) param.used_hint_inpaint_hijack = torch.cat([mask_latent, masked_latent], dim=1) param.used_hint_inpaint_hijack.to(x.dtype).to(x.device) x = torch.cat([x[:, :4, :, :], param.used_hint_inpaint_hijack], dim=1) # vram for param in outer.control_params: if param.control_model is not None: if outer.lowvram: param.control_model.to('cpu') # A1111 fix for medvram. if shared.cmd_opts.medvram or (getattr(shared.cmd_opts, 'medvram_sdxl', False) and is_sdxl): try: # Trigger the register_forward_pre_hook outer.sd_ldm.model() except Exception as e: logger.debug("register_forward_pre_hook") logger.debug(e) # Clear attention and AdaIn cache for module in outer.attn_module_list: module.bank = [] module.style_cfgs = [] for module in outer.gn_module_list: module.mean_bank = [] module.var_bank = [] module.style_cfgs = [] # Handle attention and AdaIn control for param in outer.control_params: if param.guidance_stopped or param.disabled_by_hr_option(self.is_in_high_res_fix): continue if param.used_hint_cond_latent is None: continue if param.control_model_type not in [ControlModelType.AttentionInjection]: continue ref_xt = predict_q_sample(outer.sd_ldm, param.used_hint_cond_latent, torch.round(timesteps.float()).long()) # Inpaint Hijack if x.shape[1] == 9: ref_xt = torch.cat([ ref_xt, torch.zeros_like(ref_xt)[:, 0:1, :, :], param.used_hint_cond_latent ], dim=1) outer.current_style_fidelity = float(param.preprocessor['threshold_a']) outer.current_style_fidelity = max(0.0, min(1.0, outer.current_style_fidelity)) if is_sdxl: # sdxl's attention hacking is highly unstable. # We have no other methods but to reduce the style_fidelity a bit. # By default, 0.5 ** 3.0 = 0.125 outer.current_style_fidelity = outer.current_style_fidelity ** 3.0 if param.cfg_injection: outer.current_style_fidelity = 1.0 elif param.soft_injection or is_in_high_res_fix: outer.current_style_fidelity = 0.0 control_name = param.preprocessor['name'] if control_name in ['reference_only', 'reference_adain+attn']: outer.attention_auto_machine = AutoMachine.Write outer.attention_auto_machine_weight = param.weight if control_name in ['reference_adain', 'reference_adain+attn']: outer.gn_auto_machine = AutoMachine.Write outer.gn_auto_machine_weight = param.weight if is_sdxl: outer.original_forward( x=ref_xt.to(devices.dtype_unet), timesteps=timesteps.to(devices.dtype_unet), context=context.to(devices.dtype_unet), y=y ) else: outer.original_forward( x=ref_xt.to(devices.dtype_unet), timesteps=timesteps.to(devices.dtype_unet), context=context.to(devices.dtype_unet) ) outer.attention_auto_machine = AutoMachine.Read outer.gn_auto_machine = AutoMachine.Read # U-Net Encoder hs = [] with th.no_grad(): t_emb = cond_cast_unet(timestep_embedding(timesteps, self.model_channels, repeat_only=False)) emb = self.time_embed(t_emb) if is_sdxl: assert y.shape[0] == x.shape[0] emb = emb + self.label_emb(y) h = x for i, module in enumerate(self.input_blocks): self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3]) h = module(h, emb, context) t2i_injection = [3, 5, 8] if is_sdxl else [2, 5, 8, 11] if i in t2i_injection: h = aligned_adding(h, total_t2i_adapter_embedding.pop(0), require_inpaint_hijack) hs.append(h) self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3]) h = self.middle_block(h, emb, context) # U-Net Middle Block h = aligned_adding(h, total_controlnet_embedding.pop(), require_inpaint_hijack) if len(total_t2i_adapter_embedding) > 0 and is_sdxl: h = aligned_adding(h, total_t2i_adapter_embedding.pop(0), require_inpaint_hijack) # U-Net Decoder for i, module in enumerate(self.output_blocks): self.current_h_shape = (h.shape[0], h.shape[1], h.shape[2], h.shape[3]) h = th.cat([h, aligned_adding(hs.pop(), total_controlnet_embedding.pop(), require_inpaint_hijack)], dim=1) h = module(h, emb, context) # U-Net Output h = h.type(x.dtype) h = self.out(h) # Post-processing for color fix for param in outer.control_params: if param.used_hint_cond_latent is None: continue if 'colorfix' not in param.preprocessor['name']: continue k = int(param.preprocessor['threshold_a']) if is_in_high_res_fix and not param.disabled_by_hr_option(self.is_in_high_res_fix): k *= 2 # Inpaint hijack xt = x[:, :4, :, :] x0_origin = param.used_hint_cond_latent t = torch.round(timesteps.float()).long() x0_prd = predict_start_from_noise(outer.sd_ldm, xt, t, h) x0 = x0_prd - blur(x0_prd, k) + blur(x0_origin, k) if '+sharp' in param.preprocessor['name']: detail_weight = float(param.preprocessor['threshold_b']) * 0.01 neg = detail_weight * blur(x0, k) + (1 - detail_weight) * x0 x0 = cond_mark * x0 + (1 - cond_mark) * neg eps_prd = predict_noise_from_start(outer.sd_ldm, xt, t, x0) w = max(0.0, min(1.0, float(param.weight))) h = eps_prd * w + h * (1 - w) # Post-processing for restore for param in outer.control_params: if param.used_hint_cond_latent is None: continue if 'inpaint_only' not in param.preprocessor['name']: continue if param.used_hint_cond.shape[1] != 4: continue # Inpaint hijack xt = x[:, :4, :, :] mask = param.used_hint_cond[:, 3:4, :, :] mask = torch.nn.functional.max_pool2d(mask, (10, 10), stride=(8, 8), padding=1) x0_origin = param.used_hint_cond_latent t = torch.round(timesteps.float()).long() x0_prd = predict_start_from_noise(outer.sd_ldm, xt, t, h) x0 = x0_prd * mask + x0_origin * (1 - mask) eps_prd = predict_noise_from_start(outer.sd_ldm, xt, t, x0) w = max(0.0, min(1.0, float(param.weight))) h = eps_prd * w + h * (1 - w) return h def move_all_control_model_to_cpu(): for param in getattr(outer, 'control_params', []) or []: if isinstance(param.control_model, torch.nn.Module): param.control_model.to("cpu") def forward_webui(*args, **kwargs): # webui will handle other compoments try: if shared.cmd_opts.lowvram: lowvram.send_everything_to_cpu() return forward(*args, **kwargs) except Exception as e: move_all_control_model_to_cpu() raise e finally: if outer.lowvram: move_all_control_model_to_cpu() def hacked_basic_transformer_inner_forward(self, x, context=None): x_norm1 = self.norm1(x) self_attn1 = None if self.disable_self_attn: # Do not use self-attention self_attn1 = self.attn1(x_norm1, context=context) else: # Use self-attention self_attention_context = x_norm1 if outer.attention_auto_machine == AutoMachine.Write: if outer.attention_auto_machine_weight > self.attn_weight: self.bank.append(self_attention_context.detach().clone()) self.style_cfgs.append(outer.current_style_fidelity) if outer.attention_auto_machine == AutoMachine.Read: if len(self.bank) > 0: style_cfg = sum(self.style_cfgs) / float(len(self.style_cfgs)) self_attn1_uc = self.attn1(x_norm1, context=torch.cat([self_attention_context] + self.bank, dim=1)) self_attn1_c = self_attn1_uc.clone() if len(outer.current_uc_indices) > 0 and style_cfg > 1e-5: self_attn1_c[outer.current_uc_indices] = self.attn1( x_norm1[outer.current_uc_indices], context=self_attention_context[outer.current_uc_indices]) self_attn1 = style_cfg * self_attn1_c + (1.0 - style_cfg) * self_attn1_uc self.bank = [] self.style_cfgs = [] if outer.attention_auto_machine == AutoMachine.StyleAlign and not outer.is_in_high_res_fix: # very VRAM hungry - disable at high_res_fix def shared_attn1(inner_x): BB, FF, CC = inner_x.shape return self.attn1(inner_x.reshape(1, BB * FF, CC)).reshape(BB, FF, CC) uc_layer = shared_attn1(x_norm1[outer.current_uc_indices]) c_layer = shared_attn1(x_norm1[outer.current_c_indices]) self_attn1 = torch.zeros_like(x_norm1).to(uc_layer) self_attn1[outer.current_uc_indices] = uc_layer self_attn1[outer.current_c_indices] = c_layer del uc_layer, c_layer if self_attn1 is None: self_attn1 = self.attn1(x_norm1, context=self_attention_context) x = self_attn1.to(x.dtype) + x x = self.attn2(self.norm2(x), context=context) + x x = self.ff(self.norm3(x)) + x return x def hacked_group_norm_forward(self, *args, **kwargs): eps = 1e-6 x = self.original_forward_cn_hijack(*args, **kwargs) y = None if outer.gn_auto_machine == AutoMachine.Write: if outer.gn_auto_machine_weight > self.gn_weight: var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0) self.mean_bank.append(mean) self.var_bank.append(var) self.style_cfgs.append(outer.current_style_fidelity) if outer.gn_auto_machine == AutoMachine.Read: if len(self.mean_bank) > 0 and len(self.var_bank) > 0: style_cfg = sum(self.style_cfgs) / float(len(self.style_cfgs)) var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0) std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 mean_acc = sum(self.mean_bank) / float(len(self.mean_bank)) var_acc = sum(self.var_bank) / float(len(self.var_bank)) std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 y_uc = (((x - mean) / std) * std_acc) + mean_acc y_c = y_uc.clone() if len(outer.current_uc_indices) > 0 and style_cfg > 1e-5: y_c[outer.current_uc_indices] = x.to(y_c.dtype)[outer.current_uc_indices] y = style_cfg * y_c + (1.0 - style_cfg) * y_uc self.mean_bank = [] self.var_bank = [] self.style_cfgs = [] if y is None: y = x return y.to(x.dtype) if getattr(process, 'sample_before_CN_hack', None) is None: process.sample_before_CN_hack = process.sample process.sample = process_sample model._original_forward = model.forward outer.original_forward = model.forward model.forward = forward_webui.__get__(model, UNetModel) if model_is_sdxl: register_schedule(sd_ldm) outer.revision_q_sampler = AbstractLowScaleModel() need_attention_hijack = False for param in outer.control_params: if param.control_model_type in [ControlModelType.AttentionInjection]: need_attention_hijack = True if batch_option_style_align: need_attention_hijack = True outer.attention_auto_machine = AutoMachine.StyleAlign outer.gn_auto_machine = AutoMachine.StyleAlign all_modules = torch_dfs(model) if need_attention_hijack: attn_modules = [module for module in all_modules if isinstance(module, BasicTransformerBlock) or isinstance(module, BasicTransformerBlockSGM)] attn_modules = sorted(attn_modules, key=lambda x: - x.norm1.normalized_shape[0]) for i, module in enumerate(attn_modules): if getattr(module, '_original_inner_forward_cn_hijack', None) is None: module._original_inner_forward_cn_hijack = module._forward module._forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock) module.bank = [] module.style_cfgs = [] module.attn_weight = float(i) / float(len(attn_modules)) gn_modules = [model.middle_block] model.middle_block.gn_weight = 0 if model_is_sdxl: input_block_indices = [4, 5, 7, 8] output_block_indices = [0, 1, 2, 3, 4, 5] else: input_block_indices = [4, 5, 7, 8, 10, 11] output_block_indices = [0, 1, 2, 3, 4, 5, 6, 7] for w, i in enumerate(input_block_indices): module = model.input_blocks[i] module.gn_weight = 1.0 - float(w) / float(len(input_block_indices)) gn_modules.append(module) for w, i in enumerate(output_block_indices): module = model.output_blocks[i] module.gn_weight = float(w) / float(len(output_block_indices)) gn_modules.append(module) for i, module in enumerate(gn_modules): if getattr(module, 'original_forward_cn_hijack', None) is None: module.original_forward_cn_hijack = module.forward module.forward = hacked_group_norm_forward.__get__(module, torch.nn.Module) module.mean_bank = [] module.var_bank = [] module.style_cfgs = [] module.gn_weight *= 2 outer.attn_module_list = attn_modules outer.gn_module_list = gn_modules else: for module in all_modules: _original_inner_forward_cn_hijack = getattr(module, '_original_inner_forward_cn_hijack', None) original_forward_cn_hijack = getattr(module, 'original_forward_cn_hijack', None) if _original_inner_forward_cn_hijack is not None: module._forward = _original_inner_forward_cn_hijack if original_forward_cn_hijack is not None: module.forward = original_forward_cn_hijack outer.attn_module_list = [] outer.gn_module_list = [] scripts.script_callbacks.on_cfg_denoiser(self.guidance_schedule_handler) def restore(self): scripts.script_callbacks.remove_callbacks_for_function(self.guidance_schedule_handler) self.control_params = None if self.model is not None: if hasattr(self.model, "_original_forward"): self.model.forward = self.model._original_forward del self.model._original_forward ================================================ FILE: scripts/infotext.py ================================================ from typing import List, Tuple from enum import Enum import gradio as gr from modules.processing import StableDiffusionProcessing from internal_controlnet.external_code import ControlNetUnit from scripts.logging import logger class Infotext(object): def __init__(self) -> None: self.infotext_fields: List[Tuple[gr.components.IOComponent, str]] = [] self.paste_field_names: List[str] = [] @staticmethod def unit_prefix(unit_index: int) -> str: return f"ControlNet {unit_index}" def register_unit(self, unit_index: int, uigroup) -> None: """Register the unit's UI group. By regsitering the unit, A1111 will be able to paste values from infotext to IOComponents. Args: unit_index: The index of the ControlNet unit uigroup: The ControlNetUiGroup instance that contains all gradio iocomponents. """ unit_prefix = Infotext.unit_prefix(unit_index) for field in ControlNetUnit.infotext_fields(): # Every field in ControlNetUnit should have a cooresponding # IOComponent in ControlNetUiGroup. io_component = getattr(uigroup, field) component_locator = f"{unit_prefix} {field}" self.infotext_fields.append((io_component, component_locator)) self.paste_field_names.append(component_locator) @staticmethod def write_infotext(units: List[ControlNetUnit], p: StableDiffusionProcessing): """Write infotext to `p`.""" p.extra_generation_params.update( { Infotext.unit_prefix(i): unit.serialize() for i, unit in enumerate(units) if unit.enabled } ) @staticmethod def on_infotext_pasted(infotext: str, results: dict) -> None: """Parse ControlNet infotext string and write result to `results` dict.""" updates = {} for k, v in results.items(): if not k.startswith("ControlNet"): continue assert isinstance(v, str), f"Expect string but got {v}." try: for field, value in vars(ControlNetUnit.parse(v)).items(): if field not in ControlNetUnit.infotext_fields(): continue if value is None: logger.debug( f"InfoText: Skipping {field} because value is None." ) continue component_locator = f"{k} {field}" if isinstance(value, Enum): value = value.value updates[component_locator] = value logger.debug(f"InfoText: Setting {component_locator} = {value}") except Exception as e: logger.warn( f"Failed to parse infotext, legacy format infotext is no longer supported:\n{v}\n{e}" ) results.update(updates) ================================================ FILE: scripts/ipadapter/__init__.py ================================================ ================================================ FILE: scripts/ipadapter/image_proj_models.py ================================================ import math import torch import torch.nn as nn class MLPProjModel(torch.nn.Module): def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024): super().__init__() self.proj = torch.nn.Sequential( torch.nn.Linear(clip_embeddings_dim, clip_embeddings_dim), torch.nn.GELU(), torch.nn.Linear(clip_embeddings_dim, cross_attention_dim), torch.nn.LayerNorm(cross_attention_dim), ) def forward(self, image_embeds): clip_extra_context_tokens = self.proj(image_embeds) return clip_extra_context_tokens class MLPProjModelFaceId(torch.nn.Module): """MLPProjModel used for FaceId. Source: https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/ip_adapter_faceid.py """ def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4): super().__init__() self.cross_attention_dim = cross_attention_dim self.num_tokens = num_tokens self.proj = torch.nn.Sequential( torch.nn.Linear(id_embeddings_dim, id_embeddings_dim * 2), torch.nn.GELU(), torch.nn.Linear(id_embeddings_dim * 2, cross_attention_dim * num_tokens), ) self.norm = torch.nn.LayerNorm(cross_attention_dim) def forward(self, id_embeds): clip_extra_context_tokens = self.proj(id_embeds) clip_extra_context_tokens = clip_extra_context_tokens.reshape( -1, self.num_tokens, self.cross_attention_dim ) clip_extra_context_tokens = self.norm(clip_extra_context_tokens) return clip_extra_context_tokens class FacePerceiverResampler(torch.nn.Module): """Source: https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/ip_adapter_faceid.py""" def __init__( self, *, dim=768, depth=4, dim_head=64, heads=16, embedding_dim=1280, output_dim=768, ff_mult=4, ): super().__init__() self.proj_in = torch.nn.Linear(embedding_dim, dim) self.proj_out = torch.nn.Linear(dim, output_dim) self.norm_out = torch.nn.LayerNorm(output_dim) self.layers = torch.nn.ModuleList([]) for _ in range(depth): self.layers.append( torch.nn.ModuleList( [ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), FeedForward(dim=dim, mult=ff_mult), ] ) ) def forward(self, latents, x): x = self.proj_in(x) for attn, ff in self.layers: latents = attn(x, latents) + latents latents = ff(latents) + latents latents = self.proj_out(latents) return self.norm_out(latents) class ProjModelFaceIdPlus(torch.nn.Module): """Source: https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/ip_adapter_faceid.py""" def __init__( self, cross_attention_dim=768, id_embeddings_dim=512, clip_embeddings_dim=1280, num_tokens=4, ): super().__init__() self.cross_attention_dim = cross_attention_dim self.num_tokens = num_tokens self.proj = torch.nn.Sequential( torch.nn.Linear(id_embeddings_dim, id_embeddings_dim * 2), torch.nn.GELU(), torch.nn.Linear(id_embeddings_dim * 2, cross_attention_dim * num_tokens), ) self.norm = torch.nn.LayerNorm(cross_attention_dim) self.perceiver_resampler = FacePerceiverResampler( dim=cross_attention_dim, depth=4, dim_head=64, heads=cross_attention_dim // 64, embedding_dim=clip_embeddings_dim, output_dim=cross_attention_dim, ff_mult=4, ) def forward(self, id_embeds, clip_embeds, scale=1.0, shortcut=False): x = self.proj(id_embeds) x = x.reshape(-1, self.num_tokens, self.cross_attention_dim) x = self.norm(x) out = self.perceiver_resampler(x, clip_embeds) if shortcut: out = x + scale * out return out class ImageProjModel(torch.nn.Module): """Projection Model""" def __init__( self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4, ): super().__init__() self.cross_attention_dim = cross_attention_dim self.clip_extra_context_tokens = clip_extra_context_tokens self.proj = torch.nn.Linear( clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim ) self.norm = torch.nn.LayerNorm(cross_attention_dim) def forward(self, image_embeds): embeds = image_embeds clip_extra_context_tokens = self.proj(embeds).reshape( -1, self.clip_extra_context_tokens, self.cross_attention_dim ) clip_extra_context_tokens = self.norm(clip_extra_context_tokens) return clip_extra_context_tokens def FeedForward(dim, mult=4): inner_dim = int(dim * mult) return nn.Sequential( nn.LayerNorm(dim), nn.Linear(dim, inner_dim, bias=False), nn.GELU(), nn.Linear(inner_dim, dim, bias=False), ) def reshape_tensor(x, heads): bs, length, width = x.shape # (bs, length, width) --> (bs, length, n_heads, dim_per_head) x = x.view(bs, length, heads, -1) # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) x = x.transpose(1, 2) # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) x = x.reshape(bs, heads, length, -1) return x class PerceiverAttention(nn.Module): def __init__(self, *, dim, dim_head=64, heads=8): super().__init__() self.scale = dim_head**-0.5 self.dim_head = dim_head self.heads = heads inner_dim = dim_head * heads self.norm1 = nn.LayerNorm(dim) self.norm2 = nn.LayerNorm(dim) self.to_q = nn.Linear(dim, inner_dim, bias=False) self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) self.to_out = nn.Linear(inner_dim, dim, bias=False) def forward(self, x, latents): """ Args: x (torch.Tensor): image features shape (b, n1, D) latent (torch.Tensor): latent features shape (b, n2, D) """ x = self.norm1(x) latents = self.norm2(latents) b, l, _ = latents.shape # noqa: E741 q = self.to_q(latents) kv_input = torch.cat((x, latents), dim=-2) k, v = self.to_kv(kv_input).chunk(2, dim=-1) q = reshape_tensor(q, self.heads) k = reshape_tensor(k, self.heads) v = reshape_tensor(v, self.heads) # attention scale = 1 / math.sqrt(math.sqrt(self.dim_head)) weight = (q * scale) @ (k * scale).transpose( -2, -1 ) # More stable with f16 than dividing afterwards weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) out = weight @ v out = out.permute(0, 2, 1, 3).reshape(b, l, -1) return self.to_out(out) class Resampler(nn.Module): def __init__( self, dim=1024, depth=8, dim_head=64, heads=16, num_queries=8, embedding_dim=768, output_dim=1024, ff_mult=4, ): super().__init__() self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5) self.proj_in = nn.Linear(embedding_dim, dim) self.proj_out = nn.Linear(dim, output_dim) self.norm_out = nn.LayerNorm(output_dim) self.layers = nn.ModuleList([]) for _ in range(depth): self.layers.append( nn.ModuleList( [ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), FeedForward(dim=dim, mult=ff_mult), ] ) ) def forward(self, x): latents = self.latents.repeat(x.size(0), 1, 1) x = self.proj_in(x) for attn, ff in self.layers: latents = attn(x, latents) + latents latents = ff(latents) + latents latents = self.proj_out(latents) return self.norm_out(latents) class PuLIDEncoder(nn.Module): def __init__(self, width=1280, context_dim=2048, num_token=5): super().__init__() self.num_token = num_token self.context_dim = context_dim h1 = min((context_dim * num_token) // 4, 1024) h2 = min((context_dim * num_token) // 2, 1024) self.body = nn.Sequential( nn.Linear(width, h1), nn.LayerNorm(h1), nn.LeakyReLU(), nn.Linear(h1, h2), nn.LayerNorm(h2), nn.LeakyReLU(), nn.Linear(h2, context_dim * num_token), ) for i in range(5): setattr( self, f"mapping_{i}", nn.Sequential( nn.Linear(1024, 1024), nn.LayerNorm(1024), nn.LeakyReLU(), nn.Linear(1024, 1024), nn.LayerNorm(1024), nn.LeakyReLU(), nn.Linear(1024, context_dim), ), ) setattr( self, f"mapping_patch_{i}", nn.Sequential( nn.Linear(1024, 1024), nn.LayerNorm(1024), nn.LeakyReLU(), nn.Linear(1024, 1024), nn.LayerNorm(1024), nn.LeakyReLU(), nn.Linear(1024, context_dim), ), ) def forward(self, x, y): # x shape [N, C] x = self.body(x) x = x.reshape(-1, self.num_token, self.context_dim) hidden_states = () for i, emb in enumerate(y): hidden_state = getattr(self, f"mapping_{i}")(emb[:, :1]) + getattr( self, f"mapping_patch_{i}" )(emb[:, 1:]).mean(dim=1, keepdim=True) hidden_states += (hidden_state,) hidden_states = torch.cat(hidden_states, dim=1) return torch.cat([x, hidden_states], dim=1) ================================================ FILE: scripts/ipadapter/ipadapter_model.py ================================================ from __future__ import annotations from typing import List, NamedTuple, Tuple, Union import torch import torch.nn as nn import numpy as np from transformers.models.clip.modeling_clip import CLIPVisionModelOutput from .image_proj_models import ( Resampler, ImageProjModel, MLPProjModel, MLPProjModelFaceId, ProjModelFaceIdPlus, PuLIDEncoder, ) class ImageEmbed(NamedTuple): """Image embed for a single image.""" cond_emb: torch.Tensor uncond_emb: torch.Tensor def eval(self, cond_mark: torch.Tensor) -> torch.Tensor: assert cond_mark.ndim == 4 assert self.cond_emb.ndim == self.uncond_emb.ndim == 3 assert ( self.uncond_emb.shape[0] == 1 or self.cond_emb.shape[0] == self.uncond_emb.shape[0] ) assert ( self.cond_emb.shape[0] == 1 or self.cond_emb.shape[0] == cond_mark.shape[0] ) cond_mark = cond_mark[:, :, :, 0].to(self.cond_emb) device = cond_mark.device dtype = cond_mark.dtype return self.cond_emb.to( device=device, dtype=dtype ) * cond_mark + self.uncond_emb.to(device=device, dtype=dtype) * (1 - cond_mark) def average_of(*args: List[Tuple[torch.Tensor, torch.Tensor]]) -> "ImageEmbed": conds, unconds = zip(*args) def average_tensors(tensors: List[torch.Tensor]) -> torch.Tensor: return torch.sum(torch.stack(tensors), dim=0) / len(tensors) return ImageEmbed(average_tensors(conds), average_tensors(unconds)) class To_KV(torch.nn.Module): def __init__(self, state_dict): super().__init__() self.to_kvs = nn.ModuleDict() for key, value in state_dict.items(): k = key.replace(".weight", "").replace(".", "_") self.to_kvs[k] = nn.Linear(value.shape[1], value.shape[0], bias=False) self.to_kvs[k].weight.data = value class IPAdapterModel(torch.nn.Module): def __init__( self, state_dict, clip_embeddings_dim, cross_attention_dim, is_plus, is_sdxl: bool, sdxl_plus, is_full, is_faceid: bool, is_portrait: bool, is_instantid: bool, is_pulid: bool, is_v2: bool, ): super().__init__() self.device = "cpu" self.clip_embeddings_dim = clip_embeddings_dim self.cross_attention_dim = cross_attention_dim self.is_plus = is_plus self.is_sdxl = is_sdxl self.sdxl_plus = sdxl_plus self.is_full = is_full self.is_v2 = is_v2 self.is_faceid = is_faceid self.is_instantid = is_instantid self.is_pulid = is_pulid self.clip_extra_context_tokens = 16 if (self.is_plus or is_portrait) else 4 if self.is_pulid: self.image_proj_model = PuLIDEncoder() elif self.is_instantid: self.image_proj_model = self.init_proj_instantid() elif is_faceid: self.image_proj_model = self.init_proj_faceid() elif self.is_plus: if self.is_full: self.image_proj_model = MLPProjModel( cross_attention_dim=cross_attention_dim, clip_embeddings_dim=clip_embeddings_dim, ) else: self.image_proj_model = Resampler( dim=1280 if sdxl_plus else cross_attention_dim, depth=4, dim_head=64, heads=20 if sdxl_plus else 12, num_queries=self.clip_extra_context_tokens, embedding_dim=clip_embeddings_dim, output_dim=self.cross_attention_dim, ff_mult=4, ) else: self.clip_extra_context_tokens = ( state_dict["image_proj"]["proj.weight"].shape[0] // self.cross_attention_dim ) self.image_proj_model = ImageProjModel( cross_attention_dim=self.cross_attention_dim, clip_embeddings_dim=clip_embeddings_dim, clip_extra_context_tokens=self.clip_extra_context_tokens, ) self.image_proj_model.load_state_dict(state_dict["image_proj"]) self.ip_layers = To_KV(state_dict["ip_adapter"]) def init_proj_faceid(self): if self.is_plus: image_proj_model = ProjModelFaceIdPlus( cross_attention_dim=self.cross_attention_dim, id_embeddings_dim=512, clip_embeddings_dim=self.clip_embeddings_dim, num_tokens=4, ) else: image_proj_model = MLPProjModelFaceId( cross_attention_dim=self.cross_attention_dim, id_embeddings_dim=512, num_tokens=self.clip_extra_context_tokens, ) return image_proj_model def init_proj_instantid(self, image_emb_dim=512, num_tokens=16): image_proj_model = Resampler( dim=1280, depth=4, dim_head=64, heads=20, num_queries=num_tokens, embedding_dim=image_emb_dim, output_dim=self.cross_attention_dim, ff_mult=4, ) return image_proj_model @torch.inference_mode() def _get_image_embeds( self, clip_vision_output: CLIPVisionModelOutput ) -> ImageEmbed: self.image_proj_model.to(self.device) if self.is_plus: from annotator.clipvision import clip_vision_h_uc, clip_vision_vith_uc cond = self.image_proj_model( clip_vision_output["hidden_states"][-2].to( device=self.device, dtype=torch.float32 ) ) uncond = ( clip_vision_vith_uc.to(cond) if self.sdxl_plus else self.image_proj_model(clip_vision_h_uc.to(cond)) ) return ImageEmbed(cond, uncond) clip_image_embeds = clip_vision_output["image_embeds"].to( device=self.device, dtype=torch.float32 ) image_prompt_embeds = self.image_proj_model(clip_image_embeds) # input zero vector for unconditional. uncond_image_prompt_embeds = self.image_proj_model( torch.zeros_like(clip_image_embeds) ) return ImageEmbed(image_prompt_embeds, uncond_image_prompt_embeds) @torch.inference_mode() def _get_image_embeds_faceid_plus( self, face_embed: torch.Tensor, clip_vision_output: CLIPVisionModelOutput, is_v2: bool, ) -> ImageEmbed: face_embed = face_embed.to(self.device, dtype=torch.float32) from annotator.clipvision import clip_vision_h_uc clip_embed = clip_vision_output["hidden_states"][-2].to( device=self.device, dtype=torch.float32 ) return ImageEmbed( self.image_proj_model(face_embed, clip_embed, shortcut=is_v2), self.image_proj_model( torch.zeros_like(face_embed), clip_vision_h_uc.to(clip_embed), shortcut=is_v2, ), ) @torch.inference_mode() def _get_image_embeds_faceid(self, insightface_output: torch.Tensor) -> ImageEmbed: """Get image embeds for non-plus faceid. Multiple inputs are supported.""" self.image_proj_model.to(self.device) faceid_embed = insightface_output.to(self.device, dtype=torch.float32) return ImageEmbed( self.image_proj_model(faceid_embed), self.image_proj_model(torch.zeros_like(faceid_embed)), ) @torch.inference_mode() def _get_image_embeds_instantid( self, prompt_image_emb: Union[torch.Tensor, np.ndarray] ) -> ImageEmbed: """Get image embeds for instantid.""" image_proj_model_in_features = 512 if isinstance(prompt_image_emb, torch.Tensor): prompt_image_emb = prompt_image_emb.clone().detach() else: prompt_image_emb = torch.tensor(prompt_image_emb) prompt_image_emb = prompt_image_emb.to(device=self.device, dtype=torch.float32) prompt_image_emb = prompt_image_emb.reshape( [1, -1, image_proj_model_in_features] ) return ImageEmbed( self.image_proj_model(prompt_image_emb), self.image_proj_model(torch.zeros_like(prompt_image_emb)), ) def _get_image_embeds_pulid(self, pulid_proj_input) -> ImageEmbed: """Get image embeds for pulid.""" id_cond = torch.cat( [ pulid_proj_input.id_ante_embedding.to( device=self.device, dtype=torch.float32 ), pulid_proj_input.id_cond_vit.to( device=self.device, dtype=torch.float32 ), ], dim=-1, ) id_vit_hidden = [ t.to(device=self.device, dtype=torch.float32) for t in pulid_proj_input.id_vit_hidden ] return ImageEmbed( self.image_proj_model( id_cond, id_vit_hidden, ), self.image_proj_model( torch.zeros_like(id_cond), [torch.zeros_like(t) for t in id_vit_hidden], ), ) @staticmethod def load(state_dict: dict, model_name: str) -> IPAdapterModel: """ Arguments: - state_dict: model state_dict. - model_name: file name of the model. """ is_v2 = "v2" in model_name is_faceid = "faceid" in model_name is_instantid = "instant_id" in model_name is_pulid = "pulid" in model_name.lower() is_portrait = "portrait" in model_name is_full = "proj.3.weight" in state_dict["image_proj"] is_plus = ( is_full or "latents" in state_dict["image_proj"] or "perceiver_resampler.proj_in.weight" in state_dict["image_proj"] ) cross_attention_dim = state_dict["ip_adapter"]["1.to_k_ip.weight"].shape[1] sdxl = cross_attention_dim == 2048 sdxl_plus = sdxl and is_plus if is_instantid or is_pulid: # InstantID/PuLID does not use clip embedding. clip_embeddings_dim = None elif is_faceid: if is_plus: clip_embeddings_dim = 1280 else: # Plain faceid does not use clip_embeddings_dim. clip_embeddings_dim = None elif is_plus: if sdxl_plus: clip_embeddings_dim = int(state_dict["image_proj"]["latents"].shape[2]) elif is_full: clip_embeddings_dim = int( state_dict["image_proj"]["proj.0.weight"].shape[1] ) else: clip_embeddings_dim = int( state_dict["image_proj"]["proj_in.weight"].shape[1] ) else: clip_embeddings_dim = int(state_dict["image_proj"]["proj.weight"].shape[1]) return IPAdapterModel( state_dict, clip_embeddings_dim=clip_embeddings_dim, cross_attention_dim=cross_attention_dim, is_plus=is_plus, is_sdxl=sdxl, sdxl_plus=sdxl_plus, is_full=is_full, is_faceid=is_faceid, is_portrait=is_portrait, is_instantid=is_instantid, is_v2=is_v2, is_pulid=is_pulid, ) def get_image_emb(self, preprocessor_output) -> ImageEmbed: if self.is_pulid: return self._get_image_embeds_pulid(preprocessor_output) elif self.is_instantid: return self._get_image_embeds_instantid(preprocessor_output) elif self.is_faceid and self.is_plus: # Note: FaceID plus uses both face_embed and clip_embed. # This should be the return value from preprocessor. return self._get_image_embeds_faceid_plus( preprocessor_output.face_embed, preprocessor_output.clip_embed, is_v2=self.is_v2, ) elif self.is_faceid: return self._get_image_embeds_faceid(preprocessor_output) else: return self._get_image_embeds(preprocessor_output) ================================================ FILE: scripts/ipadapter/plugable_ipadapter.py ================================================ import itertools import torch import math from typing import Union, Dict, Optional, Callable from .pulid_attn import PuLIDAttnSetting from .ipadapter_model import ImageEmbed, IPAdapterModel from ..enums import StableDiffusionVersion, TransformerID def get_block(model, flag): return { "input": model.input_blocks, "middle": [model.middle_block], "output": model.output_blocks, }[flag] def attn_forward_hacked(self, x, context=None, **kwargs): batch_size, sequence_length, inner_dim = x.shape h = self.heads head_dim = inner_dim // h if context is None: context = x q = self.to_q(x) k = self.to_k(context) v = self.to_v(context) del context q, k, v = map( lambda t: t.view(batch_size, -1, h, head_dim).transpose(1, 2), (q, k, v), ) out = torch.nn.functional.scaled_dot_product_attention( q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False ) out = out.transpose(1, 2).reshape(batch_size, -1, h * head_dim) del k, v for f in self.ipadapter_hacks: out = out + f(self, x, q) del q, x return self.to_out(out) all_hacks = {} current_model = None def hack_blk(block, function, type): if not hasattr(block, "ipadapter_hacks"): block.ipadapter_hacks = [] if len(block.ipadapter_hacks) == 0: all_hacks[block] = block.forward block.forward = attn_forward_hacked.__get__(block, type) block.ipadapter_hacks.append(function) return def set_model_attn2_replace( model, target_cls, function, transformer_id: TransformerID, ): block = get_block(model, transformer_id.block_type.value) module = ( block[transformer_id.block_id][1] .transformer_blocks[transformer_id.block_index] .attn2 ) hack_blk(module, function, target_cls) def clear_all_ip_adapter(): global all_hacks, current_model for k, v in all_hacks.items(): k.forward = v k.ipadapter_hacks = [] all_hacks = {} current_model = None return class PlugableIPAdapter(torch.nn.Module): def __init__(self, ipadapter: IPAdapterModel): super().__init__() self.ipadapter: IPAdapterModel = ipadapter self.disable_memory_management = True self.dtype = None self.weight: Union[float, Dict[int, float]] = 1.0 self.cache = None self.p_start = 0.0 self.p_end = 1.0 self.latent_width: int = 0 self.latent_height: int = 0 self.effective_region_mask = None self.pulid_attn_setting: Optional[PuLIDAttnSetting] = None def reset(self): self.cache = {} @torch.no_grad() def hook( self, model, preprocessor_outputs, weight, start: float, end: float, latent_width: int, latent_height: int, effective_region_mask: Optional[torch.Tensor], pulid_attn_setting: Optional[PuLIDAttnSetting] = None, dtype=torch.float32, ): global current_model current_model = model self.p_start = start self.p_end = end self.latent_width = latent_width self.latent_height = latent_height self.effective_region_mask = effective_region_mask self.pulid_attn_setting = pulid_attn_setting self.cache = {} self.weight = weight device = torch.device("cpu") self.dtype = dtype self.ipadapter.to(device, dtype=self.dtype) if isinstance(preprocessor_outputs, (list, tuple)): preprocessor_outputs = preprocessor_outputs else: preprocessor_outputs = [preprocessor_outputs] self.image_emb = ImageEmbed.average_of( *[self.ipadapter.get_image_emb(o) for o in preprocessor_outputs] ) if self.ipadapter.is_sdxl: sd_version = StableDiffusionVersion.SDXL from sgm.modules.attention import CrossAttention else: sd_version = StableDiffusionVersion.SD1x from ldm.modules.attention import CrossAttention input_ids, output_ids, middle_ids = sd_version.transformer_ids for i, transformer_id in enumerate( itertools.chain(input_ids, output_ids, middle_ids) ): set_model_attn2_replace( model, CrossAttention, self.patch_forward(i, transformer_id.transformer_index), transformer_id, ) def weight_on_transformer(self, transformer_index: int) -> float: if isinstance(self.weight, dict): return self.weight.get(transformer_index, 0.0) else: assert isinstance(self.weight, (float, int)) return self.weight def call_ip(self, key: str, feat, device): if key in self.cache: return self.cache[key] else: ip = self.ipadapter.ip_layers.to_kvs[key](feat).to(device) self.cache[key] = ip return ip def apply_effective_region_mask(self, out: torch.Tensor) -> torch.Tensor: if self.effective_region_mask is None: return out _, sequence_length, _ = out.shape # sequence_length = mask_h * mask_w # sequence_length = (latent_height * factor) * (latent_height * factor) # sequence_length = (latent_height * latent_height) * factor ^ 2 factor = math.sqrt(sequence_length / (self.latent_width * self.latent_height)) assert ( factor > 0 ), f"{factor}, {sequence_length}, {self.latent_width}, {self.latent_height}" mask_h = int(self.latent_height * factor) mask_w = int(self.latent_width * factor) mask = torch.nn.functional.interpolate( self.effective_region_mask.to(out.device), size=(mask_h, mask_w), mode="bilinear", ).squeeze() mask = mask.repeat(len(current_model.cond_mark), 1, 1) mask = mask.view(mask.shape[0], -1, 1).repeat(1, 1, out.shape[2]) return out * mask def attn_eval( self, hidden_states: torch.Tensor, query: torch.Tensor, cond_uncond_image_emb: torch.Tensor, attn_heads: int, head_dim: int, emb_to_k: Callable[[torch.Tensor], torch.Tensor], emb_to_v: Callable[[torch.Tensor], torch.Tensor], ): if self.ipadapter.is_pulid: assert self.pulid_attn_setting is not None return self.pulid_attn_setting.eval( hidden_states, query, cond_uncond_image_emb, attn_heads, head_dim, emb_to_k, emb_to_v, ) else: return self._attn_eval_ipadapter( hidden_states, query, cond_uncond_image_emb, attn_heads, head_dim, emb_to_k, emb_to_v, ) def _attn_eval_ipadapter( self, hidden_states: torch.Tensor, query: torch.Tensor, cond_uncond_image_emb: torch.Tensor, attn_heads: int, head_dim: int, emb_to_k: Callable[[torch.Tensor], torch.Tensor], emb_to_v: Callable[[torch.Tensor], torch.Tensor], ): assert hidden_states.ndim == 3 batch_size, sequence_length, inner_dim = hidden_states.shape ip_k = emb_to_k(cond_uncond_image_emb) ip_v = emb_to_v(cond_uncond_image_emb) ip_k, ip_v = map( lambda t: t.view(batch_size, -1, attn_heads, head_dim).transpose(1, 2), (ip_k, ip_v), ) assert ip_k.dtype == ip_v.dtype # On MacOS, q can be float16 instead of float32. # https://github.com/Mikubill/sd-webui-controlnet/issues/2208 if query.dtype != ip_k.dtype: ip_k = ip_k.to(dtype=query.dtype) ip_v = ip_v.to(dtype=query.dtype) ip_out = torch.nn.functional.scaled_dot_product_attention( query, ip_k, ip_v, attn_mask=None, dropout_p=0.0, is_causal=False ) ip_out = ip_out.transpose(1, 2).reshape(batch_size, -1, attn_heads * head_dim) return ip_out @torch.no_grad() def patch_forward(self, number: int, transformer_index: int): @torch.no_grad() def forward(attn_blk, x, q): batch_size, sequence_length, inner_dim = x.shape h = attn_blk.heads head_dim = inner_dim // h weight = self.weight_on_transformer(transformer_index) current_sampling_percent = getattr( current_model, "current_sampling_percent", 0.5 ) if ( current_sampling_percent < self.p_start or current_sampling_percent > self.p_end or weight == 0.0 ): return 0.0 k_key = f"{number * 2 + 1}_to_k_ip" v_key = f"{number * 2 + 1}_to_v_ip" ip_out = self.attn_eval( hidden_states=x, query=q, cond_uncond_image_emb=self.image_emb.eval(current_model.cond_mark), attn_heads=h, head_dim=head_dim, emb_to_k=lambda emb: self.call_ip(k_key, emb, device=q.device), emb_to_v=lambda emb: self.call_ip(v_key, emb, device=q.device), ) return self.apply_effective_region_mask(ip_out * weight) return forward ================================================ FILE: scripts/ipadapter/presets.py ================================================ from __future__ import annotations from ..enums import StableDiffusionVersion from typing import NamedTuple, Optional, List class IPAdapterPreset(NamedTuple): """Preset for IPAdapter.""" name: str module: str # Preprocessor model: str # Name of model file sd_version: StableDiffusionVersion # Supported SD version. lora: Optional[str] = None @staticmethod def match_model(model_name: str) -> IPAdapterPreset: model_name = model_name.split("[")[0].strip() assert ( model_name in _preset_by_model ), f"{model_name} not found in ipadapter presets. Please try manually pick preprocessor." return _preset_by_model[model_name] clip_h = "ip-adapter_clip_h" clip_g = "ip-adapter_clip_g" insightface = "ip-adapter_face_id" insightface_clip_h = "ip-adapter_face_id_plus" ipadapter_presets: List[IPAdapterPreset] = [ IPAdapterPreset( name="light", module=clip_h, model="ip-adapter_sd15_light", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="light_v11", module=clip_h, model="ip-adapter_sd15_light_v11", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="vit-g", module=clip_g, model="ip-adapter_sd15_vit-G", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="standard", module=clip_h, model="ip-adapter_sd15", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="plus", module=clip_h, model="ip-adapter-plus_sd15", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="plus", module=clip_h, model="ip-adapter_sd15_plus", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="plus-composition", module=clip_h, model="ip-adapter_plus_composition_sd15", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="plus_face", module=clip_h, model="ip-adapter-plus-face_sd15", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="full_face", module=clip_h, model="ip-adapter-full-face_sd15", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="face_id", module=insightface, model="ip-adapter-faceid_sd15", lora="ip-adapter-faceid_sd15_lora", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="face_id_plus", module=insightface_clip_h, model="ip-adapter-faceid-plus_sd15", lora="ip-adapter-faceid-plus_sd15_lora", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="face_id_plus_v2", module=insightface_clip_h, model="ip-adapter-faceid-plusv2_sd15", lora="ip-adapter-faceid-plusv2_sd15_lora", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="face_id_portrait", module=insightface, model="ip-adapter-faceid-portrait_sd15", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="face_id_portrait_v11", module=insightface, model="ip-adapter-faceid-portrait-v11_sd15", sd_version=StableDiffusionVersion.SD1x, ), IPAdapterPreset( name="standard-g", module=clip_g, model="ip-adapter_sdxl", sd_version=StableDiffusionVersion.SDXL, ), IPAdapterPreset( name="standard-h", module=clip_h, model="ip-adapter_sdxl_vit-h", sd_version=StableDiffusionVersion.SDXL, ), IPAdapterPreset( name="plus-h", module=clip_h, model="ip-adapter-plus_sdxl_vit-h", sd_version=StableDiffusionVersion.SDXL, ), IPAdapterPreset( name="plus-composition", module=clip_h, model="ip-adapter_plus_composition_sdxl", sd_version=StableDiffusionVersion.SDXL, ), IPAdapterPreset( name="plus_face-h", module=clip_h, model="ip-adapter-plus-face_sdxl_vit-h", sd_version=StableDiffusionVersion.SDXL, ), IPAdapterPreset( name="face_id", module=insightface, model="ip-adapter-faceid_sdxl", lora="ip-adapter-faceid_sdxl_lora", sd_version=StableDiffusionVersion.SDXL, ), IPAdapterPreset( name="face_id_plusv2", module=insightface_clip_h, model="ip-adapter-faceid-plusv2_sdxl", lora="ip-adapter-faceid-plusv2_sdxl_lora", sd_version=StableDiffusionVersion.SDXL, ), IPAdapterPreset( name="face_id_portrait", module=insightface, model="ip-adapter-faceid-portrait_sdxl", sd_version=StableDiffusionVersion.SDXL, ), IPAdapterPreset( name="pulid", module="ip-adapter_pulid", model="ip-adapter_pulid_sdxl_fp16", sd_version=StableDiffusionVersion.SDXL, ), ] _preset_by_model = {p.model: p for p in ipadapter_presets} ================================================ FILE: scripts/ipadapter/pulid_attn.py ================================================ import torch import torch.nn.functional as F from dataclasses import dataclass from typing import Callable @dataclass class PuLIDAttnSetting: num_zero: int = 0 ortho: bool = False ortho_v2: bool = False def eval( self, hidden_states: torch.Tensor, query: torch.Tensor, id_embedding: torch.Tensor, attn_heads: int, head_dim: int, id_to_k: Callable[[torch.Tensor], torch.Tensor], id_to_v: Callable[[torch.Tensor], torch.Tensor], ): assert hidden_states.ndim == 3 batch_size, sequence_length, inner_dim = hidden_states.shape if self.num_zero == 0: id_key = id_to_k(id_embedding).to(query.dtype) id_value = id_to_v(id_embedding).to(query.dtype) else: zero_tensor = torch.zeros( (id_embedding.size(0), self.num_zero, id_embedding.size(-1)), dtype=id_embedding.dtype, device=id_embedding.device, ) id_key = id_to_k(torch.cat((id_embedding, zero_tensor), dim=1)).to( query.dtype ) id_value = id_to_v(torch.cat((id_embedding, zero_tensor), dim=1)).to( query.dtype ) id_key = id_key.view(batch_size, -1, attn_heads, head_dim).transpose(1, 2) id_value = id_value.view(batch_size, -1, attn_heads, head_dim).transpose(1, 2) # the output of sdp = (batch, num_heads, seq_len, head_dim) id_hidden_states = F.scaled_dot_product_attention( query, id_key, id_value, attn_mask=None, dropout_p=0.0, is_causal=False ) id_hidden_states = id_hidden_states.transpose(1, 2).reshape( batch_size, -1, attn_heads * head_dim ) id_hidden_states = id_hidden_states.to(query.dtype) if not self.ortho and not self.ortho_v2: return id_hidden_states elif self.ortho_v2: orig_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) id_hidden_states = id_hidden_states.to(torch.float32) attn_map = query @ id_key.transpose(-2, -1) attn_mean = attn_map.softmax(dim=-1).mean(dim=1) attn_mean = attn_mean[:, :, :5].sum(dim=-1, keepdim=True) projection = ( torch.sum((hidden_states * id_hidden_states), dim=-2, keepdim=True) / torch.sum((hidden_states * hidden_states), dim=-2, keepdim=True) * hidden_states ) orthogonal = id_hidden_states + (attn_mean - 1) * projection return orthogonal.to(orig_dtype) else: orig_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) id_hidden_states = id_hidden_states.to(torch.float32) projection = ( torch.sum((hidden_states * id_hidden_states), dim=-2, keepdim=True) / torch.sum((hidden_states * hidden_states), dim=-2, keepdim=True) * hidden_states ) orthogonal = id_hidden_states - projection return orthogonal.to(orig_dtype) PULID_SETTING_FIDELITY = PuLIDAttnSetting( num_zero=8, ortho=False, ortho_v2=True, ) PULID_SETTING_STYLE = PuLIDAttnSetting( num_zero=16, ortho=True, ortho_v2=False, ) ================================================ FILE: scripts/ipadapter/weight.py ================================================ # From https://github.com/cubiq/ComfyUI_IPAdapter_plus from ..enums import StableDiffusionVersion, TransformerIDResult def calc_weights( weight_type: str, weight: float, sd_version: StableDiffusionVersion, weight_composition: float = 0.0, ): layers = sd_version.transformer_block_num return [ _calc_weight( weight_type, weight, sd_version, t_idx, weight_composition, ) for t_idx in range(layers) ] def _calc_weight( weight_type: str, weight: float, sd_version: StableDiffusionVersion, t_idx: int, weight_composition: float = 0.0, ): is_sdxl = sd_version == StableDiffusionVersion.SDXL layers = sd_version.transformer_block_num ids: TransformerIDResult = sd_version.transformer_ids block_type = [i for i in ids.to_list() if i.transformer_index == t_idx][ 0 ].block_type.value if weight_type == "normal": return weight elif weight_type == "ease in": weight = weight * (0.05 + 0.95 * (1 - t_idx / layers)) elif weight_type == "ease out": weight = weight * (0.05 + 0.95 * (t_idx / layers)) elif weight_type == "ease in-out": weight = weight * (0.05 + 0.95 * (1 - abs(t_idx - (layers / 2)) / (layers / 2))) elif weight_type == "reverse in-out": weight = weight * (0.05 + 0.95 * (abs(t_idx - (layers / 2)) / (layers / 2))) elif weight_type == "weak input": weight = weight * 0.2 if block_type == "input" else weight elif weight_type == "weak middle": weight = weight * 0.2 if block_type == "middle" else weight elif weight_type == "weak output": weight = weight * 0.2 if block_type == "output" else weight elif weight_type == "strong middle": weight = ( weight * 0.2 if (block_type == "input" or block_type == "output") else weight ) elif weight_type.startswith("style transfer"): weight = ( {6: weight} if is_sdxl else { 0: weight, 1: weight, 2: weight, 3: weight, 9: weight, 10: weight, 11: weight, 12: weight, 13: weight, 14: weight, 15: weight, } ) elif weight_type.startswith("composition"): weight = {3: weight} if is_sdxl else {4: weight * 0.25, 5: weight} elif weight_type == "strong style transfer": if is_sdxl: weight = { 0: weight, 1: weight, 2: weight, 4: weight, 5: weight, 6: weight, 7: weight, 8: weight, 9: weight, 10: weight, } else: weight = { 0: weight, 1: weight, 2: weight, 3: weight, 6: weight, 7: weight, 8: weight, 9: weight, 10: weight, 11: weight, 12: weight, 13: weight, 14: weight, 15: weight, } elif weight_type == "style and composition": if is_sdxl: weight = {3: weight_composition, 6: weight} else: weight = { 0: weight, 1: weight, 2: weight, 3: weight, 4: weight_composition * 0.25, 5: weight_composition, 9: weight, 10: weight, 11: weight, 12: weight, 13: weight, 14: weight, 15: weight, } elif weight_type == "strong style and composition": if is_sdxl: weight = { 0: weight, 1: weight, 2: weight, 3: weight_composition, 4: weight, 5: weight, 6: weight, 7: weight, 8: weight, 9: weight, 10: weight, } else: weight = { 0: weight, 1: weight, 2: weight, 3: weight, 4: weight_composition, 5: weight_composition, 6: weight, 7: weight, 8: weight, 9: weight, 10: weight, 11: weight, 12: weight, 13: weight, 14: weight, 15: weight, } else: raise Exception("Unrecognized weight type!") if isinstance(weight, dict): return weight.get(t_idx, 0.0) else: return weight ================================================ FILE: scripts/logging.py ================================================ import logging import copy import sys from modules import shared class ColoredFormatter(logging.Formatter): COLORS = { "DEBUG": "\033[0;36m", # CYAN "INFO": "\033[0;32m", # GREEN "WARNING": "\033[0;33m", # YELLOW "ERROR": "\033[0;31m", # RED "CRITICAL": "\033[0;37;41m", # WHITE ON RED "RESET": "\033[0m", # RESET COLOR } def format(self, record): colored_record = copy.copy(record) levelname = colored_record.levelname seq = self.COLORS.get(levelname, self.COLORS["RESET"]) colored_record.levelname = f"{seq}{levelname}{self.COLORS['RESET']}" return super().format(colored_record) # Create a new logger logger = logging.getLogger("ControlNet") logger.propagate = False # Add handler if we don't have one. if not logger.handlers: handler = logging.StreamHandler(sys.stdout) handler.setFormatter( ColoredFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") ) logger.addHandler(handler) # Configure logger loglevel_string = getattr(shared.cmd_opts, "controlnet_loglevel", "INFO") loglevel = getattr(logging, loglevel_string.upper(), None) logger.setLevel(loglevel) ================================================ FILE: scripts/lvminthin.py ================================================ # High Quality Edge Thinning using Pure Python # Written by Lvmin Zhang # 2023 April # Stanford University # If you use this, please Cite "High Quality Edge Thinning using Pure Python", Lvmin Zhang, In Mikubill/sd-webui-controlnet. import cv2 import numpy as np lvmin_kernels_raw = [ np.array([ [-1, -1, -1], [0, 1, 0], [1, 1, 1] ], dtype=np.int32), np.array([ [0, -1, -1], [1, 1, -1], [0, 1, 0] ], dtype=np.int32) ] lvmin_kernels = [] lvmin_kernels += [np.rot90(x, k=0, axes=(0, 1)) for x in lvmin_kernels_raw] lvmin_kernels += [np.rot90(x, k=1, axes=(0, 1)) for x in lvmin_kernels_raw] lvmin_kernels += [np.rot90(x, k=2, axes=(0, 1)) for x in lvmin_kernels_raw] lvmin_kernels += [np.rot90(x, k=3, axes=(0, 1)) for x in lvmin_kernels_raw] lvmin_prunings_raw = [ np.array([ [-1, -1, -1], [-1, 1, -1], [0, 0, -1] ], dtype=np.int32), np.array([ [-1, -1, -1], [-1, 1, -1], [-1, 0, 0] ], dtype=np.int32) ] lvmin_prunings = [] lvmin_prunings += [np.rot90(x, k=0, axes=(0, 1)) for x in lvmin_prunings_raw] lvmin_prunings += [np.rot90(x, k=1, axes=(0, 1)) for x in lvmin_prunings_raw] lvmin_prunings += [np.rot90(x, k=2, axes=(0, 1)) for x in lvmin_prunings_raw] lvmin_prunings += [np.rot90(x, k=3, axes=(0, 1)) for x in lvmin_prunings_raw] def remove_pattern(x, kernel): objects = cv2.morphologyEx(x, cv2.MORPH_HITMISS, kernel) objects = np.where(objects > 127) x[objects] = 0 return x, objects[0].shape[0] > 0 def thin_one_time(x, kernels): y = x is_done = True for k in kernels: y, has_update = remove_pattern(y, k) if has_update: is_done = False return y, is_done def lvmin_thin(x, prunings=True): y = x for i in range(32): y, is_done = thin_one_time(y, lvmin_kernels) if is_done: break if prunings: y, _ = thin_one_time(y, lvmin_prunings) return y def nake_nms(x): f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8) f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8) f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8) f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8) y = np.zeros_like(x) for f in [f1, f2, f3, f4]: np.putmask(y, cv2.dilate(x, kernel=f) == x, x) return y ================================================ FILE: scripts/movie2movie.py ================================================ import copy import os import shutil import cv2 import gradio as gr import modules.scripts as scripts from modules import images from modules.processing import process_images from modules.shared import opts from PIL import Image import numpy as np _BASEDIR = "/controlnet-m2m" _BASEFILE = "animation" def get_all_frames(video_path): if video_path is None: return None cap = cv2.VideoCapture(video_path) frame_list = [] if not cap.isOpened(): return while True: ret, frame = cap.read() if ret: frame_list.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) else: return frame_list def get_min_frame_num(video_list): min_frame_num = -1 for video in video_list: if video is None: continue else: frame_num = len(video) print(frame_num) if min_frame_num < 0: min_frame_num = frame_num elif frame_num < min_frame_num: min_frame_num = frame_num return min_frame_num def pil2cv(image): new_image = np.array(image, dtype=np.uint8) if new_image.ndim == 2: pass elif new_image.shape[2] == 3: new_image = new_image[:, :, ::-1] elif new_image.shape[2] == 4: new_image = new_image[:, :, [2, 1, 0, 3]] return new_image def save_gif(path, image_list, name, duration): tmp_dir = path + "/tmp/" if os.path.isdir(tmp_dir): shutil.rmtree(tmp_dir) os.mkdir(tmp_dir) for i, image in enumerate(image_list): images.save_image(image, tmp_dir, f"output_{i}") os.makedirs(f"{path}{_BASEDIR}", exist_ok=True) image_list[0].save(f"{path}{_BASEDIR}/{name}.gif", save_all=True, append_images=image_list[1:], optimize=False, duration=duration, loop=0) class Script(scripts.Script): def title(self): return "controlnet m2m" def show(self, is_img2img): return True def ui(self, is_img2img): # How the script's is displayed in the UI. See https://gradio.app/docs/#components # for the different UI components you can use and how to create them. # Most UI components can return a value, such as a boolean for a checkbox. # The returned values are passed to the run method as parameters. ctrls_group = () max_models = opts.data.get("control_net_unit_count", 3) with gr.Group(): with gr.Accordion("ControlNet-M2M", open = False): duration = gr.Slider(label=f"Duration", value=50.0, minimum=10.0, maximum=200.0, step=10, interactive=True, elem_id='controlnet_movie2movie_duration_slider') with gr.Tabs(): for i in range(max_models): with gr.Tab(f"ControlNet-{i}"): with gr.TabItem("Movie Input"): ctrls_group += (gr.Video(format='mp4', source='upload', elem_id = f"video_{i}"), ) with gr.TabItem("Image Input"): ctrls_group += (gr.Image(source='upload', brush_radius=20, mirror_webcam=False, type='numpy', tool='sketch', elem_id=f'image_{i}'), ) ctrls_group += (gr.Checkbox(label=f"Save preprocessed", value=False, elem_id = f"save_pre_{i}"),) ctrls_group += (duration,) return ctrls_group def run(self, p, *args): # This is where the additional processing is implemented. The parameters include # self, the model object "p" (a StableDiffusionProcessing class, see # processing.py), and the parameters returned by the ui method. # Custom functions can be defined here, and additional libraries can be imported # to be used in processing. The return value should be a Processed object, which is # what is returned by the process_images method. contents_num = opts.data.get("control_net_unit_count", 3) arg_num = 3 item_list = [] video_list = [] for input_set in [tuple(args[:contents_num * arg_num][i:i+3]) for i in range(0, len(args[:contents_num * arg_num]), arg_num)]: if input_set[0] is not None: item_list.append([get_all_frames(input_set[0]), "video"]) video_list.append(get_all_frames(input_set[0])) if input_set[1] is not None: item_list.append([cv2.cvtColor(pil2cv(input_set[1]["image"]), cv2.COLOR_BGRA2RGB), "image"]) save_pre = list(args[2:contents_num * arg_num:3]) item_num = len(item_list) video_num = len(video_list) duration, = args[contents_num * arg_num:] frame_num = get_min_frame_num(video_list) if frame_num > 0: output_image_list = [] pre_output_image_list = [] for i in range(item_num): pre_output_image_list.append([]) for frame in range(frame_num): copy_p = copy.copy(p) copy_p.control_net_input_image = [] for item in item_list: if item[1] == "video": copy_p.control_net_input_image.append(item[0][frame]) elif item[1] == "image": copy_p.control_net_input_image.append(item[0]) else: continue proc = process_images(copy_p) img = proc.images[0] output_image_list.append(img) for i in range(len(save_pre)): if save_pre[i]: try: pre_output_image_list[i].append(proc.images[i + 1]) except: print(f"proc.images[{i} failed") copy_p.close() # filename format is seq-seed-animation.gif seq is 5 places left filled with 0 seq = images.get_next_sequence_number(f"{p.outpath_samples}{_BASEDIR}", "") filename = f"{seq:05}-{proc.seed}-{_BASEFILE}" save_gif(p.outpath_samples, output_image_list, filename, duration) proc.images = [f"{p.outpath_samples}{_BASEDIR}/{filename}.gif"] for i in range(len(save_pre)): if save_pre[i]: # control files add -controlX.gif where X is the controlnet number save_gif(p.outpath_samples, pre_output_image_list[i], f"{filename}-control{i}", duration) proc.images.append(f"{p.outpath_samples}{_BASEDIR}/{filename}-control{i}.gif") else: proc = process_images(p) return proc ================================================ FILE: scripts/preprocessor/__init__.py ================================================ from .teed import * from .pulid import * from .inpaint import * from .lama_inpaint import * from .ip_adapter_auto import * from .normal_dsine import * from .model_free_preprocessors import * from .legacy.legacy_preprocessors import * from .mobile_sam import * ================================================ FILE: scripts/preprocessor/inpaint.py ================================================ from scripts.utils import visualize_inpaint_mask from ..supported_preprocessor import Preprocessor, PreprocessorParameter class PreprocessorInpaint(Preprocessor): def __init__(self): super().__init__(name="inpaint") self._label = "inpaint_global_harmonious" self.tags = ["Inpaint"] self.slider_resolution = PreprocessorParameter(visible=False) self.sorting_priority = 0 self.accepts_mask = True self.requires_mask = True def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): return Preprocessor.Result( value=input_image, display_images=visualize_inpaint_mask(input_image)[None, :, :, :], ) class PreprocessorInpaintOnly(Preprocessor): def __init__(self): super().__init__(name="inpaint_only") self.tags = ["Inpaint"] self.slider_resolution = PreprocessorParameter(visible=False) self.sorting_priority = 100 self.accepts_mask = True self.requires_mask = True def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): return Preprocessor.Result( value=input_image, display_images=visualize_inpaint_mask(input_image)[None, :, :, :], ) Preprocessor.add_supported_preprocessor(PreprocessorInpaint()) Preprocessor.add_supported_preprocessor(PreprocessorInpaintOnly()) ================================================ FILE: scripts/preprocessor/ip_adapter_auto.py ================================================ from ..ipadapter.presets import IPAdapterPreset from ..supported_preprocessor import Preprocessor from ..logging import logger class PreprocessorIPAdapterAuto(Preprocessor): def __init__(self): super().__init__(name="ip-adapter-auto") self.tags = ["IP-Adapter"] self.sorting_priority = 1000 self.returns_image = False self.show_control_mode = False @staticmethod def get_preprocessor_by_model(model): module: str = IPAdapterPreset.match_model(model).module return Preprocessor.get_preprocessor(module) def __call__(self, *args, **kwargs): assert "model" in kwargs model: str = kwargs["model"] p = PreprocessorIPAdapterAuto.get_preprocessor_by_model(model) logger.info(f"ip-adapter-auto => {p.label}") assert p is not None return p(*args, **kwargs) Preprocessor.add_supported_preprocessor(PreprocessorIPAdapterAuto()) ================================================ FILE: scripts/preprocessor/lama_inpaint.py ================================================ import cv2 import numpy as np from ..supported_preprocessor import Preprocessor, PreprocessorParameter from ..utils import resize_image_with_pad, visualize_inpaint_mask class PreprocessorLamaInpaint(Preprocessor): def __init__(self): super().__init__(name="inpaint_only+lama") self.tags = ["Inpaint"] self.returns_image = True self.model = None self.slider_resolution = PreprocessorParameter(visible=False) self.accepts_mask = True self.requires_mask = True def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): img = input_image H, W, C = img.shape assert C == 4, "No mask is provided!" raw_color = img[:, :, 0:3].copy() raw_mask = img[:, :, 3:4].copy() res = 256 # Always use 256 since lama is trained on 256 img_res, remove_pad = resize_image_with_pad(img, res) if self.model is None: from annotator.lama import LamaInpainting self.model = LamaInpainting() # applied auto inversion prd_color = self.model(img_res) prd_color = remove_pad(prd_color) prd_color = cv2.resize(prd_color, (W, H)) alpha = raw_mask.astype(np.float32) / 255.0 fin_color = prd_color.astype(np.float32) * alpha + raw_color.astype( np.float32 ) * (1 - alpha) fin_color = fin_color.clip(0, 255).astype(np.uint8) result = np.concatenate([fin_color, raw_mask], axis=2) return Preprocessor.Result( value=result, display_images=[ result[:, :, :3], visualize_inpaint_mask(result), ], ) Preprocessor.add_supported_preprocessor(PreprocessorLamaInpaint()) ================================================ FILE: scripts/preprocessor/legacy/legacy_preprocessors.py ================================================ # This is a python script to convert all old preprocessors to new format. # However, the old preprocessors are not very memory effective # and eventually we should move all old preprocessors to new format manually # see also the forge_preprocessor_normalbae/scripts/preprocessor_normalbae for # how to make better implementation of preprocessors. # No newer preprocessors should be written in this legacy way. # Never add new leagcy preprocessors please. # The new forge_preprocessor_normalbae/scripts/preprocessor_normalbae # is much more effective and maintainable from annotator.util import HWC3 from .preprocessor_compiled import legacy_preprocessors from ...supported_preprocessor import Preprocessor, PreprocessorParameter ### # This file has lots of unreasonable historical designs and should be viewed as a frozen blackbox library. # If you want to add preprocessor, # please instead look at `extensions-builtin/forge_preprocessor_normalbae/scripts/preprocessor_normalbae` # If you want to use preprocessor, # please instead use `from modules_forge.shared import supported_preprocessors` # and then use any preprocessor like: depth_midas = supported_preprocessors['depth_midas'] # Please do not hack/edit/modify/rely-on any codes in this file. # Never use methods in this file to add anything! # This file will be eventually removed but the workload is super high and we need more time to do this. ### class LegacyPreprocessor(Preprocessor): def __init__(self, name: str, legacy_dict): super().__init__(name) self._label = legacy_dict["label"] self.call_function = legacy_dict["call_function"] self.unload_function = legacy_dict["unload_function"] self.managed_model = legacy_dict["managed_model"] self.do_not_need_model = legacy_dict["model_free"] self.show_control_mode = not legacy_dict["no_control_mode"] self.sorting_priority = legacy_dict["priority"] self.tags = legacy_dict["tags"] self.returns_image = legacy_dict.get("returns_image", True) self.accepts_mask = legacy_dict.get("accepts_mask", False) self.requires_mask = legacy_dict.get("requires_mask", False) if legacy_dict.get("use_soft_projection_in_hr_fix", False): self.use_soft_projection_in_hr_fix = True if legacy_dict["resolution"] is None: self.resolution = PreprocessorParameter(visible=False) else: legacy_dict["resolution"]["label"] = "Resolution" legacy_dict["resolution"]["step"] = 8 self.resolution = PreprocessorParameter( **legacy_dict["resolution"], visible=True ) if legacy_dict["slider_1"] is None: self.slider_1 = PreprocessorParameter(visible=False) else: self.slider_1 = PreprocessorParameter( **legacy_dict["slider_1"], visible=True ) if legacy_dict["slider_2"] is None: self.slider_2 = PreprocessorParameter(visible=False) else: self.slider_2 = PreprocessorParameter( **legacy_dict["slider_2"], visible=True ) if legacy_dict["slider_3"] is None: self.slider_3 = PreprocessorParameter(visible=False) else: self.slider_3 = PreprocessorParameter( **legacy_dict["slider_3"], visible=True ) self.active_with_model = False def unload(self): if self.active_with_model: self.unload_function() self.active_with_model = False return True return False def __call__( self, input_image, resolution=512, slider_1=None, slider_2=None, slider_3=None, **kwargs ): # Legacy Preprocessors does not have slider 3 del slider_3 result, is_image = self.call_function( img=input_image, res=resolution, thr_a=slider_1, thr_b=slider_2, **kwargs ) if self.managed_model is not None: self.active_with_model = True if is_image: result = HWC3(result) return result for name, data in legacy_preprocessors.items(): p = LegacyPreprocessor(name, data) Preprocessor.add_supported_preprocessor(p) ================================================ FILE: scripts/preprocessor/legacy/preprocessor_compiled.py ================================================ from .processor import * # noqa import functools legacy_preprocessors = { # "none": { # "label": "none", # "call_function": lambda x, *args, **kwargs: (x, True), # "unload_function": None, # "managed_model": None, # "model_free": False, # "no_control_mode": False, # "resolution": None, # "slider_1": None, # "slider_2": None, # "slider_3": None, # "priority": 100, # "tags": [] # }, # "invert": { # "label": "invert (from white bg & black line)", # "call_function": invert, # "unload_function": None, # "managed_model": None, # "model_free": False, # "no_control_mode": False, # "resolution": None, # "slider_1": None, # "slider_2": None, # "slider_3": None, # "priority": 20, # "tags": [ # "Canny", "Lineart", "Scribble", "MLSD", # ] # }, "animal_openpose": { "label": "animal_openpose", "call_function": functools.partial(g_openpose_model.run_model, include_body=True, include_hand=False, include_face=False, use_animal_pose=True), "unload_function": g_openpose_model.unload, "managed_model": "g_openpose_model", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "OpenPose" ] }, # "blur_gaussian": { # "label": "blur_gaussian", # "call_function": blur_gaussian, # "unload_function": None, # "managed_model": None, # "model_free": False, # "no_control_mode": False, # "resolution": { # "label": "Resolution", # "value": 512, # "minimum": 64, # "maximum": 2048 # }, # "slider_1": { # "label": "Sigma", # "minimum": 0.01, # "maximum": 64.0, # "value": 9.0 # }, # "slider_2": None, # "slider_3": None, # "priority": 0, # "tags": [ # "Tile", # ] # }, # "canny": { # "label": "canny", # "call_function": canny, # "unload_function": None, # "managed_model": "model_canny", # "model_free": False, # "no_control_mode": False, # "resolution": { # "label": "Resolution", # "value": 512, # "minimum": 64, # "maximum": 2048 # }, # "slider_1": { # "label": "Canny Low Threshold", # "value": 100, # "minimum": 1, # "maximum": 255 # }, # "slider_2": { # "label": "Canny High Threshold", # "value": 200, # "minimum": 1, # "maximum": 255 # }, # "slider_3": None, # "priority": 100, # "tags": [ # "Canny" # ] # }, "densepose": { "label": "densepose (pruple bg & purple torso)", "call_function": functools.partial(densepose, cmap="viridis"), "unload_function": unload_densepose, "managed_model": "unknown", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "OpenPose" ] }, "densepose_parula": { "label": "densepose_parula (black bg & blue torso)", "call_function": functools.partial(densepose, cmap="parula"), "unload_function": unload_densepose, "managed_model": "unknown", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "OpenPose" ] }, "depth_anything": { "label": "depth_anything", "call_function": functools.partial(depth_anything, colored=False), "unload_function": unload_depth_anything, "managed_model": "model_depth_anything", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Depth" ] }, "depth_anything_v2": { "label": "depth_anything_v2", "call_function": functools.partial(depth_anything_v2, colored=False), "unload_function": unload_depth_anything_v2, "managed_model": "model_depth_anything_v2", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Depth" ] }, "depth_hand_refiner": { "label": "depth_hand_refiner", "call_function": g_hand_refiner_model.run_model, "unload_function": g_hand_refiner_model.unload, "managed_model": "g_hand_refiner_model", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "value": 512, "minimum": 64, "maximum": 2048 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Depth" ] }, "depth_leres": { "label": "depth_leres", "call_function": functools.partial(leres, boost=False), "unload_function": unload_leres, "managed_model": "model_leres", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": { "label": "Remove Near %", "minimum": 0, "maximum": 100, "value": 0, "step": 0.1 }, "slider_2": { "label": "Remove Background %", "minimum": 0, "maximum": 100, "value": 0, "step": 0.1 }, "slider_3": None, "priority": 0, "tags": [ "Depth" ] }, "depth_leres++": { "label": "depth_leres++", "call_function": functools.partial(leres, boost=True), "unload_function": unload_leres, "managed_model": "model_leres", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": { "label": "Remove Near %", "minimum": 0, "maximum": 100, "value": 0, "step": 0.1 }, "slider_2": { "label": "Remove Background %", "minimum": 0, "maximum": 100, "value": 0, "step": 0.1 }, "slider_3": None, "priority": 0, "tags": [ "Depth" ] }, "depth": { "label": "depth_midas", "call_function": midas, "unload_function": unload_midas, "managed_model": "model_midas", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "Depth" ] }, "depth_zoe": { "label": "depth_zoe", "call_function": zoe_depth, "unload_function": unload_zoe_depth, "managed_model": "model_zoe_depth", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Depth" ] }, "dw_openpose_full": { "label": "dw_openpose_full", "call_function": functools.partial(g_openpose_model.run_model, include_body=True, include_hand=True, include_face=True, use_dw_pose=True), "unload_function": g_openpose_model.unload, "managed_model": 'g_openpose_model', "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "OpenPose" ] }, # "inpaint": { # "label": "inpaint_global_harmonious", # "call_function": identity, # "unload_function": None, # "managed_model": None, # "model_free": False, # "no_control_mode": False, # "resolution": None, # "slider_1": None, # "slider_2": None, # "slider_3": None, # "priority": 0, # "tags": [ # "Inpaint" # ] # }, # "inpaint_only": { # "label": "inpaint_only", # "call_function": identity, # "unload_function": None, # "managed_model": None, # "model_free": False, # "no_control_mode": False, # "resolution": None, # "slider_1": None, # "slider_2": None, # "slider_3": None, # "priority": 100, # "tags": [ # "Inpaint" # ] # }, # "inpaint_only+lama": { # "label": "inpaint_only+lama", # "call_function": lama_inpaint, # "unload_function": unload_lama_inpaint, # "managed_model": "model_lama", # "model_free": False, # "no_control_mode": False, # "resolution": None, # "slider_1": None, # "slider_2": None, # "slider_3": None, # "priority": 0, # "tags": [ # "Inpaint" # ] # }, "instant_id_face_embedding": { "label": "instant_id_face_embedding", "call_function": functools.partial(g_insight_face_instant_id_model.run_model_instant_id, return_keypoints=False), "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Instant-ID" ], "returns_image": False, }, "instant_id_face_keypoints": { "label": "instant_id_face_keypoints", "call_function": functools.partial(g_insight_face_instant_id_model.run_model_instant_id, return_keypoints=True), "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Instant-ID" ] }, "ip-adapter_clip_sd15": { "label": "ip-adapter_clip_h", "call_function": functools.partial(clip, config='clip_h'), "unload_function": functools.partial(unload_clip, config='clip_h'), "managed_model": "unknown", "model_free": False, "no_control_mode": True, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 100, "accepts_mask": True, # CLIP mask "tags": [ "IP-Adapter" ], "returns_image": False, }, "ip-adapter_clip_sdxl": { "label": "ip-adapter_clip_g", "call_function": functools.partial(clip, config='clip_g'), "unload_function": functools.partial(unload_clip, config='clip_g'), "managed_model": "unknown", "model_free": False, "no_control_mode": True, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "accepts_mask": True, # CLIP mask "tags": [ "IP-Adapter" ], "returns_image": False, }, "ip-adapter_clip_sdxl_plus_vith": { "label": "ip-adapter_clip_sdxl_plus_vith", "call_function": functools.partial(clip, config='clip_h'), "unload_function": functools.partial(unload_clip, config='clip_h'), "managed_model": "unknown", "model_free": False, "no_control_mode": True, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "accepts_mask": True, # CLIP mask "tags": [ "IP-Adapter" ], "returns_image": False, }, "ip-adapter_face_id": { "label": "ip-adapter_face_id", "call_function": g_insight_face_model.run_model, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": True, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "IP-Adapter" ], "returns_image": False, }, "ip-adapter_face_id_plus": { "label": "ip-adapter_face_id_plus", "call_function": face_id_plus, "unload_function": functools.partial(unload_clip, config='clip_h'), "managed_model": "unknown", "model_free": False, "no_control_mode": True, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "IP-Adapter" ], "returns_image": False, }, "lineart_anime": { "label": "lineart_anime", "call_function": lineart_anime, "unload_function": unload_lineart_anime, "managed_model": "model_lineart_anime", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Lineart" ] }, "lineart_anime_denoise": { "label": "lineart_anime_denoise", "call_function": lineart_anime_denoise, "unload_function": unload_lineart_anime_denoise, "managed_model": "model_manga_line", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Lineart" ] }, "lineart_coarse": { "label": "lineart_coarse", "call_function": lineart_coarse, "unload_function": unload_lineart_coarse, "managed_model": "model_lineart_coarse", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Lineart" ] }, "lineart": { "label": "lineart_realistic", "call_function": lineart, "unload_function": unload_lineart, "managed_model": "model_lineart", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Lineart" ] }, "lineart_standard": { "label": "lineart_standard (from white bg & black line)", "call_function": lineart_standard, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "Lineart" ] }, "mediapipe_face": { "label": "mediapipe_face", "call_function": mediapipe_face, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "value": 512, "minimum": 64, "maximum": 2048 }, "slider_1": { "label": "Max Faces", "value": 1, "minimum": 1, "maximum": 10, "step": 1 }, "slider_2": { "label": "Min Face Confidence", "value": 0.5, "minimum": 0.01, "maximum": 1.0, "step": 0.01 }, "slider_3": None, "priority": 0, "tags": [] }, "mlsd": { "label": "mlsd", "call_function": mlsd, "unload_function": unload_mlsd, "managed_model": "model_mlsd", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": { "label": "MLSD Value Threshold", "minimum": 0.01, "maximum": 2.0, "value": 0.1, "step": 0.01 }, "slider_2": { "label": "MLSD Distance Threshold", "minimum": 0.01, "maximum": 20.0, "value": 0.1, "step": 0.01 }, "slider_3": None, "priority": 100, "tags": [ "MLSD" ], "use_soft_projection_in_hr_fix": True }, "normal_bae": { "label": "normal_bae", "call_function": normal_bae, "unload_function": unload_normal_bae, "managed_model": "model_normal_bae", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "NormalMap" ] }, "normal_map": { "label": "normal_midas", "call_function": midas_normal, "unload_function": unload_midas, "managed_model": "model_midas", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": { "label": "Normal Background Threshold", "minimum": 0.0, "maximum": 1.0, "value": 0.4, "step": 0.01 }, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "NormalMap" ] }, "openpose": { "label": "openpose", "call_function": functools.partial(g_openpose_model.run_model, include_body=True, include_hand=False, include_face=False), "unload_function": g_openpose_model.unload, "managed_model": "g_openpose_model", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "OpenPose" ] }, "openpose_face": { "label": "openpose_face", "call_function": functools.partial(g_openpose_model.run_model, include_body=True, include_hand=False, include_face=True), "unload_function": g_openpose_model.unload, "managed_model": "g_openpose_model", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "OpenPose" ] }, "openpose_faceonly": { "label": "openpose_faceonly", "call_function": functools.partial(g_openpose_model.run_model, include_body=False, include_hand=False, include_face=True), "unload_function": g_openpose_model.unload, "managed_model": "g_openpose_model", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "OpenPose" ] }, "openpose_full": { "label": "openpose_full", "call_function": functools.partial(g_openpose_model.run_model, include_body=True, include_hand=True, include_face=True), "unload_function": g_openpose_model.unload, "managed_model": "g_openpose_model", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "OpenPose" ] }, "openpose_hand": { "label": "openpose_hand", "call_function": functools.partial(g_openpose_model.run_model, include_body=True, include_hand=True, include_face=False), "unload_function": g_openpose_model.unload, "managed_model": "g_openpose_model", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "OpenPose" ] }, "recolor_intensity": { "label": "recolor_intensity", "call_function": recolor_intensity, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": { "label": "Gamma Correction", "value": 1.0, "minimum": 0.1, "maximum": 2.0, "step": 0.001 }, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Recolor" ] }, "recolor_luminance": { "label": "recolor_luminance", "call_function": recolor_luminance, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": { "label": "Gamma Correction", "value": 1.0, "minimum": 0.1, "maximum": 2.0, "step": 0.001 }, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "Recolor" ] }, "reference_adain": { "label": "reference_adain", "call_function": identity, "unload_function": None, "managed_model": None, "model_free": True, "no_control_mode": False, "resolution": None, "slider_1": { "label": "Style Fidelity (only for Balanced mode)", "value": 0.5, "minimum": 0.0, "maximum": 1.0, "step": 0.01 }, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Reference" ] }, "reference_adain+attn": { "label": "reference_adain+attn", "call_function": identity, "unload_function": None, "managed_model": None, "model_free": True, "no_control_mode": False, "resolution": None, "slider_1": { "label": "Style Fidelity (only for Balanced mode)", "value": 0.5, "minimum": 0.0, "maximum": 1.0, "step": 0.01 }, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Reference" ] }, "reference_only": { "label": "reference_only", "call_function": identity, "unload_function": None, "managed_model": None, "model_free": True, "no_control_mode": False, "resolution": None, "slider_1": { "label": "Style Fidelity (only for Balanced mode)", "value": 0.5, "minimum": 0.0, "maximum": 1.0, "step": 0.01 }, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "Reference" ] }, "revision_clipvision": { "label": "revision_clipvision", "call_function": functools.partial(clip, config='clip_g'), "unload_function": functools.partial(unload_clip, config='clip_g'), "managed_model": None, "model_free": True, "no_control_mode": True, "resolution": None, "slider_1": { "label": "Noise Augmentation", "value": 0.0, "minimum": 0.0, "maximum": 1.0 }, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "Revision" ], "returns_image": False, }, "revision_ignore_prompt": { "label": "revision_ignore_prompt", "call_function": functools.partial(clip, config='clip_g'), "unload_function": functools.partial(unload_clip, config='clip_g'), "managed_model": None, "model_free": True, "no_control_mode": True, "resolution": None, "slider_1": { "label": "Noise Augmentation", "value": 0.0, "minimum": 0.0, "maximum": 1.0 }, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Revision" ], "returns_image": False, }, "scribble_hed": { "label": "scribble_hed", "call_function": scribble_hed, "unload_function": unload_hed, "managed_model": "model_hed", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Scribble", "SparseCtrl", ] }, "pidinet_scribble": { "label": "scribble_pidinet", "call_function": scribble_pidinet, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "Scribble", "SparseCtrl", ] }, # "scribble_xdog": { # "label": "scribble_xdog", # "call_function": scribble_xdog, # "unload_function": None, # "managed_model": None, # "model_free": False, # "no_control_mode": False, # "resolution": { # "label": "Resolution", # "value": 512, # "minimum": 64, # "maximum": 2048 # }, # "slider_1": { # "label": "XDoG Threshold", # "minimum": 1, # "maximum": 64, # "value": 32 # }, # "slider_2": None, # "slider_3": None, # "priority": 0, # "tags": [ # "Scribble", # ] # }, "anime_face_segment": { "label": "seg_anime_face", "call_function": anime_face_segment, "unload_function": unload_anime_face_segment, "managed_model": "model_anime_face_segment", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "value": 512, "minimum": 64, "maximum": 2048 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Segmentation" ] }, "oneformer_ade20k": { "label": "seg_ofade20k", "call_function": oneformer_ade20k, "unload_function": unload_oneformer_ade20k, "managed_model": "model_oneformer_ade20k", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "Segmentation" ] }, "oneformer_coco": { "label": "seg_ofcoco", "call_function": oneformer_coco, "unload_function": unload_oneformer_coco, "managed_model": "model_oneformer_coco", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Segmentation" ] }, "segmentation": { "label": "seg_ufade20k", "call_function": uniformer, "unload_function": unload_uniformer, "managed_model": "model_uniformer", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Segmentation" ] }, # "shuffle": { # "label": "shuffle", # "call_function": shuffle, # "unload_function": None, # "managed_model": None, # "model_free": False, # "no_control_mode": False, # "resolution": None, # "slider_1": None, # "slider_2": None, # "slider_3": None, # "priority": 100, # "tags": [ # "Shuffle" # ] # }, "hed": { "label": "softedge_hed", "call_function": hed, "unload_function": unload_hed, "managed_model": "model_hed", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "SoftEdge" ] }, "hed_safe": { "label": "softedge_hedsafe", "call_function": hed_safe, "unload_function": unload_hed, "managed_model": "model_hed", "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "minimum": 64, "maximum": 2048, "value": 512 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "SoftEdge" ] }, "pidinet": { "label": "softedge_pidinet", "call_function": pidinet, "unload_function": unload_pidinet, "managed_model": "model_pidinet", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "SoftEdge" ] }, "pidinet_safe": { "label": "softedge_pidisafe", "call_function": pidinet_safe, "unload_function": unload_pidinet, "managed_model": "model_pidinet", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "SoftEdge" ] }, # "te_hed": { # "label": "softedge_teed", # "call_function": te_hed, # "unload_function": unload_te_hed, # "managed_model": "model_te_hed", # "model_free": False, # "no_control_mode": False, # "resolution": { # "label": "Resolution", # "value": 512, # "minimum": 64, # "maximum": 2048 # }, # "slider_1": { # "label": "Safe Steps", # "minimum": 0, # "maximum": 10, # "value": 2, # "step": 1 # }, # "slider_2": None, # "slider_3": None, # "priority": 0, # "tags": [ # "SoftEdge" # ] # }, "color": { "label": "t2ia_color_grid", "call_function": color, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "value": 512, "minimum": 64, "maximum": 2048 }, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "T2I-Adapter" ] }, "pidinet_sketch": { "label": "t2ia_sketch_pidi", "call_function": pidinet_ts, "unload_function": unload_pidinet, "managed_model": "model_pidinet", "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "T2I-Adapter" ] }, "clip_vision": { "label": "t2ia_style_clipvision", "call_function": functools.partial(clip, config='clip_vitl'), "unload_function": functools.partial(unload_clip, config='clip_vitl'), "managed_model": "unknown", "model_free": False, "no_control_mode": True, "resolution": None, "slider_1": None, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "T2I-Adapter" ], "returns_image": False, }, "threshold": { "label": "threshold", "call_function": threshold, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": { "label": "Resolution", "value": 512, "minimum": 64, "maximum": 2048 }, "slider_1": { "label": "Binarization Threshold", "minimum": 0, "maximum": 255, "value": 127 }, "slider_2": None, "slider_3": None, "priority": 0, "tags": [] }, "tile_colorfix": { "label": "tile_colorfix", "call_function": identity, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": { "label": "Variation", "value": 8.0, "minimum": 3.0, "maximum": 32.0, "step": 1.0 }, "slider_2": None, "slider_3": None, "priority": 0, "tags": [ "Tile", ] }, "tile_colorfix+sharp": { "label": "tile_colorfix+sharp", "call_function": identity, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": { "label": "Variation", "value": 8.0, "minimum": 3.0, "maximum": 32.0, "step": 1.0 }, "slider_2": { "label": "Sharpness", "value": 1.0, "minimum": 0.0, "maximum": 2.0, "step": 0.01 }, "slider_3": None, "priority": 0, "tags": [ "Tile", ] }, "tile_resample": { "label": "tile_resample", "call_function": tile_resample, "unload_function": None, "managed_model": None, "model_free": False, "no_control_mode": False, "resolution": None, "slider_1": { "label": "Down Sampling Rate", "value": 1.0, "minimum": 1.0, "maximum": 8.0, "step": 0.01 }, "slider_2": None, "slider_3": None, "priority": 100, "tags": [ "Tile", ] } } ================================================ FILE: scripts/preprocessor/legacy/processor.py ================================================ import os import cv2 import numpy as np import torch import math from dataclasses import dataclass from transformers.models.clip.modeling_clip import CLIPVisionModelOutput from typing import Callable, Tuple, Union from modules.safe import Extra from modules import devices from annotator.util import HWC3 from scripts.logging import logger def torch_handler(module: str, name: str): """ Allow all torch access. Bypass A1111 safety whitelist. """ if module == 'torch': return getattr(torch, name) if module == 'torch._tensor': # depth_anything dep. return getattr(torch._tensor, name) def pad64(x): return int(np.ceil(float(x) / 64.0) * 64 - x) def safer_memory(x): # Fix many MAC/AMD problems return np.ascontiguousarray(x.copy()).copy() def resize_image_with_pad(input_image, resolution, skip_hwc3=False): if skip_hwc3: img = input_image else: img = HWC3(input_image) H_raw, W_raw, _ = img.shape k = float(resolution) / float(min(H_raw, W_raw)) interpolation = cv2.INTER_CUBIC if k > 1 else cv2.INTER_AREA H_target = int(np.round(float(H_raw) * k)) W_target = int(np.round(float(W_raw) * k)) img = cv2.resize(img, (W_target, H_target), interpolation=interpolation) H_pad, W_pad = pad64(H_target), pad64(W_target) img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode='edge') def remove_pad(x): return safer_memory(x[:H_target, :W_target]) return safer_memory(img_padded), remove_pad def canny(img, res=512, thr_a=100, thr_b=200, **kwargs): img, remove_pad = resize_image_with_pad(img, res) result = cv2.Canny(img, thr_a, thr_b) return remove_pad(result), True def scribble_xdog(img, res=512, thr_a=32, **kwargs): img, remove_pad = resize_image_with_pad(img, res) g1 = cv2.GaussianBlur(img.astype(np.float32), (0, 0), 0.5) g2 = cv2.GaussianBlur(img.astype(np.float32), (0, 0), 5.0) dog = (255 - np.min(g2 - g1, axis=2)).clip(0, 255).astype(np.uint8) result = np.zeros_like(img, dtype=np.uint8) result[2 * (255 - dog) > thr_a] = 255 return remove_pad(result), True def tile_resample(img, res=512, thr_a=1.0, **kwargs): img = HWC3(img) if thr_a < 1.1: return img, True H, W, C = img.shape H = int(float(H) / float(thr_a)) W = int(float(W) / float(thr_a)) img = cv2.resize(img, (W, H), interpolation=cv2.INTER_AREA) return img, True def threshold(img, res=512, thr_a=127, **kwargs): img, remove_pad = resize_image_with_pad(img, res) result = np.zeros_like(img, dtype=np.uint8) result[np.min(img, axis=2) > thr_a] = 255 return remove_pad(result), True def identity(img, **kwargs): return img, True def invert(img, res=512, **kwargs): return 255 - HWC3(img), True model_hed = None def hed(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_hed if model_hed is None: from annotator.hed import apply_hed model_hed = apply_hed result = model_hed(img) return remove_pad(result), True def hed_safe(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_hed if model_hed is None: from annotator.hed import apply_hed model_hed = apply_hed result = model_hed(img, is_safe=True) return remove_pad(result), True def unload_hed(): global model_hed if model_hed is not None: from annotator.hed import unload_hed_model unload_hed_model() def scribble_hed(img, res=512, **kwargs): result, _ = hed(img, res) import cv2 from annotator.util import nms result = nms(result, 127, 3.0) result = cv2.GaussianBlur(result, (0, 0), 3.0) result[result > 4] = 255 result[result < 255] = 0 return result, True model_mediapipe_face = None def mediapipe_face(img, res=512, thr_a: int = 10, thr_b: float = 0.5, **kwargs): max_faces = int(thr_a) min_confidence = thr_b img, remove_pad = resize_image_with_pad(img, res) global model_mediapipe_face if model_mediapipe_face is None: from annotator.mediapipe_face import apply_mediapipe_face model_mediapipe_face = apply_mediapipe_face result = model_mediapipe_face(img, max_faces=max_faces, min_confidence=min_confidence) return remove_pad(result), True model_mlsd = None def mlsd(img, res=512, thr_a=0.1, thr_b=0.1, **kwargs): thr_v, thr_d = thr_a, thr_b img, remove_pad = resize_image_with_pad(img, res) global model_mlsd if model_mlsd is None: from annotator.mlsd import apply_mlsd model_mlsd = apply_mlsd result = model_mlsd(img, thr_v, thr_d) return remove_pad(result), True def unload_mlsd(): global model_mlsd if model_mlsd is not None: from annotator.mlsd import unload_mlsd_model unload_mlsd_model() model_depth_anything = None def depth_anything(img, res:int = 512, colored:bool = True, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_depth_anything if model_depth_anything is None: with Extra(torch_handler): from annotator.depth_anything import DepthAnythingDetector device = devices.get_device_for("controlnet") model_depth_anything = DepthAnythingDetector(device) return remove_pad(model_depth_anything(img, colored=colored)), True def unload_depth_anything(): if model_depth_anything is not None: model_depth_anything.unload_model() model_depth_anything_v2 = None def depth_anything_v2(img, res:int = 512, colored:bool = True, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_depth_anything_v2 if model_depth_anything_v2 is None: with Extra(torch_handler): from annotator.depth_anything_v2 import DepthAnythingV2Detector device = devices.get_device_for("controlnet") model_depth_anything_v2 = DepthAnythingV2Detector(device) return remove_pad(model_depth_anything_v2(img, colored=colored)), True def unload_depth_anything_v2(): if model_depth_anything_v2 is not None: model_depth_anything_v2.unload_model() model_midas = None def midas(img, res=512, a=np.pi * 2.0, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_midas if model_midas is None: from annotator.midas import apply_midas model_midas = apply_midas result, _ = model_midas(img, a) return remove_pad(result), True def midas_normal(img, res=512, a=np.pi * 2.0, thr_a=0.4, **kwargs): # bg_th -> thr_a bg_th = thr_a img, remove_pad = resize_image_with_pad(img, res) global model_midas if model_midas is None: from annotator.midas import apply_midas model_midas = apply_midas _, result = model_midas(img, a, bg_th) return remove_pad(result), True def unload_midas(): global model_midas if model_midas is not None: from annotator.midas import unload_midas_model unload_midas_model() model_leres = None def leres(img, res=512, a=np.pi * 2.0, thr_a=0, thr_b=0, boost=False, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_leres if model_leres is None: from annotator.leres import apply_leres model_leres = apply_leres result = model_leres(img, thr_a, thr_b, boost=boost) return remove_pad(result), True def unload_leres(): global model_leres if model_leres is not None: from annotator.leres import unload_leres_model unload_leres_model() class OpenposeModel(object): def __init__(self) -> None: self.model_openpose = None def run_model( self, img: np.ndarray, include_body: bool, include_hand: bool, include_face: bool, use_dw_pose: bool = False, use_animal_pose: bool = False, json_pose_callback: Callable[[str], None] = None, res: int = 512, **kwargs # Ignore rest of kwargs ) -> Tuple[np.ndarray, bool]: """Run the openpose model. Returns a tuple of - result image - is_image flag The JSON format pose string is passed to `json_pose_callback`. """ if json_pose_callback is None: json_pose_callback = lambda x: None img, remove_pad = resize_image_with_pad(img, res) if self.model_openpose is None: from annotator.openpose import OpenposeDetector self.model_openpose = OpenposeDetector() return remove_pad(self.model_openpose( img, include_body=include_body, include_hand=include_hand, include_face=include_face, use_dw_pose=use_dw_pose, use_animal_pose=use_animal_pose, json_pose_callback=json_pose_callback )), True def unload(self): if self.model_openpose is not None: self.model_openpose.unload_model() g_openpose_model = OpenposeModel() model_uniformer = None def uniformer(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_uniformer if model_uniformer is None: from annotator.uniformer import apply_uniformer model_uniformer = apply_uniformer result = model_uniformer(img) return remove_pad(result), True def unload_uniformer(): global model_uniformer if model_uniformer is not None: from annotator.uniformer import unload_uniformer_model unload_uniformer_model() model_pidinet = None def pidinet(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_pidinet if model_pidinet is None: from annotator.pidinet import apply_pidinet model_pidinet = apply_pidinet result = model_pidinet(img) return remove_pad(result), True def pidinet_ts(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_pidinet if model_pidinet is None: from annotator.pidinet import apply_pidinet model_pidinet = apply_pidinet result = model_pidinet(img, apply_fliter=True) return remove_pad(result), True def pidinet_safe(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_pidinet if model_pidinet is None: from annotator.pidinet import apply_pidinet model_pidinet = apply_pidinet result = model_pidinet(img, is_safe=True) return remove_pad(result), True def scribble_pidinet(img, res=512, **kwargs): result, _ = pidinet(img, res) import cv2 from annotator.util import nms result = nms(result, 127, 3.0) result = cv2.GaussianBlur(result, (0, 0), 3.0) result[result > 4] = 255 result[result < 255] = 0 return result, True def unload_pidinet(): global model_pidinet if model_pidinet is not None: from annotator.pidinet import unload_pid_model unload_pid_model() clip_encoder = { 'clip_g': None, 'clip_h': None, 'clip_vitl': None, } def clip(img, res=512, config='clip_vitl', low_vram=False, **kwargs): global clip_encoder if clip_encoder[config] is None: from annotator.clipvision import ClipVisionDetector if low_vram: logger.info("Loading CLIP model on CPU.") clip_encoder[config] = ClipVisionDetector(config, low_vram) result = clip_encoder[config](img) return result, False def unload_clip(config='clip_vitl'): global clip_encoder if clip_encoder[config] is not None: clip_encoder[config].unload_model() clip_encoder[config] = None model_color = None def color(img, res=512, **kwargs): img = HWC3(img) global model_color if model_color is None: from annotator.color import apply_color model_color = apply_color result = model_color(img, res=res) return result, True def lineart_standard(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) x = img.astype(np.float32) g = cv2.GaussianBlur(x, (0, 0), 6.0) intensity = np.min(g - x, axis=2).clip(0, 255) intensity /= max(16, np.median(intensity[intensity > 8])) intensity *= 127 result = intensity.clip(0, 255).astype(np.uint8) return remove_pad(result), True model_lineart = None def lineart(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_lineart if model_lineart is None: from annotator.lineart import LineartDetector model_lineart = LineartDetector(LineartDetector.model_default) # applied auto inversion result = 255 - model_lineart(img) return remove_pad(result), True def unload_lineart(): global model_lineart if model_lineart is not None: model_lineart.unload_model() model_lineart_coarse = None def lineart_coarse(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_lineart_coarse if model_lineart_coarse is None: from annotator.lineart import LineartDetector model_lineart_coarse = LineartDetector(LineartDetector.model_coarse) # applied auto inversion result = 255 - model_lineart_coarse(img) return remove_pad(result), True def unload_lineart_coarse(): global model_lineart_coarse if model_lineart_coarse is not None: model_lineart_coarse.unload_model() model_lineart_anime = None def lineart_anime(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_lineart_anime if model_lineart_anime is None: from annotator.lineart_anime import LineartAnimeDetector model_lineart_anime = LineartAnimeDetector() # applied auto inversion result = 255 - model_lineart_anime(img) return remove_pad(result), True def unload_lineart_anime(): global model_lineart_anime if model_lineart_anime is not None: model_lineart_anime.unload_model() model_manga_line = None def lineart_anime_denoise(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_manga_line if model_manga_line is None: from annotator.manga_line import MangaLineExtration model_manga_line = MangaLineExtration() # applied auto inversion result = model_manga_line(img) return remove_pad(result), True def unload_lineart_anime_denoise(): global model_manga_line if model_manga_line is not None: model_manga_line.unload_model() model_zoe_depth = None def zoe_depth(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_zoe_depth if model_zoe_depth is None: from annotator.zoe import ZoeDetector model_zoe_depth = ZoeDetector() result = model_zoe_depth(img) return remove_pad(result), True def unload_zoe_depth(): global model_zoe_depth if model_zoe_depth is not None: model_zoe_depth.unload_model() model_normal_bae = None def normal_bae(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_normal_bae if model_normal_bae is None: from annotator.normalbae import NormalBaeDetector model_normal_bae = NormalBaeDetector() result = model_normal_bae(img) return remove_pad(result), True def unload_normal_bae(): global model_normal_bae if model_normal_bae is not None: model_normal_bae.unload_model() model_oneformer_coco = None def oneformer_coco(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_oneformer_coco if model_oneformer_coco is None: from annotator.oneformer import OneformerDetector model_oneformer_coco = OneformerDetector(OneformerDetector.configs["coco"]) result = model_oneformer_coco(img) return remove_pad(result), True def unload_oneformer_coco(): global model_oneformer_coco if model_oneformer_coco is not None: model_oneformer_coco.unload_model() model_oneformer_ade20k = None def oneformer_ade20k(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_oneformer_ade20k if model_oneformer_ade20k is None: from annotator.oneformer import OneformerDetector model_oneformer_ade20k = OneformerDetector(OneformerDetector.configs["ade20k"]) result = model_oneformer_ade20k(img) return remove_pad(result), True def unload_oneformer_ade20k(): global model_oneformer_ade20k if model_oneformer_ade20k is not None: model_oneformer_ade20k.unload_model() def recolor_luminance(img, res=512, thr_a=1.0, **kwargs): result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2LAB) result = result[:, :, 0].astype(np.float32) / 255.0 result = result ** thr_a result = (result * 255.0).clip(0, 255).astype(np.uint8) result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB) return result, True def recolor_intensity(img, res=512, thr_a=1.0, **kwargs): result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2HSV) result = result[:, :, 2].astype(np.float32) / 255.0 result = result ** thr_a result = (result * 255.0).clip(0, 255).astype(np.uint8) result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB) return result, True def blur_gaussian(img, res=512, thr_a=1.0, **kwargs): img, remove_pad = resize_image_with_pad(img, res) img = remove_pad(img) result = cv2.GaussianBlur(img, (0, 0), float(thr_a)) return result, True model_anime_face_segment = None def anime_face_segment(img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) global model_anime_face_segment if model_anime_face_segment is None: from annotator.anime_face_segment import AnimeFaceSegment model_anime_face_segment = AnimeFaceSegment() result = model_anime_face_segment(img) return remove_pad(result), True def unload_anime_face_segment(): global model_anime_face_segment if model_anime_face_segment is not None: model_anime_face_segment.unload_model() def densepose(img, res=512, cmap="viridis", **kwargs): img, remove_pad = resize_image_with_pad(img, res) from annotator.densepose import apply_densepose result = apply_densepose(img, cmap=cmap) return remove_pad(result), True def unload_densepose(): from annotator.densepose import unload_model unload_model() class InsightFaceModel: def __init__(self, face_analysis_model_name: str = "buffalo_l"): self.model = None self.face_analysis_model_name = face_analysis_model_name self.antelopev2_installed = False @staticmethod def pick_largest_face(faces): if not faces: raise Exception("Insightface: No face found in image.") if len(faces) > 1: logger.warn("Insightface: More than one face is detected in the image. " "Only the biggest one will be used.") # only use the biggest face face = sorted(faces, key=lambda x:(x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1] return face def install_antelopev2(self): """insightface's github release on antelopev2 model is down. Downloading from huggingface mirror.""" from scripts.utils import load_file_from_url from annotator.annotator_path import models_path model_root = os.path.join(models_path, "insightface", "models", "antelopev2") if not model_root: os.makedirs(model_root, exist_ok=True) for local_file, url in ( ("1k3d68.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/1k3d68.onnx"), ("2d106det.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/2d106det.onnx"), ("genderage.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/genderage.onnx"), ("glintr100.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/glintr100.onnx"), ("scrfd_10g_bnkps.onnx", "https://huggingface.co/DIAMONIK7777/antelopev2/resolve/main/scrfd_10g_bnkps.onnx"), ): local_path = os.path.join(model_root, local_file) if not os.path.exists(local_path): load_file_from_url(url, model_dir=model_root) self.antelopev2_installed = True def load_model(self): if self.model is None: from insightface.app import FaceAnalysis from annotator.annotator_path import models_path self.model = FaceAnalysis( name=self.face_analysis_model_name, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'], root=os.path.join(models_path, "insightface"), ) self.model.prepare(ctx_id=0, det_size=(640, 640)) def run_model(self, img: np.ndarray, **kwargs) -> Tuple[torch.Tensor, bool]: self.load_model() img = img[:, :, :3] # Drop alpha channel if there is one. faces = self.model.get(cv2.cvtColor(img, cv2.COLOR_RGB2BGR)) face = InsightFaceModel.pick_largest_face(faces) return torch.from_numpy(face.normed_embedding).unsqueeze(0), False def run_model_instant_id( self, img: np.ndarray, res: int = 512, return_keypoints: bool = False, **kwargs ) -> Tuple[Union[np.ndarray, torch.Tensor], bool]: """Run the insightface model for instant_id. Arguments: - img: Input image in any size. - res: Resolution used to resize image. - return_keypoints: Whether to return keypoints image or face embedding. """ def draw_kps(img: np.ndarray, kps, color_list=[(255,0,0), (0,255,0), (0,0,255), (255,255,0), (255,0,255)]): stickwidth = 4 limbSeq = np.array([[0, 2], [1, 2], [3, 2], [4, 2]]) kps = np.array(kps) h, w, _ = img.shape out_img = np.zeros([h, w, 3]) for i in range(len(limbSeq)): index = limbSeq[i] color = color_list[index[0]] x = kps[index][:, 0] y = kps[index][:, 1] length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(y[0] - y[1], x[0] - x[1])) polygon = cv2.ellipse2Poly((int(np.mean(x)), int(np.mean(y))), (int(length / 2), stickwidth), int(angle), 0, 360, 1) out_img = cv2.fillConvexPoly(out_img.copy(), polygon, color) out_img = (out_img * 0.6).astype(np.uint8) for idx_kp, kp in enumerate(kps): color = color_list[idx_kp] x, y = kp out_img = cv2.circle(out_img.copy(), (int(x), int(y)), 10, color, -1) return out_img.astype(np.uint8) if not self.antelopev2_installed: self.install_antelopev2() self.load_model() img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) img, remove_pad = resize_image_with_pad(img, res) faces = self.model.get(img) face_info = InsightFaceModel.pick_largest_face(faces) if return_keypoints: return remove_pad(draw_kps(img, face_info['kps'])), True else: return torch.from_numpy(face_info['embedding']), False g_insight_face_model = InsightFaceModel() g_insight_face_instant_id_model = InsightFaceModel(face_analysis_model_name="antelopev2") @dataclass class FaceIdPlusInput: face_embed: torch.Tensor clip_embed: CLIPVisionModelOutput def face_id_plus(img, low_vram=False, **kwargs): """ FaceID plus uses both face_embeding from insightface and clip_embeding from clip. """ face_embed, _ = g_insight_face_model.run_model(img) clip_embed, _ = clip(img, config='clip_h', low_vram=low_vram) return FaceIdPlusInput(face_embed, clip_embed), False class HandRefinerModel: def __init__(self): self.model = None self.device = devices.get_device_for("controlnet") def load_model(self): if self.model is None: from annotator.annotator_path import models_path from hand_refiner import MeshGraphormerDetector # installed via hand_refiner_portable with Extra(torch_handler): self.model = MeshGraphormerDetector.from_pretrained( "hr16/ControlNet-HandRefiner-pruned", cache_dir=os.path.join(models_path, "hand_refiner"), device=self.device, ) else: self.model.to(self.device) def unload(self): if self.model is not None: self.model.to("cpu") def run_model(self, img, res=512, **kwargs): img, remove_pad = resize_image_with_pad(img, res) self.load_model() with Extra(torch_handler): depth_map, mask, info = self.model( img, output_type="np", detect_resolution=res, mask_bbox_padding=30, ) return remove_pad(depth_map), True g_hand_refiner_model = HandRefinerModel() ================================================ FILE: scripts/preprocessor/mobile_sam.py ================================================ from annotator.mobile_sam import SamDetector_Aux from scripts.supported_preprocessor import Preprocessor class PreprocessorMobileSam(Preprocessor): def __init__(self): super().__init__(name="mobile_sam") self.tags = ["Segmentation"] self.model = None def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): if self.model is None: self.model = SamDetector_Aux.from_pretrained() result = self.model(input_image, detect_resolution=resolution, image_resolution=resolution, output_type="cv2") return result Preprocessor.add_supported_preprocessor(PreprocessorMobileSam()) ================================================ FILE: scripts/preprocessor/model_free_preprocessors.py ================================================ """Preprocessors that do not need to run a torch model.""" import cv2 import numpy as np from ..supported_preprocessor import Preprocessor, PreprocessorParameter from ..utils import resize_image_with_pad from annotator.util import HWC3 class PreprocessorNone(Preprocessor): def __init__(self): super().__init__(name="none") self.sorting_priority = 10 self.tags = ["InstructP2P"] def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, input_mask=None, **kwargs ): return input_image class PreprocessorCanny(Preprocessor): def __init__(self): super().__init__(name="canny") self.tags = ["Canny"] self.slider_1 = PreprocessorParameter( minimum=1, maximum=255, step=1, value=100, label="Low Threshold", ) self.slider_2 = PreprocessorParameter( minimum=1, maximum=255, step=1, value=200, label="High Threshold", ) self.sorting_priority = 100 self.use_soft_projection_in_hr_fix = True def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): input_image, remove_pad = resize_image_with_pad(input_image, resolution) canny_image = cv2.cvtColor( cv2.Canny(input_image, int(slider_1), int(slider_2)), cv2.COLOR_GRAY2RGB ) return remove_pad(canny_image) class PreprocessorInvert(Preprocessor): def __init__(self): super().__init__(name="invert") self._label = "invert (from white bg & black line)" self.tags = [ "Canny", "Lineart", "Scribble", "MLSD", ] self.slider_resolution = PreprocessorParameter(visible=False) self.sorting_priority = 20 def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): return 255 - HWC3(input_image) class PreprocessorBlurGaussian(Preprocessor): def __init__(self): super().__init__(name="blur_gaussian") self.slider_1 = PreprocessorParameter( label="Sigma", minimum=0.01, maximum=64.0, value=9.0 ) self.tags = ["Tile"] def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, input_mask=None, **kwargs ): img, remove_pad = resize_image_with_pad(input_image, resolution) img = remove_pad(img) result = cv2.GaussianBlur(img, (0, 0), float(slider_1)) return result class PreprocessorScribbleXdog(Preprocessor): def __init__(self): super().__init__(name="scribble_xdog") self.slider_1 = PreprocessorParameter( label="XDoG Threshold", minimum=1, maximum=64, value=32 ) self.tags = [ "Scribble", "SparseCtrl", ] def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, input_mask=None, **kwargs ): img, remove_pad = resize_image_with_pad(input_image, resolution) g1 = cv2.GaussianBlur(img.astype(np.float32), (0, 0), 0.5) g2 = cv2.GaussianBlur(img.astype(np.float32), (0, 0), 5.0) dog = (255 - np.min(g2 - g1, axis=2)).clip(0, 255).astype(np.uint8) result = np.zeros_like(img, dtype=np.uint8) result[2 * (255 - dog) > slider_1] = 255 return remove_pad(result) class PreprocessorShuffle(Preprocessor): def __init__(self): super().__init__(name="shuffle") self.tags = ["Shuffle"] self.model_shuffle = None # Fix res to 512. self.slider_resolution = PreprocessorParameter(value=512, visible=False) def _cached_call(self, *args, **kwargs): """No cache for shuffle, as each call depends on different numpy seed.""" return self(*args, **kwargs) def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, input_mask=None, **kwargs ): img, remove_pad = resize_image_with_pad(input_image, resolution) img = remove_pad(img) if self.model_shuffle is None: from annotator.shuffle import ContentShuffleDetector self.model_shuffle = ContentShuffleDetector() result = self.model_shuffle(img) return result Preprocessor.add_supported_preprocessor(PreprocessorNone()) Preprocessor.add_supported_preprocessor(PreprocessorCanny()) Preprocessor.add_supported_preprocessor(PreprocessorInvert()) Preprocessor.add_supported_preprocessor(PreprocessorBlurGaussian()) Preprocessor.add_supported_preprocessor(PreprocessorScribbleXdog()) Preprocessor.add_supported_preprocessor(PreprocessorShuffle()) ================================================ FILE: scripts/preprocessor/normal_dsine.py ================================================ from ..supported_preprocessor import Preprocessor, PreprocessorParameter class PreprocessorNormalDsine(Preprocessor): def __init__(self): super().__init__(name="normal_dsine") self.tags = ["NormalMap"] self.slider_1 = PreprocessorParameter( minimum=0, maximum=360, step=0.1, value=60, label="Fov", ) self.slider_2 = PreprocessorParameter( minimum=1, maximum=20, step=1, value=5, label="Iterations", ) self.model = None def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): if self.model is None: from annotator.normaldsine import NormalDsineDetector self.model = NormalDsineDetector() result = self.model( input_image, new_fov=float(slider_1), iterations=int(slider_2), resulotion=resolution, ) return result Preprocessor.add_supported_preprocessor(PreprocessorNormalDsine()) ================================================ FILE: scripts/preprocessor/pulid.py ================================================ # https://github.com/ToTheBeginning/PuLID import gc import torch import cv2 import numpy as np from typing import Optional, List from dataclasses import dataclass from facexlib.parsing import init_parsing_model from facexlib.utils.face_restoration_helper import FaceRestoreHelper from torchvision.transforms.functional import normalize from ..supported_preprocessor import Preprocessor, PreprocessorParameter from scripts.utils import npimg2tensor, tensor2npimg, resize_image_with_pad def to_gray(img): x = 0.299 * img[:, 0:1] + 0.587 * img[:, 1:2] + 0.114 * img[:, 2:3] x = x.repeat(1, 3, 1, 1) return x class PreprocessorFaceXLib(Preprocessor): def __init__(self): super().__init__(name="facexlib") self.tags = [] self.slider_resolution = PreprocessorParameter(visible=False) self.model: Optional[FaceRestoreHelper] = None def load_model(self): if self.model is None: self.model = FaceRestoreHelper( upscale_factor=1, face_size=512, crop_ratio=(1, 1), det_model="retinaface_resnet50", save_ext="png", device=self.device, ) self.model.face_parse = init_parsing_model( model_name="bisenet", device=self.device ) self.model.face_parse.to(device=self.device) self.model.face_det.to(device=self.device) return self.model def unload(self) -> bool: """@Override""" if self.model is not None: self.model.face_parse.to(device="cpu") self.model.face_det.to(device="cpu") return True return False def __call__( self, input_image, resolution=512, slider_1=None, slider_2=None, slider_3=None, input_mask=None, return_tensor=False, **kwargs ): """ @Override Returns black and white face features image with background removed. """ self.load_model() self.model.clean_all() input_image, _ = resize_image_with_pad(input_image, resolution) # using facexlib to detect and align face image_bgr = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR) self.model.read_image(image_bgr) self.model.get_face_landmarks_5(only_center_face=True) self.model.align_warp_face() if len(self.model.cropped_faces) == 0: raise RuntimeError("facexlib align face fail") align_face = self.model.cropped_faces[0] align_face_rgb = cv2.cvtColor(align_face, cv2.COLOR_BGR2RGB) input = npimg2tensor(align_face_rgb) input = input.to(self.device) parsing_out = self.model.face_parse( normalize(input, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) )[0] parsing_out = parsing_out.argmax(dim=1, keepdim=True) bg_label = [0, 16, 18, 7, 8, 9, 14, 15] bg = sum(parsing_out == i for i in bg_label).bool() white_image = torch.ones_like(input) # only keep the face features face_features_image = torch.where(bg, white_image, to_gray(input)) if return_tensor: return face_features_image else: return tensor2npimg(face_features_image) @dataclass class PuLIDProjInput: id_ante_embedding: torch.Tensor id_cond_vit: torch.Tensor id_vit_hidden: List[torch.Tensor] class PreprocessorPuLID(Preprocessor): """PuLID preprocessor.""" def __init__(self): super().__init__(name="ip-adapter_pulid") self.tags = ["IP-Adapter"] self.slider_resolution = PreprocessorParameter(visible=False) self.returns_image = False self.preprocessor_deps = [ "facexlib", "instant_id_face_embedding", "EVA02-CLIP-L-14-336", ] def facexlib_detect(self, input_image: np.ndarray) -> torch.Tensor: facexlib_preprocessor = Preprocessor.get_preprocessor("facexlib") return facexlib_preprocessor(input_image, return_tensor=True) def insightface_antelopev2_detect(self, input_image: np.ndarray) -> torch.Tensor: antelopev2_preprocessor = Preprocessor.get_preprocessor( "instant_id_face_embedding" ) return antelopev2_preprocessor(input_image) def unload(self) -> bool: unloaded = False for p_name in self.preprocessor_deps: p = Preprocessor.get_preprocessor(p_name) if p is not None: unloaded = unloaded or p.unload() return unloaded def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, input_mask=None, **kwargs ) -> Preprocessor.Result: id_ante_embedding = self.insightface_antelopev2_detect(input_image) if id_ante_embedding.ndim == 1: id_ante_embedding = id_ante_embedding.unsqueeze(0) face_features_image = self.facexlib_detect(input_image) evaclip_preprocessor = Preprocessor.get_preprocessor("EVA02-CLIP-L-14-336") assert ( evaclip_preprocessor is not None ), "EVA02-CLIP-L-14-336 preprocessor not found! Please install sd-webui-controlnet-evaclip" r = evaclip_preprocessor(face_features_image) # Free memory # This is necessary as facexlib and evaclip both seem to # not properly free memory on themselves. gc.collect() torch.cuda.empty_cache() return Preprocessor.Result( value=PuLIDProjInput( id_ante_embedding=id_ante_embedding, id_cond_vit=r.id_cond_vit, id_vit_hidden=r.id_vit_hidden, ), display_images=[tensor2npimg(face_features_image)], ) Preprocessor.add_supported_preprocessor(PreprocessorFaceXLib()) Preprocessor.add_supported_preprocessor(PreprocessorPuLID()) ================================================ FILE: scripts/preprocessor/teed.py ================================================ import numpy as np from skimage import morphology from annotator.teed import TEEDDetector from annotator.util import HWC3 from scripts.supported_preprocessor import Preprocessor, PreprocessorParameter from scripts.utils import resize_image_with_pad class PreprocessorTEED(Preprocessor): def __init__(self): super().__init__(name="softedge_teed") self.tags = ["SoftEdge"] self.slider_1 = PreprocessorParameter( label="Safe Steps", minimum=0, maximum=10, value=2, step=1, ) self.model = None def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): img, remove_pad = resize_image_with_pad(input_image, resolution) if self.model is None: self.model = TEEDDetector() result = self.model(img, safe_steps=int(slider_1)) return remove_pad(result) def get_intensity_mask(image_array, lower_bound, upper_bound): mask = image_array[:, :, 0] mask = np.where((mask >= lower_bound) & (mask <= upper_bound), mask, 0) mask = np.expand_dims(mask, 2).repeat(3, axis=2) return mask def combine_layers(base_layer, top_layer): mask = top_layer.astype(bool) temp = 1 - (1 - top_layer) * (1 - base_layer) result = base_layer * (~mask) + temp * mask return result class PreprocessorAnyline(Preprocessor): def __init__(self): super().__init__(name="softedge_anyline") self.tags = ["SoftEdge"] self.slider_resolution = PreprocessorParameter( label="Resolution", minimum=64, maximum=2048, value=1280, step=8, visible=True, ) self.slider_1 = PreprocessorParameter( label="Safe Steps", minimum=0, maximum=10, value=2, step=1, ) self.preprocessor_deps = ["lineart_standard"] self.model = None def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, **kwargs ): img, remove_pad = resize_image_with_pad(input_image, resolution) if self.model is None: self.model = TEEDDetector(mteed=True) mteed_result = self.model(img, safe_steps=int(slider_1)) mteed_result = HWC3(mteed_result) lineart_preprocessor = Preprocessor.get_preprocessor("lineart_standard") assert lineart_preprocessor is not None lineart_result = lineart_preprocessor(img, resolution) lineart_result = get_intensity_mask( lineart_result, lower_bound=0, upper_bound=1 ) cleaned = morphology.remove_small_objects( lineart_result.astype(bool), min_size=36, connectivity=1 ) lineart_result = lineart_result * cleaned final_result = combine_layers(mteed_result, lineart_result) return remove_pad(final_result) Preprocessor.add_supported_preprocessor(PreprocessorTEED()) Preprocessor.add_supported_preprocessor(PreprocessorAnyline()) ================================================ FILE: scripts/supported_preprocessor.py ================================================ from abc import ABC, abstractmethod from typing import List, ClassVar, Dict, Optional, Set, NamedTuple, Any from dataclasses import dataclass, field import numpy as np import torch from modules import shared, devices from scripts.enums import ControlNetUnionControlType from scripts.logging import logger from scripts.utils import ndarray_lru_cache CACHE_SIZE = getattr(shared.cmd_opts, "controlnet_preprocessor_cache_size", 0) @dataclass class PreprocessorParameter: """ Class representing a parameter for a preprocessor. Attributes: label (str): The label for the parameter. minimum (float): The minimum value of the parameter. Default is 0.0. maximum (float): The maximum value of the parameter. Default is 1.0. step (float): The step size for the parameter. Default is 0.01. value (float): The initial value of the parameter. Default is 0.5. visible (bool): Whether the parameter is visible or not. Default is False. """ label: str = "EMPTY_LABEL" minimum: float = 0.0 maximum: float = 1.0 step: float = 0.01 value: float = 0.5 visible: bool = True @property def gradio_update_kwargs(self) -> dict: return dict( minimum=self.minimum, maximum=self.maximum, step=self.step, label=self.label, value=self.value, visible=self.visible, ) @property def api_json(self) -> dict: return dict( name=self.label, value=self.value, min=self.minimum, max=self.maximum, step=self.step, ) @dataclass class Preprocessor(ABC): """ Class representing a preprocessor. Attributes: name (str): The name of the preprocessor. tags (List[str]): The tags associated with the preprocessor. slider_resolution (PreprocessorParameter): The parameter representing the resolution of the slider. slider_1 (PreprocessorParameter): The first parameter of the slider. slider_2 (PreprocessorParameter): The second parameter of the slider. slider_3 (PreprocessorParameter): The third parameter of the slider. show_control_mode (bool): Whether to show the control mode or not. do_not_need_model (bool): Whether the preprocessor needs a model or not. sorting_priority (int): The sorting priority of the preprocessor. corp_image_with_a1111_mask_when_in_img2img_inpaint_tab (bool): Whether to crop the image with a1111 mask when in img2img inpaint tab or not. fill_mask_with_one_when_resize_and_fill (bool): Whether to fill the mask with one when resizing and filling or not. use_soft_projection_in_hr_fix (bool): Whether to use soft projection in hr fix or not. expand_mask_when_resize_and_fill (bool): Whether to expand the mask when resizing and filling or not. """ name: str _label: str = None tags: List[str] = field(default_factory=list) slider_resolution = PreprocessorParameter( label="Resolution", minimum=64, maximum=2048, value=512, step=8, visible=True, ) slider_1 = PreprocessorParameter(visible=False) slider_2 = PreprocessorParameter(visible=False) slider_3 = PreprocessorParameter(visible=False) returns_image: bool = True show_control_mode = True do_not_need_model = False sorting_priority = 0 # higher goes to top in the list accepts_mask: bool = False requires_mask: bool = False corp_image_with_a1111_mask_when_in_img2img_inpaint_tab = True fill_mask_with_one_when_resize_and_fill = False use_soft_projection_in_hr_fix = False expand_mask_when_resize_and_fill = False model: Optional[torch.nn.Module] = None device = devices.get_device_for("controlnet") preprocessor_deps: List[str] = field(default_factory=list) all_processors: ClassVar[Dict[str, "Preprocessor"]] = {} all_processors_by_name: ClassVar[Dict[str, "Preprocessor"]] = {} @property def label(self) -> str: """Display name on UI.""" return self._label if self._label is not None else self.name @classmethod def add_supported_preprocessor(cls, p: "Preprocessor"): assert p.label not in cls.all_processors, f"{p.label} already registered!" cls.all_processors[p.label] = p assert p.name not in cls.all_processors_by_name, f"{p.name} already registered!" cls.all_processors_by_name[p.name] = p logger.debug( f"{p.name} registered. Total preprocessors ({len(cls.all_processors)})." ) @classmethod def get_preprocessor(cls, name: str) -> Optional["Preprocessor"]: return cls.all_processors.get(name, cls.all_processors_by_name.get(name, None)) @classmethod def get_sorted_preprocessors(cls) -> List["Preprocessor"]: preprocessors = [p for k, p in cls.all_processors.items() if k != "none"] return [cls.all_processors["none"]] + sorted( preprocessors, key=lambda x: str(x.sorting_priority).zfill(8) + x.label, reverse=True, ) @classmethod def get_all_preprocessor_tags(cls): tags = set() for _, p in cls.all_processors.items(): tags.update(set(p.tags)) return ["All"] + sorted(list(tags)) @classmethod def get_filtered_preprocessors(cls, tag: str) -> List["Preprocessor"]: if tag == "All": return cls.all_processors return [ p for p in cls.get_sorted_preprocessors() if tag in p.tags or p.label == "none" ] @classmethod def get_default_preprocessor(cls, tag: str) -> "Preprocessor": ps = cls.get_filtered_preprocessors(tag) assert len(ps) > 0 return ps[0] if len(ps) == 1 else ps[1] @classmethod def tag_to_filters(cls, tag: str) -> Set[str]: filters_aliases = { "instructp2p": ["ip2p"], "segmentation": ["seg"], "normalmap": ["normal"], "t2i-adapter": ["t2i_adapter", "t2iadapter", "t2ia"], "ip-adapter": ["ip_adapter", "ipadapter"], "openpose": ["openpose", "densepose"], "instant-id": ["instant_id", "instantid"], "scribble": ["sketch"], "tile": ["blur"], } tag = tag.lower() union_tags = ["union"] if tag in ControlNetUnionControlType.all_tags() else [] return set([tag] + filters_aliases.get(tag, []) + union_tags) @classmethod def unload_unused(cls, active_processors: Set["Preprocessor"]): logger.debug( f"Unload unused preprocessors. Active: {[p.name for p in active_processors]}" ) for p in cls.all_processors.values(): if p not in active_processors: success = p.unload() if success: logger.debug(f"Unload unused preprocessor {p.name}") class Result(NamedTuple): value: Any # The display images shown on UI. display_images: List[np.ndarray] def cached_call(self, input_image, *args, **kwargs) -> "Preprocessor.Result": """The function exposed that also returns an image for display.""" result = self._cached_call(input_image, *args, **kwargs) if isinstance(result, Preprocessor.Result): return result else: return Preprocessor.Result( value=result, display_images=[result if self.returns_image else input_image], ) @ndarray_lru_cache(max_size=CACHE_SIZE) def _cached_call(self, *args, **kwargs): """The actual cached function.""" logger.debug(f"Calling preprocessor {self.name} outside of cache.") return self(*args, **kwargs) def __hash__(self): return hash(self.name) def __eq__(self, other): return self.__hash__() == other.__hash__() @abstractmethod def __call__( self, input_image, resolution, slider_1=None, slider_2=None, slider_3=None, input_mask=None, **kwargs, ): pass def unload(self): if self.model is not None: if hasattr(self.model, "unload_model"): self.model.unload_model() return True if hasattr(self.model, "to"): self.model.to("cpu") return True raise Exception(f"Unable to unload model {self.model}") return False ================================================ FILE: scripts/utils.py ================================================ from einops import rearrange import torch import os import functools import time import base64 import numpy as np import safetensors.torch import cv2 import logging from typing import Any, Callable, Dict, List from modules.safe import unsafe_torch_load from modules.modelloader import load_file_from_url # noqa: F401 from scripts.logging import logger def load_state_dict(ckpt_path, location="cpu"): _, extension = os.path.splitext(ckpt_path) if extension.lower() == ".safetensors": state_dict = safetensors.torch.load_file(ckpt_path, device=location) else: state_dict = unsafe_torch_load(ckpt_path, map_location=torch.device(location)) state_dict = get_state_dict(state_dict) logger.info(f"Loaded state_dict from [{ckpt_path}]") return state_dict def get_state_dict(d): return d.get("state_dict", d) def ndarray_lru_cache(max_size: int = 128, typed: bool = False): """ Decorator to enable caching for functions with numpy array arguments. Numpy arrays are mutable, and thus not directly usable as hash keys. The idea here is to wrap the incoming arguments with type `np.ndarray` as `HashableNpArray` so that `lru_cache` can correctly handles `np.ndarray` arguments. `HashableNpArray` functions exactly the same way as `np.ndarray` except having `__hash__` and `__eq__` overriden. """ def decorator(func: Callable): """The actual decorator that accept function as input.""" class HashableNpArray(np.ndarray): def __new__(cls, input_array): # Input array is an instance of ndarray. # The view makes the input array and returned array share the same data. obj = np.asarray(input_array).view(cls) return obj def __eq__(self, other) -> bool: return np.array_equal(self, other) def __hash__(self): # Hash the bytes representing the data of the array. return hash(self.tobytes()) @functools.lru_cache(maxsize=max_size, typed=typed) def cached_func(*args, **kwargs): """This function only accepts `HashableNpArray` as input params.""" return func(*args, **kwargs) # Preserves original function.__name__ and __doc__. @functools.wraps(func) def decorated_func(*args, **kwargs): """The decorated function that delegates the original function.""" def convert_item(item: Any): if isinstance(item, np.ndarray): return HashableNpArray(item) if isinstance(item, tuple): return tuple(convert_item(i) for i in item) return item args = [convert_item(arg) for arg in args] kwargs = {k: convert_item(arg) for k, arg in kwargs.items()} return cached_func(*args, **kwargs) return decorated_func return decorator def timer_decorator(func): """Time the decorated function and output the result to debug logger.""" if logger.level != logging.DEBUG: return func @functools.wraps(func) def wrapper(*args, **kwargs): start_time = time.time() result = func(*args, **kwargs) end_time = time.time() duration = end_time - start_time # Only report function that are significant enough. if duration > 1e-3: logger.debug(f"{func.__name__} ran in: {duration:.3f} sec") return result return wrapper class TimeMeta(type): """Metaclass to record execution time on all methods of the child class.""" def __new__(cls, name, bases, attrs): for attr_name, attr_value in attrs.items(): if callable(attr_value): attrs[attr_name] = timer_decorator(attr_value) return super().__new__(cls, name, bases, attrs) # svgsupports svgsupport = False try: import io from svglib.svglib import svg2rlg from reportlab.graphics import renderPM svgsupport = True except ImportError: pass def svg_preprocess(inputs: Dict, preprocess: Callable): if not inputs: return None if inputs["image"].startswith("data:image/svg+xml;base64,") and svgsupport: svg_data = base64.b64decode( inputs["image"].replace("data:image/svg+xml;base64,", "") ) drawing = svg2rlg(io.BytesIO(svg_data)) png_data = renderPM.drawToString(drawing, fmt="PNG") encoded_string = base64.b64encode(png_data) base64_str = str(encoded_string, "utf-8") base64_str = "data:image/png;base64," + base64_str inputs["image"] = base64_str return preprocess(inputs) def get_unique_axis0(data): arr = np.asanyarray(data) idxs = np.lexsort(arr.T) arr = arr[idxs] unique_idxs = np.empty(len(arr), dtype=np.bool_) unique_idxs[:1] = True unique_idxs[1:] = np.any(arr[:-1, :] != arr[1:, :], axis=-1) return arr[unique_idxs] def read_image(img_path: str) -> str: """Read image from specified path and return a base64 string.""" img = cv2.imread(img_path) _, bytes = cv2.imencode(".png", img) encoded_image = base64.b64encode(bytes).decode("utf-8") return encoded_image def read_image_dir( img_dir: str, suffixes=(".png", ".jpg", ".jpeg", ".webp") ) -> List[str]: """Try read all images in given img_dir.""" images = [] for filename in os.listdir(img_dir): if filename.endswith(suffixes): img_path = os.path.join(img_dir, filename) try: images.append(read_image(img_path)) except IOError: logger.error(f"Error opening {img_path}") return images def align_dim_latent(x: int) -> int: """Align the pixel dimension (w/h) to latent dimension. Stable diffusion 1:8 ratio for latent/pixel, i.e., 1 latent unit == 8 pixel unit.""" return (x // 8) * 8 def pad64(x): return int(np.ceil(float(x) / 64.0) * 64 - x) def safer_memory(x): # Fix many MAC/AMD problems return np.ascontiguousarray(x.copy()).copy() def resize_image_with_pad(img: np.ndarray, resolution: int): # Convert greyscale image to RGB. if img.ndim == 2: img = img[:, :, None] img = np.concatenate([img, img, img], axis=2) H_raw, W_raw, _ = img.shape k = float(resolution) / float(min(H_raw, W_raw)) interpolation = cv2.INTER_CUBIC if k > 1 else cv2.INTER_AREA H_target = int(np.round(float(H_raw) * k)) W_target = int(np.round(float(W_raw) * k)) img = cv2.resize(img, (W_target, H_target), interpolation=interpolation) H_pad, W_pad = pad64(H_target), pad64(W_target) img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode="edge") def remove_pad(x): return safer_memory(x[:H_target, :W_target]) return safer_memory(img_padded), remove_pad def npimg2tensor(img: np.ndarray) -> torch.Tensor: """Convert numpy img ([H, W, C]) to tensor ([1, C, H, W])""" return rearrange(torch.from_numpy(img).float() / 255.0, "h w c -> 1 c h w") def tensor2npimg(t: torch.Tensor) -> np.ndarray: """Convert tensor ([1, C, H, W]) to numpy RGB img ([H, W, C])""" return ( (rearrange(t, "1 c h w -> h w c") * 255.0) .to(dtype=torch.uint8) .cpu() .numpy() ) def visualize_inpaint_mask(img): if img.ndim == 3 and img.shape[2] == 4: result = img.copy() mask = result[:, :, 3] mask = 255 - mask // 2 result[:, :, 3] = mask return np.ascontiguousarray(result.copy()) return img ================================================ FILE: scripts/xyz_grid_support.py ================================================ import re import numpy as np from modules import scripts, shared try: from scripts.global_state import update_cn_models, cn_models_names from scripts.external_code import ResizeMode, ControlMode from scripts.supported_preprocessor import Preprocessor from scripts.logging import logger except (ImportError, NameError): import_error = True else: import_error = False DEBUG_MODE = False def debug_info(func): def debug_info_(*args, **kwargs): if DEBUG_MODE: print(f"Debug info: {func.__name__}, {args}") return func(*args, **kwargs) return debug_info_ def find_dict(dict_list, keyword, search_key="name", stop=False): result = next((d for d in dict_list if d[search_key] == keyword), None) if result or not stop: return result else: raise ValueError(f"Dictionary with value '{keyword}' in key '{search_key}' not found.") def flatten(lst): result = [] for element in lst: if isinstance(element, list): result.extend(flatten(element)) else: result.append(element) return result def is_all_included(target_list, check_list, allow_blank=False, stop=False): for element in flatten(target_list): if allow_blank and str(element) in ["None", ""]: continue elif element not in check_list: if not stop: return False else: raise ValueError(f"'{element}' is not included in check list.") return True class ListParser(): """This class restores a broken list caused by the following process in the xyz_grid module. -> valslist = [x.strip() for x in chain.from_iterable( csv.reader(StringIO(vals)))] It also performs type conversion, adjusts the number of elements in the list, and other operations. This class directly modifies the received list. """ numeric_pattern = { int: { "range": r"\s*([+-]?\s*\d+)\s*-\s*([+-]?\s*\d+)(?:\s*\(([+-]\d+)\s*\))?\s*", "count": r"\s*([+-]?\s*\d+)\s*-\s*([+-]?\s*\d+)(?:\s*\[(\d+)\s*\])?\s*" }, float: { "range": r"\s*([+-]?\s*\d+(?:\.\d*)?)\s*-\s*([+-]?\s*\d+(?:\.\d*)?)(?:\s*\(([+-]\d+(?:\.\d*)?)\s*\))?\s*", "count": r"\s*([+-]?\s*\d+(?:\.\d*)?)\s*-\s*([+-]?\s*\d+(?:\.\d*)?)(?:\s*\[(\d+(?:\.\d*)?)\s*\])?\s*" } } ################################################ # # Initialization method from here. # ################################################ def __init__(self, my_list, converter=None, allow_blank=True, exclude_list=None, run=True): self.my_list = my_list self.converter = converter self.allow_blank = allow_blank self.exclude_list = exclude_list self.re_bracket_start = None self.re_bracket_start_precheck = None self.re_bracket_end = None self.re_bracket_end_precheck = None self.re_range = None self.re_count = None self.compile_regex() if run: self.auto_normalize() def compile_regex(self): exclude_pattern = "|".join(self.exclude_list) if self.exclude_list else None if exclude_pattern is None: self.re_bracket_start = re.compile(r"^\[") self.re_bracket_end = re.compile(r"\]$") else: self.re_bracket_start = re.compile(fr"^\[(?!(?:{exclude_pattern})\])") self.re_bracket_end = re.compile(fr"(? valslist = [opt.type(x) for x in valslist] # Perform type conversion using the function # set to the confirm attribute instead. # def identity(x): return x def enable_script_control(): shared.opts.data["control_net_allow_script_control"] = True def apply_field(field): @debug_info def apply_field_(p, x, xs): enable_script_control() setattr(p, field, x) return apply_field_ ################################################ # The confirm function defined in this module # enables list notation and performs type conversion. # # Example: # any = [any, any, any, ...] # [any] = [any, None, None, ...] # [None, None, any] = [None, None, any] # [,,any] = [None, None, any] # any, [,any,] = [any, any, any, ...], [None, any, None] # # Enabled Only: # any = [any] = [any, None, None, ...] # (any and [any] are considered equivalent) # def confirm(func_or_str): @debug_info def confirm_(p, xs): if callable(func_or_str): # func_or_str is converter ListParser(xs, func_or_str, allow_blank=True) return elif isinstance(func_or_str, str): # func_or_str is keyword valid_data = find_dict(validation_data, func_or_str, stop=True) converter = valid_data["type"] exclude_list = valid_data["exclude"]() if valid_data["exclude"] else None check_list = valid_data["check"]() ListParser(xs, converter, allow_blank=True, exclude_list=exclude_list) is_all_included(xs, check_list, allow_blank=True, stop=True) return else: raise TypeError(f"Argument must be callable or str, not {type(func_or_str).__name__}.") return confirm_ def bool_(string): string = str(string) if string in ["None", ""]: return None elif string.lower() in ["true", "1"]: return True elif string.lower() in ["false", "0"]: return False else: raise ValueError(f"Could not convert string to boolean: {string}") def choices_bool(): return ["False", "True"] def choices_model(): update_cn_models() return list(cn_models_names.values()) def choices_control_mode(): return [e.value for e in ControlMode] def choices_resize_mode(): return [e.value for e in ResizeMode] def choices_preprocessor(): return list(Preprocessor.all_processors.keys()) def make_excluded_list(): pattern = re.compile(r"\[(\w+)\]") return [match.group(1) for s in choices_model() for match in pattern.finditer(s)] validation_data = [ {"name": "model", "type": str, "check": choices_model, "exclude": make_excluded_list}, {"name": "control_mode", "type": str, "check": choices_control_mode, "exclude": None}, {"name": "resize_mode", "type": str, "check": choices_resize_mode, "exclude": None}, {"name": "preprocessor", "type": str, "check": choices_preprocessor, "exclude": None}, ] extra_axis_options = [ xyz_grid.AxisOption("[ControlNet] Enabled", identity, apply_field("control_net_enabled"), confirm=confirm(bool_), choices=choices_bool), xyz_grid.AxisOption("[ControlNet] Model", identity, apply_field("control_net_model"), confirm=confirm("model"), choices=choices_model, cost=0.9), xyz_grid.AxisOption("[ControlNet] Weight", identity, apply_field("control_net_weight"), confirm=confirm(float)), xyz_grid.AxisOption("[ControlNet] Guidance Start", identity, apply_field("control_net_guidance_start"), confirm=confirm(float)), xyz_grid.AxisOption("[ControlNet] Guidance End", identity, apply_field("control_net_guidance_end"), confirm=confirm(float)), xyz_grid.AxisOption("[ControlNet] Control Mode", identity, apply_field("control_net_control_mode"), confirm=confirm("control_mode"), choices=choices_control_mode), xyz_grid.AxisOption("[ControlNet] Resize Mode", identity, apply_field("control_net_resize_mode"), confirm=confirm("resize_mode"), choices=choices_resize_mode), xyz_grid.AxisOption("[ControlNet] Preprocessor", identity, apply_field("control_net_module"), confirm=confirm("preprocessor"), choices=choices_preprocessor), xyz_grid.AxisOption("[ControlNet] Pre Resolution", identity, apply_field("control_net_pres"), confirm=confirm(int)), xyz_grid.AxisOption("[ControlNet] Pre Threshold A", identity, apply_field("control_net_pthr_a"), confirm=confirm(float)), xyz_grid.AxisOption("[ControlNet] Pre Threshold B", identity, apply_field("control_net_pthr_b"), confirm=confirm(float)), ] xyz_grid.axis_options.extend(extra_axis_options) def run(): xyz_grid = find_module({"xyz_grid.py", "xy_grid.py", "scripts.xyz_grid", "scripts.xy_grid"}) if xyz_grid: add_axis_options(xyz_grid) else: logger.warn("xyz script not found") if not import_error: run() ================================================ FILE: style.css ================================================ .cnet-modal { display: none; /* Hidden by default */ position: fixed; /* Stay in place */ z-index: 2147483647; /* Sit on top */ left: 0; top: 0; width: 100%; /* Full width */ height: 100%; /* Full height */ overflow: auto; /* Enable scroll if needed */ background-color: rgba(0, 0, 0, 0.4); /* Black with opacity */ max-width: none !important; /* Fix sizing with SD.Next (vladmandic/automatic#2594) */ } .cnet-modal-content { position: relative; background-color: var(--background-fill-primary); margin: 5vh auto; /* 15% from the top and centered */ padding: 20px; border: 1px solid #888; width: 95%; height: 90vh; /* Could be more or less, depending on screen size */ box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19); animation-name: animatetop; animation-duration: 0.4s; max-width: none !important; /* Fix sizing with SD.Next (vladmandic/automatic#2594) */ } .cnet-modal-content iframe { position: absolute; top: 0; left: 0; width: 100%; height: 100%; border: none; } .cnet-modal-content.alert { padding: var(--size-5); } .cnet-modal-content.alert ul { list-style-type: none; } .cnet-modal-close { color: white !important; right: 0.25em; top: 0; cursor: pointer; position: absolute; font-size: 56px; font-weight: bold; } @keyframes animatetop { from { top: -300px; opacity: 0 } to { top: 0; opacity: 1 } } .cnet-generated-image-control-group, .cnet-upload-pose { display: flex; flex-direction: column; align-items: flex-end; position: absolute; right: var(--size-2); bottom: var(--size-2); } /* Gradio button style */ .cnet-download-pose a, .cnet-close-preview, .cnet-edit-pose, .cnet-upload-pose, .cnet-photopea-child-trigger { font-size: x-small !important; font-weight: bold !important; padding: 2px !important; box-shadow: var(--shadow-drop); border: 1px solid var(--button-secondary-border-color); border-radius: var(--radius-sm); background: var(--background-fill-primary); height: var(--size-5); color: var(--block-label-text-color) !important; display: flex; justify-content: center; cursor: pointer; } .cnet-download-pose:hover a, .cnet-close-preview:hover a, .cnet-edit-pose:hover, .cnet-upload-pose:hover, .cnet-photopea-child-trigger:hover { color: var(--block-label-text-color) !important; } .cnet-unit-active { color: green !important; font-weight: bold !important; } .dark .cnet-unit-active { color: greenyellow !important; } .cnet-badge { display: inline-block; padding: 0.25em 0.75em; font-size: 0.75em; font-weight: bold; color: white; border-radius: 0.5em; text-align: center; vertical-align: middle; margin-left: var(--size-2); } .cnet-badge.primary { background-color: green; } .cnet-a1111-badge { position: absolute; bottom: 0px; right: 0px; } .cnet-disabled-radio { opacity: 50%; } .controlnet_row { margin-top: 10px !important; } /* JSON pose upload button styling */ .cnet-upload-pose input[type=file] { position: absolute; left: 0; top: 0; opacity: 0; width: 100%; height: 100%; } /* Photopea integration styles */ .photopea-button-group { position: absolute; top: -30px; /* 20px modal padding + 10px margin */ } .photopea-button { font-size: 3rem; font-weight: bold; padding: 2px !important; margin: 2px !important; box-shadow: var(--shadow-drop); border: 1px solid var(--button-secondary-border-color); border-radius: var(--radius-sm); background: var(--background-fill-primary); color: var(--block-label-text-color); } ================================================ FILE: tests/README.md ================================================ # Tests There are 2 types of tests: - unittest: backend based tests that directly import A1111 shared modules - api test: test functionality through A1111 web API # Run tests locally Make sure the current working directory is A1111 root. ## Install test dependencies `pip install -r requirements-test.txt` ## Start test server ```shell python -m coverage run --data-file=.coverage.server launch.py --skip-prepare-environment --skip-torch-cuda-test --test-server --do-not-download-clip --no-half --disable-opt-split-attention --use-cpu all --api-server-stop ``` ## Setting environment variables Setting `CONTROLNET_TEST_SD_VERSION` for stable diffusion model family used during testing. - 1 for SD1.x - 2 for SD2.x - 3 for SDXL ## Run test ```shell python -m pytest -vv --junitxml=test/results.xml --cov ./extensions/sd-webui-controlnet --cov-report=xml --verify-base-url ./extensions/sd-webui-controlnet/tests ``` ## Check code coverage Text report ```shell python -m coverage report -i ``` HTML report ```shell python -m coverage html -i ``` ================================================ FILE: tests/annotator_tests/openpose_tests/body_test.py ================================================ import unittest import numpy as np import importlib utils = importlib.import_module('extensions.sd-webui-controlnet.tests.utils', 'utils') from annotator.openpose.body import Body, Keypoint, BodyResult class TestFormatBodyResult(unittest.TestCase): def setUp(self): self.candidate = np.array([ [10, 20, 0.9, 0], [30, 40, 0.8, 1], [50, 60, 0.7, 2], [70, 80, 0.6, 3] ]) self.subset = np.array([ [-1, 0, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1.7, 2], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 0.6, 1] ]) def test_format_body_result(self): expected_result = [ BodyResult( keypoints=[ None, Keypoint(x=10, y=20, score=0.9, id=0), Keypoint(x=30, y=40, score=0.8, id=1), None ] + [None] * 14, total_score=1.7, total_parts=2 ), BodyResult( keypoints=[None] * 17 + [ Keypoint(x=70, y=80, score=0.6, id=3) ], total_score=0.6, total_parts=1 ) ] result = Body.format_body_result(self.candidate, self.subset) self.assertEqual(result, expected_result) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/annotator_tests/openpose_tests/detection_test.py ================================================ import unittest import numpy as np import importlib utils = importlib.import_module('extensions.sd-webui-controlnet.tests.utils', 'utils') from annotator.openpose.util import faceDetect, handDetect from annotator.openpose.body import Keypoint, BodyResult class TestFaceDetect(unittest.TestCase): def test_no_faces(self): oriImg = np.zeros((100, 100, 3), dtype=np.uint8) body = BodyResult([None] * 18, total_score=3, total_parts=0) expected_result = None result = faceDetect(body, oriImg) self.assertEqual(result, expected_result) def test_single_face(self): body = BodyResult([ Keypoint(50, 50), *([None] * 13), Keypoint(30, 40), Keypoint(70, 40), Keypoint(20, 50), Keypoint(80, 50), ], total_score=2, total_parts=5) oriImg = np.zeros((100, 100, 3), dtype=np.uint8) expected_result = (0, 0, 120) result = faceDetect(body, oriImg) self.assertEqual(result, expected_result) class TestHandDetect(unittest.TestCase): def test_no_hands(self): oriImg = np.zeros((100, 100, 3), dtype=np.uint8) body = BodyResult([None] * 18, total_score=3, total_parts=0) expected_result = [] result = handDetect(body, oriImg) self.assertEqual(result, expected_result) def test_single_left_hand(self): oriImg = np.zeros((100, 100, 3), dtype=np.uint8) body = BodyResult([ None, None, None, None, None, Keypoint(20, 20), Keypoint(40, 30), Keypoint(60, 40), *([None] * 8), Keypoint(20, 60), Keypoint(40, 70), Keypoint(60, 80) ], total_score=3, total_parts=0.5) expected_result = [(49, 26, 33, True)] result = handDetect(body, oriImg) self.assertEqual(result, expected_result) def test_single_right_hand(self): oriImg = np.zeros((100, 100, 3), dtype=np.uint8) body = BodyResult([ None, None, Keypoint(20, 20), Keypoint(40, 30), Keypoint(60, 40), *([None] * 11), Keypoint(20, 60), Keypoint(40, 70), Keypoint(60, 80) ], total_score=3, total_parts=0.5) expected_result = [(49, 26, 33, False)] result = handDetect(body, oriImg) self.assertEqual(result, expected_result) def test_multiple_hands(self): body = BodyResult([ Keypoint(20, 20), Keypoint(40, 30), Keypoint(60, 40), Keypoint(20, 60), Keypoint(40, 70), Keypoint(60, 80), Keypoint(10, 10), Keypoint(30, 20), Keypoint(50, 30), Keypoint(10, 50), Keypoint(30, 60), Keypoint(50, 70), *([None] * 6), ], total_score=3, total_parts=0.5) oriImg = np.zeros((100, 100, 3), dtype=np.uint8) expected_result = [(0, 0, 100, True), (16, 43, 56, False)] result = handDetect(body, oriImg) self.assertEqual(result, expected_result) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/annotator_tests/openpose_tests/json_encode_test.py ================================================ import unittest import numpy as np import importlib utils = importlib.import_module('extensions.sd-webui-controlnet.tests.utils', 'utils') from annotator.openpose import encode_poses_as_json, HumanPoseResult, Keypoint from annotator.openpose.body import BodyResult class TestEncodePosesAsJson(unittest.TestCase): def test_empty_list(self): poses = [] canvas_height = 1080 canvas_width = 1920 result = encode_poses_as_json(poses, [], canvas_height, canvas_width) expected = { 'people': [], 'animals': [], 'canvas_height': canvas_height, 'canvas_width': canvas_width, } self.assertDictEqual(result, expected) def test_single_pose_no_keypoints(self): poses = [HumanPoseResult(BodyResult(None, 0, 0), None, None, None)] canvas_height = 1080 canvas_width = 1920 result = encode_poses_as_json(poses, [],canvas_height, canvas_width) expected = { 'people': [ { 'pose_keypoints_2d': None, 'face_keypoints_2d': None, 'hand_left_keypoints_2d': None, 'hand_right_keypoints_2d': None, }, ], 'animals': [], 'canvas_height': canvas_height, 'canvas_width': canvas_width, } self.assertDictEqual(result, expected) def test_single_pose_with_keypoints(self): keypoints = [Keypoint(np.float32(0.5), np.float32(0.5)), None, Keypoint(0.6, 0.6)] poses = [HumanPoseResult(BodyResult(keypoints, 0, 0), keypoints, keypoints, keypoints)] canvas_height = 1080 canvas_width = 1920 result = encode_poses_as_json(poses, [], canvas_height, canvas_width) expected = { 'people': [ { 'pose_keypoints_2d': [ 0.5, 0.5, 1.0, 0.0, 0.0, 0.0, 0.6, 0.6, 1.0, ], 'face_keypoints_2d': [ 0.5, 0.5, 1.0, 0.0, 0.0, 0.0, 0.6, 0.6, 1.0, ], 'hand_left_keypoints_2d': [ 0.5, 0.5, 1.0, 0.0, 0.0, 0.0, 0.6, 0.6, 1.0, ], 'hand_right_keypoints_2d': [ 0.5, 0.5, 1.0, 0.0, 0.0, 0.0, 0.6, 0.6, 1.0, ], }, ], 'animals': [], 'canvas_height': canvas_height, 'canvas_width': canvas_width, } self.assertDictEqual(result, expected) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/annotator_tests/openpose_tests/openpose_e2e_test_disabled.py ================================================ """ Disabled because unloading openpose detection models is flaky. See https://github.com/Mikubill/sd-webui-controlnet/actions/runs/6758718106/job/18370634881 for CI report of an example flaky run. """ import unittest import cv2 import numpy as np from pathlib import Path from typing import Dict import importlib utils = importlib.import_module('extensions.sd-webui-controlnet.tests.utils', 'utils') from annotator.openpose import OpenposeDetector class TestOpenposeDetector(unittest.TestCase): image_path = str(Path(__file__).parent.parent.parent / 'images') def setUp(self) -> None: self.detector = OpenposeDetector() self.detector.load_model() def tearDown(self) -> None: self.detector.unload_model() def expect_same_image(self, img1, img2, diff_img_path: str): # Calculate the difference between the two images diff = cv2.absdiff(img1, img2) # Set a threshold to highlight the different pixels threshold = 30 diff_highlighted = np.where(diff > threshold, 255, 0).astype(np.uint8) # Assert that the two images are similar within a tolerance similar = np.allclose(img1, img2, rtol=1e-05, atol=1e-08) if not similar: # Save the diff_highlighted image to inspect the differences cv2.imwrite(diff_img_path, cv2.cvtColor(diff_highlighted, cv2.COLOR_RGB2BGR)) self.assertTrue(similar) # Save expectation image as png so that no compression issue happens. def template(self, test_image: str, expected_image: str, detector_config: Dict, overwrite_expectation: bool = False): oriImg = cv2.cvtColor(cv2.imread(test_image), cv2.COLOR_BGR2RGB) canvas = self.detector(oriImg, **detector_config) # Create expectation file if overwrite_expectation: cv2.imwrite(expected_image, cv2.cvtColor(canvas, cv2.COLOR_RGB2BGR)) else: expected_canvas = cv2.cvtColor(cv2.imread(expected_image), cv2.COLOR_BGR2RGB) self.expect_same_image(canvas, expected_canvas, diff_img_path=expected_image.replace('.png', '_diff.png')) def test_body(self): self.template( test_image = f'{TestOpenposeDetector.image_path}/ski.jpg', expected_image = f'{TestOpenposeDetector.image_path}/expected_ski_output.png', detector_config=dict(), overwrite_expectation=False ) def test_hand(self): self.template( test_image = f'{TestOpenposeDetector.image_path}/woman.jpeg', expected_image = f'{TestOpenposeDetector.image_path}/expected_woman_hand_output.png', detector_config=dict( include_body=False, include_face=False, include_hand=True, ), overwrite_expectation=False ) def test_face(self): self.template( test_image = f'{TestOpenposeDetector.image_path}/woman.jpeg', expected_image = f'{TestOpenposeDetector.image_path}/expected_woman_face_output.png', detector_config=dict( include_body=False, include_face=True, include_hand=False, ), overwrite_expectation=False ) def test_all(self): self.template( test_image = f'{TestOpenposeDetector.image_path}/woman.jpeg', expected_image = f'{TestOpenposeDetector.image_path}/expected_woman_all_output.png', detector_config=dict( include_body=True, include_face=True, include_hand=True, ), overwrite_expectation=False ) def test_dw(self): self.template( test_image = f'{TestOpenposeDetector.image_path}/woman.jpeg', expected_image = f'{TestOpenposeDetector.image_path}/expected_woman_dw_all_output.png', detector_config=dict( include_body=True, include_face=True, include_hand=True, use_dw_pose=True, ), overwrite_expectation=False, ) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/cn_script/__init__.py ================================================ ================================================ FILE: tests/cn_script/batch_hijack_test.py ================================================ import numpy as np import unittest.mock import importlib from typing import Any utils = importlib.import_module('extensions.sd-webui-controlnet.tests.utils', 'utils') from modules import processing, scripts, shared from internal_controlnet.external_code import ControlNetUnit from scripts import controlnet, batch_hijack batch_hijack.instance.undo_hijack() original_process_images_inner = processing.process_images_inner def create_unit(**kwargs) -> ControlNetUnit: return ControlNetUnit(enabled=True, **kwargs) class TestBatchHijack(unittest.TestCase): @unittest.mock.patch('modules.script_callbacks.on_script_unloaded') def setUp(self, on_script_unloaded_mock): self.on_script_unloaded_mock = on_script_unloaded_mock self.batch_hijack_object = batch_hijack.BatchHijack() self.batch_hijack_object.do_hijack() def tearDown(self): self.batch_hijack_object.undo_hijack() def test_do_hijack__registers_on_script_unloaded(self): self.on_script_unloaded_mock.assert_called_once_with(self.batch_hijack_object.undo_hijack) def test_do_hijack__call_once__hijacks_once(self): self.assertEqual(getattr(processing, '__controlnet_original_process_images_inner'), original_process_images_inner) self.assertEqual(processing.process_images_inner, self.batch_hijack_object.processing_process_images_hijack) @unittest.mock.patch('modules.processing.__controlnet_original_process_images_inner') def test_do_hijack__multiple_times__hijacks_once(self, process_images_inner_mock): self.batch_hijack_object.do_hijack() self.batch_hijack_object.do_hijack() self.batch_hijack_object.do_hijack() self.assertEqual(process_images_inner_mock, getattr(processing, '__controlnet_original_process_images_inner')) class TestGetControlNetBatchesWorks(unittest.TestCase): def setUp(self): self.p = unittest.mock.MagicMock() assert scripts.scripts_txt2img is not None self.p.scripts = scripts.scripts_txt2img self.cn_script = controlnet.Script() self.p.scripts.alwayson_scripts = [self.cn_script] self.p.script_args = [] def tearDown(self): batch_hijack.instance.dispatch_callbacks(batch_hijack.instance.postprocess_batch_callbacks, self.p) def assert_get_cn_batches_works(self, batch_images_list): self.cn_script.args_from = 0 self.cn_script.args_to = self.cn_script.args_from + len(self.p.script_args) is_cn_batch, batches, output_dir, _ = batch_hijack.get_cn_batches(self.p) batch_hijack.instance.dispatch_callbacks(batch_hijack.instance.process_batch_callbacks, self.p, batches, output_dir) batch_units = [ unit for unit in self.p.script_args if getattr(unit, 'input_mode', batch_hijack.InputMode.SIMPLE) == batch_hijack.InputMode.BATCH ] # Convert iterator to list to avoid double eval of iterator exhausting # the iterator in following checks. for unit in batch_units: unit.batch_images = list(unit.batch_images) if batch_units: self.assertEqual(min(len(unit.batch_images) for unit in batch_units), len(batches)) else: self.assertEqual(1, len(batches)) for i, unit in enumerate(self.cn_script.enabled_units): self.assertListEqual(batch_images_list[i], list(unit.batch_images)) def test_get_cn_batches__empty(self): is_batch, batches, _, _ = batch_hijack.get_cn_batches(self.p) self.assertEqual(1, len(batches)) self.assertEqual(is_batch, False) def test_get_cn_batches__1_simple(self): self.p.script_args.append(create_unit(image=get_dummy_image())) self.assert_get_cn_batches_works([ [get_dummy_image()], ]) def test_get_cn_batches__2_simples(self): self.p.script_args.extend([ create_unit(image=get_dummy_image(0)), create_unit(image=get_dummy_image(1)), ]) self.assert_get_cn_batches_works([ [get_dummy_image(0)], [get_dummy_image(1)], ]) def test_get_cn_batches__1_batch(self): self.p.script_args.extend([ create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(0), get_dummy_image(1), ], ), ]) self.assert_get_cn_batches_works([ [ get_dummy_image(0), get_dummy_image(1), ], ]) def test_get_cn_batches__2_batches(self): self.p.script_args.extend([ create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(0), get_dummy_image(1), ], ), create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(2), get_dummy_image(3), ], ), ]) self.assert_get_cn_batches_works([ [ get_dummy_image(0), get_dummy_image(1), ], [ get_dummy_image(2), get_dummy_image(3), ], ]) def test_get_cn_batches__2_mixed(self): self.p.script_args.extend([ create_unit(image=get_dummy_image(0)), create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(1), get_dummy_image(2), ], ), ]) self.assert_get_cn_batches_works([ [ get_dummy_image(0), get_dummy_image(0), ], [ get_dummy_image(1), get_dummy_image(2), ], ]) def test_get_cn_batches__3_mixed(self): self.p.script_args.extend([ create_unit(image=get_dummy_image(0)), create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(1), get_dummy_image(2), get_dummy_image(3), ], ), create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(4), get_dummy_image(5), ], ), ]) self.assert_get_cn_batches_works([ [ get_dummy_image(0), get_dummy_image(0), ], [ get_dummy_image(1), get_dummy_image(2), ], [ get_dummy_image(4), get_dummy_image(5), ], ]) class TestProcessImagesPatchWorks(unittest.TestCase): @unittest.mock.patch('modules.script_callbacks.on_script_unloaded') def setUp(self, on_script_unloaded_mock): self.on_script_unloaded_mock = on_script_unloaded_mock self.p = unittest.mock.MagicMock() assert scripts.scripts_txt2img is not None self.p.scripts = scripts.scripts_txt2img self.cn_script = controlnet.Script() self.p.scripts.alwayson_scripts = [self.cn_script] self.p.script_args = [] self.p.all_seeds = [0] self.p.all_subseeds = [0] self.old_model, shared.sd_model = shared.sd_model, unittest.mock.MagicMock() self.batch_hijack_object = batch_hijack.BatchHijack() self.callbacks_mock = unittest.mock.MagicMock() self.batch_hijack_object.process_batch_callbacks.append(self.callbacks_mock.process) self.batch_hijack_object.process_batch_each_callbacks.append(self.callbacks_mock.process_each) self.batch_hijack_object.postprocess_batch_each_callbacks.insert(0, self.callbacks_mock.postprocess_each) self.batch_hijack_object.postprocess_batch_callbacks.insert(0, self.callbacks_mock.postprocess) self.batch_hijack_object.do_hijack() shared.state.begin() def tearDown(self): shared.state.end() self.batch_hijack_object.undo_hijack() shared.sd_model = self.old_model @unittest.mock.patch('modules.processing.__controlnet_original_process_images_inner') def assert_process_images_hijack_called(self, process_images_mock, batch_count): process_images_mock.return_value = processing.Processed(self.p, [get_dummy_image('output')]) with unittest.mock.patch.dict(shared.opts.data, { 'controlnet_show_batch_images_in_ui': True, }): res = processing.process_images_inner(self.p) self.assertEqual(res, process_images_mock.return_value) if batch_count > 0: self.callbacks_mock.process.assert_called() self.callbacks_mock.postprocess.assert_called() else: self.callbacks_mock.process.assert_not_called() self.callbacks_mock.postprocess.assert_not_called() self.assertEqual(self.callbacks_mock.process_each.call_count, batch_count) self.assertEqual(self.callbacks_mock.postprocess_each.call_count, batch_count) def test_process_images_no_units_forwards(self): self.assert_process_images_hijack_called(batch_count=0) def test_process_images__only_simple_units__forwards(self): self.p.script_args = [ create_unit(image=get_dummy_image()), create_unit(image=get_dummy_image()), ] self.assert_process_images_hijack_called(batch_count=0) def test_process_images__1_batch_1_unit__runs_1_batch(self): self.p.script_args = [ create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(), ], ), ] self.assert_process_images_hijack_called(batch_count=1) def test_process_images__2_batches_1_unit__runs_2_batches(self): self.p.script_args = [ create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(0), get_dummy_image(1), ], ), ] self.assert_process_images_hijack_called(batch_count=2) def test_process_images__8_batches_1_unit__runs_8_batches(self): batch_count = 8 self.p.script_args = [ create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[get_dummy_image(i) for i in range(batch_count)] ), ] self.assert_process_images_hijack_called(batch_count=batch_count) def test_process_images__1_batch_2_units__runs_1_batch(self): self.p.script_args = [ create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[get_dummy_image(0)] ), create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[get_dummy_image(1)] ), ] self.assert_process_images_hijack_called(batch_count=1) def test_process_images__2_batches_2_units__runs_2_batches(self): self.p.script_args = [ create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(0), get_dummy_image(1), ], ), create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(2), get_dummy_image(3), ], ), ] self.assert_process_images_hijack_called(batch_count=2) def test_process_images__3_batches_2_mixed_units__runs_3_batches(self): self.p.script_args = [ create_unit( input_mode=batch_hijack.InputMode.BATCH, batch_images=[ get_dummy_image(0), get_dummy_image(1), get_dummy_image(2), ], ), create_unit( input_mode=batch_hijack.InputMode.SIMPLE, image=get_dummy_image(3), ), ] self.assert_process_images_hijack_called(batch_count=3) def get_dummy_image(name: Any = 0): return f'base64#{name}...' if __name__ == '__main__': unittest.main() ================================================ FILE: tests/cn_script/cn_script_test.py ================================================ import unittest from PIL import Image import numpy as np import importlib utils = importlib.import_module("extensions.sd-webui-controlnet.tests.utils", "utils") from scripts.enums import ResizeMode from scripts.controlnet import prepare_mask, Script, set_numpy_seed from internal_controlnet.external_code import ControlNetUnit from modules import processing class TestPrepareMask(unittest.TestCase): def test_prepare_mask(self): p = processing.StableDiffusionProcessing() p.inpainting_mask_invert = True p.mask_blur = 5 mask = Image.new("RGB", (10, 10), color="white") processed_mask = prepare_mask(mask, p) # Check that mask is correctly converted to grayscale self.assertTrue(processed_mask.mode, "L") # Check that mask colors are correctly inverted self.assertEqual( processed_mask.getpixel((0, 0)), 0 ) # inverted white should be black p.inpainting_mask_invert = False processed_mask = prepare_mask(mask, p) # Check that mask colors are not inverted when 'inpainting_mask_invert' is False self.assertEqual( processed_mask.getpixel((0, 0)), 255 ) # white should remain white p.mask_blur = 0 mask = Image.new("RGB", (10, 10), color="black") processed_mask = prepare_mask(mask, p) # Check that mask is not blurred when 'mask_blur' is 0 self.assertEqual( processed_mask.getpixel((0, 0)), 0 ) # black should remain black class TestSetNumpySeed(unittest.TestCase): def test_seed_subseed_minus_one(self): p = processing.StableDiffusionProcessing() p.seed = -1 p.subseed = -1 p.all_seeds = [123, 456] expected_seed = (123 + 123) & 0xFFFFFFFF self.assertEqual(set_numpy_seed(p), expected_seed) def test_valid_seed_subseed(self): p = processing.StableDiffusionProcessing() p.seed = 50 p.subseed = 100 p.all_seeds = [123, 456] expected_seed = (50 + 100) & 0xFFFFFFFF self.assertEqual(set_numpy_seed(p), expected_seed) def test_invalid_seed_subseed(self): p = processing.StableDiffusionProcessing() p.seed = "invalid" p.subseed = 2.5 p.all_seeds = [123, 456] self.assertEqual(set_numpy_seed(p), None) def test_empty_all_seeds(self): p = processing.StableDiffusionProcessing() p.seed = -1 p.subseed = 2 p.all_seeds = [] self.assertEqual(set_numpy_seed(p), None) def test_random_state_change(self): p = processing.StableDiffusionProcessing() p.seed = 50 p.subseed = 100 p.all_seeds = [123, 456] expected_seed = (50 + 100) & 0xFFFFFFFF np.random.seed(0) # set a known seed before_random = np.random.randint(0, 1000) # get a random integer seed = set_numpy_seed(p) self.assertEqual(seed, expected_seed) after_random = np.random.randint(0, 1000) # get another random integer self.assertNotEqual(before_random, after_random) class MockImg2ImgProcessing(processing.StableDiffusionProcessing): """Mock the Img2Img processing as the WebUI version have dependency on `sd_model`.""" def __init__(self, init_images, resize_mode, *args, **kwargs): super().__init__(*args, **kwargs) self.init_images = init_images self.resize_mode = resize_mode class TestScript(unittest.TestCase): sample_base64_image = ( "data:image/png;base64," "iVBORw0KGgoAAAANSUhEUgAAARMAAAC3CAIAAAC+MS2jAAAAqUlEQVR4nO3BAQ" "0AAADCoPdPbQ8HFAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" "AAAAAAAAAAAAAAAAAAAAAAAA/wZOlAAB5tU+nAAAAABJRU5ErkJggg==" ) sample_np_image = np.zeros(shape=[8, 8, 3], dtype=np.uint8) def test_choose_input_image(self): with self.subTest(name="no image"): with self.assertRaises(ValueError): Script.choose_input_image( p=processing.StableDiffusionProcessing(), unit=ControlNetUnit(), idx=0, ) with self.subTest(name="control net input"): _, resize_mode = Script.choose_input_image( p=MockImg2ImgProcessing( init_images=[TestScript.sample_np_image], resize_mode=ResizeMode.OUTER_FIT, ), unit=ControlNetUnit( image=TestScript.sample_np_image, module="none", resize_mode=ResizeMode.INNER_FIT, ), idx=0, ) self.assertEqual(resize_mode, ResizeMode.INNER_FIT) with self.subTest(name="A1111 input"): _, resize_mode = Script.choose_input_image( p=MockImg2ImgProcessing( init_images=[TestScript.sample_np_image], resize_mode=ResizeMode.OUTER_FIT, ), unit=ControlNetUnit( module="none", resize_mode=ResizeMode.INNER_FIT, ), idx=0, ) self.assertEqual(resize_mode, ResizeMode.OUTER_FIT) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/cn_script/utils_test.py ================================================ import importlib utils = importlib.import_module('extensions.sd-webui-controlnet.tests.utils', 'utils') from scripts.utils import ndarray_lru_cache, get_unique_axis0 import unittest import numpy as np class TestNumpyLruCache(unittest.TestCase): def setUp(self): self.arr1 = np.array([1, 2, 3, 4, 5]) self.arr2 = np.array([1, 2, 3, 4, 5]) @ndarray_lru_cache(max_size=128) def add_one(self, arr): return arr + 1 def test_same_array(self): # Test that the decorator works with numpy arrays. result1 = self.add_one(self.arr1) result2 = self.add_one(self.arr1) # If caching is working correctly, these should be the same object. self.assertIs(result1, result2) def test_different_array_same_data(self): # Test that the decorator works with different numpy arrays with the same data. result1 = self.add_one(self.arr1) result2 = self.add_one(self.arr2) # If caching is working correctly, these should be the same object. self.assertIs(result1, result2) def test_cache_size(self): # Test that the cache size limit is respected. arrs = [np.array([i]) for i in range(150)] # Add all arrays to the cache. result1 = self.add_one(arrs[0]) for arr in arrs[1:]: self.add_one(arr) # Check that the first array is no longer in the cache. result2 = self.add_one(arrs[0]) # If the cache size limit is working correctly, these should not be the same object. self.assertIsNot(result1, result2) def test_large_array(self): # Create two large arrays with the same elements in the beginning and end, but one different element in the middle. arr1 = np.ones(10000) arr2 = np.ones(10000) arr2[len(arr2)//2] = 0 result1 = self.add_one(arr1) result2 = self.add_one(arr2) # If hashing is working correctly, these should not be the same object because the input arrays are not equal. self.assertIsNot(result1, result2) class TestUniqueFunctions(unittest.TestCase): def test_get_unique_axis0(self): data = np.random.randint(0, 100, size=(100000, 3)) data = np.concatenate((data, data)) numpy_unique_res = np.unique(data, axis=0) get_unique_axis0_res = get_unique_axis0(data) self.assertEqual(np.array_equal( np.sort(numpy_unique_res, axis=0), np.sort(get_unique_axis0_res, axis=0), ), True) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/conftest.py ================================================ import os def pytest_configure(config): # We don't want to fail on Py.test command line arguments being # parsed by webui: os.environ.setdefault("IGNORE_CMD_ARGS_ERRORS", "1") ================================================ FILE: tests/external_code_api/__init__.py ================================================ ================================================ FILE: tests/external_code_api/external_code_test.py ================================================ import unittest import importlib import numpy as np utils = importlib.import_module('extensions.sd-webui-controlnet.tests.utils', 'utils') from copy import copy from scripts import external_code from scripts import controlnet from scripts.enums import ResizeMode from internal_controlnet.external_code import ControlNetUnit from modules import scripts, ui, shared class TestExternalCodeWorking(unittest.TestCase): max_models = 6 args_offset = 10 def setUp(self): self.scripts = copy(scripts.scripts_txt2img) self.scripts.initialize_scripts(False) ui.create_ui() self.cn_script = controlnet.Script() self.cn_script.args_from = self.args_offset self.cn_script.args_to = self.args_offset + self.max_models self.scripts.alwayson_scripts = [self.cn_script] self.script_args = [None] * self.cn_script.args_from self.initial_max_models = shared.opts.data.get("control_net_unit_count", 3) shared.opts.data.update(control_net_unit_count=self.max_models) self.extra_models = 0 def tearDown(self): shared.opts.data.update(control_net_unit_count=self.initial_max_models) def get_expected_args_to(self): args_len = max(self.max_models, len(self.cn_units)) return self.args_offset + args_len def assert_update_in_place_ok(self): external_code.update_cn_script_in_place(self.scripts, self.script_args, self.cn_units) self.assertEqual(self.cn_script.args_to, self.get_expected_args_to()) def test_empty_resizes_min_args(self): self.cn_units = [] self.assert_update_in_place_ok() def test_empty_resizes_extra_args(self): extra_models = 1 self.cn_units = [ControlNetUnit()] * (self.max_models + extra_models) self.assert_update_in_place_ok() class TestPixelPerfectResolution(unittest.TestCase): def test_outer_fit(self): image = np.zeros((100, 100, 3)) target_H, target_W = 50, 100 resize_mode = ResizeMode.OUTER_FIT result = external_code.pixel_perfect_resolution(image, target_H, target_W, resize_mode) expected = 50 # manually computed expected result self.assertEqual(result, expected) def test_inner_fit(self): image = np.zeros((100, 100, 3)) target_H, target_W = 50, 100 resize_mode = ResizeMode.INNER_FIT result = external_code.pixel_perfect_resolution(image, target_H, target_W, resize_mode) expected = 100 # manually computed expected result self.assertEqual(result, expected) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/utils.py ================================================ import os import sys import cv2 from base64 import b64encode from pathlib import Path import requests BASE_URL = "http://localhost:7860" # setup_test_env os.environ['IGNORE_CMD_ARGS_ERRORS'] = 'True' file_path = Path(__file__).resolve() ext_root = file_path.parent.parent a1111_root = ext_root.parent.parent for p in (ext_root, a1111_root): if p not in sys.path: sys.path.append(str(p)) # Initialize A1111 from modules import initialize initialize.imports() initialize.initialize() from scripts.enums import StableDiffusionVersion def readImage(path): img = cv2.imread(path) retval, buffer = cv2.imencode('.jpg', img) b64img = b64encode(buffer).decode("utf-8") return b64img def get_model(model_name: str, sd_version: StableDiffusionVersion = StableDiffusionVersion.SD1x) -> str: """ Find an available model with specified model name and sd_version. """ if model_name.lower() == "none": return "None" r = requests.get(BASE_URL+"/controlnet/model_list") result = r.json() if "model_list" not in result: raise ValueError("No model available") candidates = [ model for model in result["model_list"] if ( model_name.lower() in model.lower() and StableDiffusionVersion.detect_from_model_name(model) == sd_version ) ] if not candidates: raise ValueError("No suitable model available") return candidates[0] def get_modules(): return requests.get(f"{BASE_URL}/controlnet/module_list").json() def detect(json): return requests.post(BASE_URL+"/controlnet/detect", json=json) ================================================ FILE: tests/web_api/__init__.py ================================================ ================================================ FILE: tests/web_api/animal_pose.json ================================================ { "version": "ap10k", "animals": [ [ 450.2489471435547, 131.68504521623254, 1.0, 392.43172235786915, 129.75780439004302, 1.0, 422.3039551638067, 170.2298617400229, 1.0, 424.2311959899962, 254.06483767926693, 1.0, 460.84877168759704, 416.9166874922812, 0.7048550844192505, 498.42996779829264, 295.50051544234157, 0.742408812046051, 513.8478944078088, 374.5173893161118, 0.763853132724762, 512.884273994714, 438.1163365803659, 1.0, 372.1956936828792, 301.2822379209101, 0.7799525856971741, 384.7227590531111, 381.2627322077751, 0.8117924928665161, 381.8318978138268, 442.9344386458397, 1.0, 553.3563313446939, 327.2999890744686, 0.7031180262565613, 555.2835721708834, 375.48100972920656, 0.6529693603515625, 562.0289150625467, 420.77116914466023, 0.8226040601730347, 409.7768897935748, 359.09946270659566, 0.3695080578327179, 436.75826136022806, 414.0258262529969, 0.6621587872505188, 428.08567764237523, 422.69840997084975, 0.552909255027771 ] ], "canvas_height": 512, "canvas_width": 960 } ================================================ FILE: tests/web_api/clip_mask_test.py ================================================ from .template import ( APITestTemplate, girl_img, mask_img, disable_in_cq, get_model, ) @disable_in_cq def test_clip_mask_txt2img_control(): """No mask control group.""" assert APITestTemplate( "test_clip_mask_txt2img_control", "txt2img", payload_overrides={}, unit_overrides={ "module": "ip-adapter-auto", "model": get_model("ip-adapter_sd15"), "image": girl_img, }, ).exec() @disable_in_cq def test_clip_mask_txt2img_experiment(): """With mask experiment group.""" assert APITestTemplate( "test_clip_mask_txt2img_experiment", "txt2img", payload_overrides={}, unit_overrides={ "module": "ip-adapter-auto", "model": get_model("ip-adapter_sd15"), "image": girl_img, "mask_image": mask_img, }, ).exec() @disable_in_cq def test_clip_mask_img2img(): """CLIP mask should not work in img2img inpaint.""" assert APITestTemplate( "test_clip_mask_img2img", "img2img", payload_overrides={ "init_images": [girl_img], "mask": mask_img, }, unit_overrides={ "module": "ip-adapter-auto", "model": get_model("ip-adapter_sd15"), }, ).exec() ================================================ FILE: tests/web_api/detect_test.py ================================================ import pytest import requests from typing import List from .template import ( APITestTemplate, realistic_girl_face_img, portrait_imgs, girl_img, mask_img, save_base64, get_dest_dir, disable_in_cq, console_log_context, ) def get_modules() -> List[str]: return requests.get(APITestTemplate.BASE_URL + "controlnet/module_list").json()[ "module_list" ] def detect_template(payload, output_name: str, status: int = 200): url = APITestTemplate.BASE_URL + "controlnet/detect" resp = requests.post(url, json=payload) assert resp.status_code == status if status != 200: return resp_json = resp.json() if "images" in resp_json: assert len(resp_json["images"]) == len(payload["controlnet_input_images"]) if not APITestTemplate.is_cq_run: for i, img in enumerate(resp_json["images"]): if img == "Detect result is not image": continue dest = get_dest_dir() / f"{output_name}_{i}.png" save_base64(img, dest) elif "tensor" in resp_json: assert len(resp_json["tensor"]) == len(payload["controlnet_input_images"]) if not APITestTemplate.is_cq_run: for i, tensor_str in enumerate(resp_json["tensor"]): dest = get_dest_dir() / f"{output_name}_{i}.txt" with open(dest, "w") as f: f.write(tensor_str) else: assert False, resp_json return resp_json UNSUPPORTED_PREPROCESSORS = { "clip_vision", "revision_clipvision", "revision_ignore_prompt", "ip-adapter-auto", } INPAINT_PREPROCESSORS = { "inpaint_only", "inpaint", "inpaint_only+lama", } # FAILED extensions/sd-webui-controlnet/tests/web_api/detect_test.py::test_detect_all_modules[depth_zoe] - assert 500 == 200 @disable_in_cq @pytest.mark.parametrize( "module", [ m for m in get_modules() if m not in UNSUPPORTED_PREPROCESSORS.union(INPAINT_PREPROCESSORS) ], ) def test_detect_all_modules(module: str): payload = dict( controlnet_input_images=[realistic_girl_face_img], controlnet_module=module, ) detect_template(payload, f"detect_{module}") @disable_in_cq @pytest.mark.parametrize("module", [m for m in INPAINT_PREPROCESSORS]) def test_inpaint_mask(module: str): payload = dict( controlnet_input_images=[girl_img], controlnet_masks=[mask_img], controlnet_module=module, ) detect_template(payload, f"detect_inpaint_mask_{module}") @disable_in_cq @pytest.mark.parametrize("img_index", [i for i, _ in enumerate(portrait_imgs)]) def test_pulid(img_index: int): """PuLID preprocessor should not memory leak.""" payload = dict( controlnet_input_images=[portrait_imgs[img_index]], controlnet_module="ip-adapter_pulid", ) detect_template(payload, f"detect_pulid_{img_index}") @pytest.mark.parametrize("module", [m for m in UNSUPPORTED_PREPROCESSORS]) def test_unsupported_modules(module: str): payload = dict( controlnet_input_images=[realistic_girl_face_img], controlnet_module=module, ) detect_template(payload, f"detect_{module}", status=422) @pytest.mark.parametrize("module", [m for m in INPAINT_PREPROCESSORS]) def test_mask_error(module: str): payload = dict( controlnet_input_images=[realistic_girl_face_img], controlnet_module=module, ) detect_template(payload, f"mask_error_{module}", status=422) def test_detect_simple(): detect_template( dict( controlnet_input_images=[realistic_girl_face_img], controlnet_module="canny", # Canny does not require model download. ), "simple_detect", ) def test_detect_multiple_inputs(): detect_template( dict( controlnet_input_images=[realistic_girl_face_img, realistic_girl_face_img], controlnet_module="canny", # Canny does not require model download. ), "multiple_inputs_detect", ) def test_detect_with_invalid_module(): detect_template({"controlnet_module": "INVALID"}, "invalid module", 422) def test_detect_with_no_input_images(): detect_template({"controlnet_input_images": []}, "no input images", 422) def test_detect_default_param(): with console_log_context() as log_context: detect_template( dict( controlnet_input_images=[realistic_girl_face_img], controlnet_module="canny", # Canny does not require model download. controlnet_threshold_a=-100, controlnet_threshold_b=-100, controlnet_processor_res=-100, ), "default_param", ) assert log_context.is_in_console_logs( [ "[canny.processor_res] Invalid value(-100), using default value 512.", "[canny.threshold_a] Invalid value(-100.0), using default value 100.", "[canny.threshold_b] Invalid value(-100.0), using default value 200.", ] ) ================================================ FILE: tests/web_api/effective_region_test.py ================================================ from .template import ( APITestTemplate, girl_img, realistic_girl_face_img, living_room_img, mask_left, mask_right, disable_in_cq, get_model, ) @disable_in_cq def test_effective_region_ipadapter(): assert APITestTemplate( "test_effective_region_ipadapter", "txt2img", payload_overrides={ "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality, (2girls: 1.3)", "width": 768, "height": 512, "steps": 20, }, unit_overrides=[ { "module": "ip-adapter-auto", "model": get_model("ip-adapter_sd15"), "image": girl_img, "effective_region_mask": mask_left, }, { "module": "ip-adapter-auto", "model": get_model("ip-adapter_sd15"), "image": realistic_girl_face_img, "effective_region_mask": mask_right, }, ], ).exec() @disable_in_cq def test_effective_region_depth(): assert APITestTemplate( "test_effective_region_depth", "txt2img", payload_overrides={ "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality, A cozy living room", "width": 768, "height": 512, }, unit_overrides={ "module": "depth_midas", "model": get_model("control_v11f1p_sd15_depth"), "image": living_room_img, "effective_region_mask": mask_left, }, ).exec() ================================================ FILE: tests/web_api/full_coverage/README.md ================================================ # Full Coverage Tests Tests that only run locally with all models available. Set environment variable `CONTROLNET_TEST_FULL_COVERAGE` to any value to enable these tests. ================================================ FILE: tests/web_api/full_coverage/__init__.py ================================================ ================================================ FILE: tests/web_api/full_coverage/depth_test.py ================================================ import unittest import pytest from typing import NamedTuple, Optional from .template import ( sd_version, StableDiffusionVersion, is_full_coverage, APITestTemplate, living_room_img, general_negative_prompt, ) base_prompt = "A modern living room" general_depth_modules = [ "depth", "depth_leres", "depth_leres++", "depth_anything", "depth_anything_v2", ] hand_refiner_module = "depth_hand_refiner" general_depth_models = [ "control_sd15_depth_anything [48a4bc3a]", "control_v11f1p_sd15_depth [cfd03158]", "t2iadapter_depth_sd15v2 [3489cd37]", ] hand_refiner_model = "control_sd15_inpaint_depth_hand_fp16 [09456e54]" class TestDepthFullCoverage(unittest.TestCase): def setUp(self): if not is_full_coverage: pytest.skip() # TODO test SDXL. if sd_version == StableDiffusionVersion.SDXL: pytest.skip() def test_depth(self): for module in general_depth_modules: for model in general_depth_models: name = f"depth_txt2img_{module}_{model}" with self.subTest(name=name): self.assertTrue( APITestTemplate( name, "txt2img", payload_overrides={ "prompt": base_prompt, "negative_prompt": general_negative_prompt, "steps": 20, "width": 768, "height": 512, }, unit_overrides={ "module": module, "model": model, "image": living_room_img, }, ).exec(result_only=False) ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/web_api/full_coverage/inpaint_test.py ================================================ import unittest import pytest from .template import ( is_full_coverage, APITestTemplate, girl_img, mask_img, mask_small_img, ) class TestInpaintFullCoverage(unittest.TestCase): def setUp(self): if not is_full_coverage: pytest.skip() def test_inpaint(self): for gen_type in ("img2img", "txt2img"): if gen_type == "img2img": payload = { "init_images": [girl_img], "mask": mask_img, } unit = {} else: payload = {} unit = { "image": { "image": girl_img, "mask": mask_img, } } unit["model"] = "control_v11p_sd15_inpaint [ebff9138]" for i_resize, resize_mode in enumerate( ("Just Resize", "Crop and Resize", "Resize and Fill") ): # Gen 512x768(input image size) for resize. if resize_mode == "Crop and Resize": payload["height"] = 768 payload["width"] = 512 # Gen 512x512 for inner fit. if resize_mode == "Crop and Resize": payload["height"] = 512 payload["width"] = 512 # Gen 768x768 for outer fit. if resize_mode == "Resize and Fill": payload["height"] = 768 payload["width"] = 768 if gen_type == "img2img": payload["resize_mode"] = i_resize else: unit["resize_mode"] = resize_mode for module in ("inpaint_only", "inpaint", "inpaint_only+lama"): unit["module"] = module with self.subTest( gen_type=gen_type, resize_mode=resize_mode, module=module, ): self.assertTrue( APITestTemplate( f"{gen_type}_{resize_mode}_{module}", gen_type, payload_overrides=payload, unit_overrides=unit, ).exec() ) def test_inpaint_no_mask(self): """Inpaint should fail if no mask is provided. Output should not contain ControlNet detected map.""" for gen_type in ("img2img", "txt2img"): if gen_type == "img2img": payload = { "init_images": [girl_img], } unit = {} else: payload = {} unit = { "image": { "image": girl_img, } } unit["model"] = "control_v11p_sd15_inpaint [ebff9138]" unit["module"] = "inpaint_only" with self.subTest(gen_type=gen_type): self.assertTrue( APITestTemplate( f"{gen_type}_no_mask_fail", gen_type, payload_overrides=payload, unit_overrides=unit, ).exec() ) def test_inpaint_double_mask(self): """When mask is provided for both a1111 img2img input and ControlNet unit input, ControlNet input mask should be used.""" self.assertTrue( APITestTemplate( f"img2img_double_mask", "img2img", payload_overrides={ "init_images": [girl_img], "mask": mask_img, }, unit_overrides={ "image": { "image": girl_img, "mask": mask_small_img, }, "model": "control_v11p_sd15_inpaint [ebff9138]", "module": "inpaint", }, ).exec() ) def test_img2img_mask_on_unit(self): """ Usecase for inpaint_global_harmonious. """ self.assertTrue( APITestTemplate( f"img2img_mask_on_unit", "img2img", payload_overrides={ "init_images": [girl_img], }, unit_overrides={ "image": { "image": girl_img, "mask": mask_small_img, }, "model": "control_v11p_sd15_inpaint [ebff9138]", "module": "inpaint", }, ).exec() ) def test_outpaint_without_mask(self): self.assertTrue( APITestTemplate( f"img2img_outpaint_without_mask", "img2img", payload_overrides={ "init_images": [girl_img], "width": 768, "height": 768, "resize_mode": 2, }, unit_overrides={ "model": "control_v11p_sd15_inpaint [ebff9138]", "module": "inpaint_only+lama", }, ).exec() ) self.assertTrue( APITestTemplate( f"txt2img_outpaint_without_mask", "txt2img", payload_overrides={ "width": 768, "height": 768, }, unit_overrides={ "model": "control_v11p_sd15_inpaint [ebff9138]", "module": "inpaint_only+lama", "image": { "image": girl_img, }, "resize_mode": 2, }, ).exec() ) def test_inpaint_crop(self): self.assertTrue( APITestTemplate( "img2img_inpaint_crop", "img2img", payload_overrides={ "init_images": [girl_img], "inpaint_full_res": True, "mask": mask_small_img, }, unit_overrides={ "model": "control_v11p_sd15_canny [d14c016b]", "module": "canny", "inpaint_crop_input_image": True, }, ).exec() ) self.assertTrue( APITestTemplate( "img2img_inpaint_no_crop", "img2img", payload_overrides={ "init_images": [girl_img], "inpaint_full_res": True, "mask": mask_small_img, }, unit_overrides={ "model": "control_v11p_sd15_canny [d14c016b]", "module": "canny", "inpaint_crop_input_image": False, }, ).exec() ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/web_api/full_coverage/ipadapter_test.py ================================================ import unittest import pytest from typing import NamedTuple, Optional from .template import ( sd_version, StableDiffusionVersion, is_full_coverage, APITestTemplate, portrait_imgs, realistic_girl_face_img, general_negative_prompt, ) class AdapterSetting(NamedTuple): module: str model: str lora: Optional[str] = None @property def lora_prompt(self) -> str: return f"" if self.lora else "" # Used to fix pose for better comparison between different settings. openpose_unit = { "module": "openpose", "model": ( "control_v11p_sd15_openpose [cab727d4]" if sd_version != StableDiffusionVersion.SDXL else "kohya_controllllite_xl_openpose_anime [7e5349e5]" ), "image": realistic_girl_face_img, "weight": 0.8, } base_prompt = "1girl, simple background, (white_background: 1.2), portrait" negative_prompts = { "with_neg": general_negative_prompt, "no_neg": "", } sd15_full_face = AdapterSetting( "ip-adapter_clip_sd15", "ip-adapter-full-face_sd15 [852b9843]", ) sd15_plus_face = AdapterSetting( "ip-adapter_clip_sd15", "ip-adapter-plus-face_sd15 [71693645]", ) sd15_normal = AdapterSetting( "ip-adapter_clip_sd15", "ip-adapter_sd15 [6a3f6166]", ) sd15_light = AdapterSetting( "ip-adapter_clip_sd15", "ip-adapter_sd15_light [be1c9b97]", ) sdxl_normal = AdapterSetting( "ip-adapter_clip_sdxl", "ip-adapter_sdxl [d5d53548]" ) sdxl_vit = AdapterSetting( "ip-adapter_clip_sdxl_plus_vith", "ip-adapter_sdxl_vit-h [75a08f84]", ) sdxl_plus_vit = AdapterSetting( "ip-adapter_clip_sdxl_plus_vith", "ip-adapter-plus_sdxl_vit-h [f1f19f7d]", ) sdxl_plus_vit_face = AdapterSetting( "ip-adapter_clip_sdxl_plus_vith", "ip-adapter-plus-face_sdxl_vit-h [c60d7d48]", ) class TestIPAdapterFullCoverage(unittest.TestCase): def setUp(self): if not is_full_coverage: pytest.skip() if sd_version == StableDiffusionVersion.SDXL: self.settings = [ sdxl_normal, sdxl_vit, sdxl_plus_vit, sdxl_plus_vit_face, ] else: self.settings = [ sd15_normal, sd15_light, sd15_plus_face, sd15_full_face, ] def test_adapter(self): for s in self.settings: for n, negative_prompt in negative_prompts.items(): name = f"{s}_{n}" with self.subTest(name=name): self.assertTrue( APITestTemplate( name, "txt2img", payload_overrides={ "prompt": f"{base_prompt},{s.lora_prompt}", "negative_prompt": negative_prompt, "steps": 20, "width": 512, "height": 512, }, unit_overrides=[ { "module": s.module, "model": s.model, "image": realistic_girl_face_img, }, openpose_unit, ], ).exec() ) def test_adapter_multi_inputs(self): for s in self.settings: for n, negative_prompt in negative_prompts.items(): name = f"multi_inputs_{s}_{n}" with self.subTest(name=name): self.assertTrue( APITestTemplate( name=name, gen_type="txt2img", payload_overrides={ "prompt": f"{base_prompt}, {s.lora_prompt}", "negative_prompt": negative_prompt, "steps": 20, "width": 512, "height": 512, }, unit_overrides=[openpose_unit] + [ { "image": img, "module": s.module, "model": s.model, "weight": 1 / len(portrait_imgs), } for img in portrait_imgs ], ).exec() ) def test_adapter_real_multi_inputs(self): for s in self.settings: for n, negative_prompt in negative_prompts.items(): name = f"real_multi_{s}_{n}" with self.subTest(name=name): self.assertTrue( APITestTemplate( name=name, gen_type="txt2img", payload_overrides={ "prompt": f"{base_prompt}, {s.lora_prompt}", "negative_prompt": negative_prompt, "steps": 20, "width": 512, "height": 512, }, unit_overrides=[ openpose_unit, { "image": [{"image": img} for img in portrait_imgs], "module": s.module, "model": s.model, }, ], ).exec() ) sd15_face_id = AdapterSetting( "ip-adapter_face_id", "ip-adapter-faceid_sd15 [0a1757e9]", "ip-adapter-faceid_sd15_lora", ) sd15_face_id_plus = AdapterSetting( "ip-adapter_face_id_plus", "ip-adapter-faceid-plus_sd15 [d86a490f]", "ip-adapter-faceid-plus_sd15_lora", ) sd15_face_id_plus_v2 = AdapterSetting( "ip-adapter_face_id_plus", "ip-adapter-faceid-plusv2_sd15 [6e14fc1a]", "ip-adapter-faceid-plusv2_sd15_lora", ) sd15_face_id_portrait = AdapterSetting( "ip-adapter_face_id", "ip-adapter-faceid-portrait_sd15 [b2609049]", ) sdxl_face_id = AdapterSetting( "ip-adapter_face_id", "ip-adapter-faceid_sdxl [59ee31a3]", "ip-adapter-faceid_sdxl_lora", ) class TestIPAdapterFaceIdFullCoverage(unittest.TestCase): def setUp(self): if not is_full_coverage: pytest.skip() if sd_version == StableDiffusionVersion.SDXL: self.settings = [sdxl_face_id] else: self.settings = [ sd15_face_id, sd15_face_id_plus, sd15_face_id_plus_v2, sd15_face_id_portrait, ] def test_face_id(self): for s in self.settings: for n, negative_prompt in negative_prompts.items(): name = f"{s}_{n}" with self.subTest(name=name): self.assertTrue( APITestTemplate( name, "txt2img", payload_overrides={ "prompt": f"{base_prompt},{s.lora_prompt}", "negative_prompt": negative_prompt, "steps": 20, "width": 512, "height": 512, }, unit_overrides=[ { "module": s.module, "model": s.model, "image": realistic_girl_face_img, }, openpose_unit, ], ).exec() ) def test_face_id_multi_inputs(self): for s in self.settings: for n, negative_prompt in negative_prompts.items(): name = f"multi_inputs_{s}_{n}" with self.subTest(name=name): self.assertTrue( APITestTemplate( name=name, gen_type="txt2img", payload_overrides={ "prompt": f"{base_prompt}, {s.lora_prompt}", "negative_prompt": negative_prompt, "steps": 20, "width": 512, "height": 512, }, unit_overrides=[openpose_unit] + [ { "image": img, "module": s.module, "model": s.model, "weight": 1 / len(portrait_imgs), } for img in portrait_imgs ], ).exec() ) def test_face_id_real_multi_inputs(self): for s in self.settings: for n, negative_prompt in negative_prompts.items(): name = f"real_multi_{s}_{n}" with self.subTest(name=name): self.assertTrue( APITestTemplate( name=name, gen_type="txt2img", payload_overrides={ "prompt": f"{base_prompt}, {s.lora_prompt}", "negative_prompt": negative_prompt, "steps": 20, "width": 512, "height": 512, }, unit_overrides=[ openpose_unit, { "image": [{"image": img} for img in portrait_imgs], "module": s.module, "model": s.model, }, ], ).exec() ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/web_api/full_coverage/template.py ================================================ import io import os import cv2 import base64 from typing import Dict, Any, List, Union, Literal from pathlib import Path import datetime from enum import Enum import numpy as np import requests from PIL import Image PayloadOverrideType = Dict[str, Any] timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") test_result_dir = Path(__file__).parent / "results" / f"test_result_{timestamp}" test_expectation_dir = Path(__file__).parent / "expectations" os.makedirs(test_expectation_dir, exist_ok=True) resource_dir = Path(__file__).parents[2] / "images" def read_image(img_path: Path) -> str: img = cv2.imread(str(img_path)) _, bytes = cv2.imencode(".png", img) encoded_image = base64.b64encode(bytes).decode("utf-8") return encoded_image def read_image_dir(img_dir: Path, suffixes=('.png', '.jpg', '.jpeg', '.webp')) -> List[str]: """Try read all images in given img_dir.""" img_dir = str(img_dir) images = [] for filename in os.listdir(img_dir): if filename.endswith(suffixes): img_path = os.path.join(img_dir, filename) try: images.append(read_image(img_path)) except IOError: print(f"Error opening {img_path}") return images girl_img = read_image(resource_dir / "1girl.png") mask_img = read_image(resource_dir / "mask.png") mask_small_img = read_image(resource_dir / "mask_small.png") portrait_imgs = read_image_dir(resource_dir / "portrait") realistic_girl_face_img = portrait_imgs[0] living_room_img = read_image(resource_dir / "living_room.webp") general_negative_prompt = """ (worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, backlight,(ugly:1.331), (duplicate:1.331), (morbid:1.21), (mutilated:1.21), (tranny:1.331), mutated hands, (poorly drawn hands:1.331), blurry, (bad anatomy:1.21), (bad proportions:1.331), extra limbs, (missing arms:1.331), (extra legs:1.331), (fused fingers:1.61051), (too many fingers:1.61051), (unclear eyes:1.331), bad hands, missing fingers, extra digit, bad body, easynegative, nsfw""" class StableDiffusionVersion(Enum): """The version family of stable diffusion model.""" UNKNOWN = 0 SD1x = 1 SD2x = 2 SDXL = 3 sd_version = StableDiffusionVersion( int(os.environ.get("CONTROLNET_TEST_SD_VERSION", StableDiffusionVersion.SD1x.value)) ) is_full_coverage = os.environ.get("CONTROLNET_TEST_FULL_COVERAGE", None) is not None class APITestTemplate: is_set_expectation_run = os.environ.get("CONTROLNET_SET_EXP", "True") == "True" def __init__( self, name: str, gen_type: Union[Literal["img2img"], Literal["txt2img"]], payload_overrides: PayloadOverrideType, unit_overrides: Union[PayloadOverrideType, List[PayloadOverrideType]], ): self.name = name self.url = "http://localhost:7860/sdapi/v1/" + gen_type self.payload = { **(txt2img_payload if gen_type == "txt2img" else img2img_payload), **payload_overrides, } unit_overrides = ( unit_overrides if isinstance(unit_overrides, (list, tuple)) else [unit_overrides] ) self.payload["alwayson_scripts"]["ControlNet"]["args"] = [ { **default_unit, **unit_override, } for unit_override in unit_overrides ] def exec(self, result_only: bool = True) -> bool: if not APITestTemplate.is_set_expectation_run: os.makedirs(test_result_dir, exist_ok=True) failed = False response = requests.post(url=self.url, json=self.payload).json() if "images" not in response: print(response) return False dest_dir = ( test_expectation_dir if APITestTemplate.is_set_expectation_run else test_result_dir ) results = response["images"][:1] if result_only else response["images"] for i, base64image in enumerate(results): img_file_name = f"{self.name}_{i}.png" Image.open(io.BytesIO(base64.b64decode(base64image.split(",", 1)[0]))).save( dest_dir / img_file_name ) if not APITestTemplate.is_set_expectation_run: try: img1 = cv2.imread(os.path.join(test_expectation_dir, img_file_name)) img2 = cv2.imread(os.path.join(test_result_dir, img_file_name)) except Exception as e: print(f"Get exception reading imgs: {e}") failed = True continue if img1 is None: print(f"Warn: No expectation file found {img_file_name}.") continue if not expect_same_image( img1, img2, diff_img_path=str(test_result_dir / img_file_name.replace(".png", "_diff.png")), ): failed = True return not failed def expect_same_image(img1, img2, diff_img_path: str) -> bool: # Calculate the difference between the two images diff = cv2.absdiff(img1, img2) # Set a threshold to highlight the different pixels threshold = 30 diff_highlighted = np.where(diff > threshold, 255, 0).astype(np.uint8) # Assert that the two images are similar within a tolerance similar = np.allclose(img1, img2, rtol=0.5, atol=1) if not similar: # Save the diff_highlighted image to inspect the differences cv2.imwrite(diff_img_path, diff_highlighted) matching_pixels = np.isclose(img1, img2, rtol=0.5, atol=1) similar_in_general = (matching_pixels.sum() / matching_pixels.size) >= 0.95 return similar_in_general default_unit = { "control_mode": "Balanced", "enabled": True, "guidance_end": 1, "guidance_start": 0, "low_vram": False, "pixel_perfect": True, "processor_res": 512, "resize_mode": "Crop and Resize", "threshold_a": -1, "threshold_b": -1, "weight": 1, } img2img_payload = { "batch_size": 1, "cfg_scale": 7, "height": 768, "width": 512, "n_iter": 1, "steps": 10, "sampler_name": "Euler a", "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality,", "negative_prompt": "", "seed": 42, "seed_enable_extras": False, "seed_resize_from_h": 0, "seed_resize_from_w": 0, "subseed": -1, "subseed_strength": 0, "override_settings": {}, "override_settings_restore_afterwards": False, "do_not_save_grid": False, "do_not_save_samples": False, "s_churn": 0, "s_min_uncond": 0, "s_noise": 1, "s_tmax": None, "s_tmin": 0, "script_args": [], "script_name": None, "styles": [], "alwayson_scripts": {"ControlNet": {"args": [default_unit]}}, "denoising_strength": 0.75, "initial_noise_multiplier": 1, "inpaint_full_res": 0, "inpaint_full_res_padding": 32, "inpainting_fill": 1, "inpainting_mask_invert": 0, "mask_blur_x": 4, "mask_blur_y": 4, "mask_blur": 4, "resize_mode": 0, } txt2img_payload = { "alwayson_scripts": {"ControlNet": {"args": [default_unit]}}, "batch_size": 1, "cfg_scale": 7, "comments": {}, "disable_extra_networks": False, "do_not_save_grid": False, "do_not_save_samples": False, "enable_hr": False, "height": 768, "hr_negative_prompt": "", "hr_prompt": "", "hr_resize_x": 0, "hr_resize_y": 0, "hr_scale": 2, "hr_second_pass_steps": 0, "hr_upscaler": "Latent", "n_iter": 1, "negative_prompt": "", "override_settings": {}, "override_settings_restore_afterwards": True, "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality,", "restore_faces": False, "s_churn": 0.0, "s_min_uncond": 0, "s_noise": 1.0, "s_tmax": None, "s_tmin": 0.0, "sampler_name": "Euler a", "script_args": [], "script_name": None, "seed": 42, "seed_enable_extras": True, "seed_resize_from_h": -1, "seed_resize_from_w": -1, "steps": 10, "styles": [], "subseed": -1, "subseed_strength": 0, "tiling": False, "width": 512, } ================================================ FILE: tests/web_api/generation_test.py ================================================ import pytest from .template import ( APITestTemplate, portrait_imgs, girl_img, mask_img, disable_in_cq, get_model, console_log_context, ) @pytest.mark.parametrize("gen_type", ["img2img", "txt2img"]) def test_no_unit(gen_type): assert APITestTemplate( f"test_no_unit{gen_type}", gen_type, payload_overrides={}, unit_overrides=[], input_image=girl_img, ).exec() @pytest.mark.parametrize("gen_type", ["img2img", "txt2img"]) def test_unrecognized_param(gen_type): assert APITestTemplate( f"test_unrecognized_param_{gen_type}", gen_type, payload_overrides={}, unit_overrides={ "foo": True, "is_ui": False, }, input_image=girl_img, ).exec() @pytest.mark.parametrize("gen_type", ["img2img", "txt2img"]) def test_multiple_iter(gen_type): assert APITestTemplate( f"test_multiple_iter{gen_type}", gen_type, payload_overrides={"n_iter": 2}, unit_overrides={}, input_image=girl_img, ).exec() @pytest.mark.parametrize("gen_type", ["img2img", "txt2img"]) def test_batch_size(gen_type): assert APITestTemplate( f"test_batch_size{gen_type}", gen_type, payload_overrides={"batch_size": 2}, unit_overrides={}, input_image=girl_img, ).exec() @pytest.mark.parametrize("gen_type", ["img2img", "txt2img"]) def test_2_units(gen_type): assert APITestTemplate( f"test_2_units{gen_type}", gen_type, payload_overrides={}, unit_overrides=[{}, {}], input_image=girl_img, ).exec() @pytest.mark.parametrize("gen_type", ["img2img", "txt2img"]) def test_preprocessor(gen_type): assert APITestTemplate( f"test_preprocessor{gen_type}", gen_type, payload_overrides={}, unit_overrides={"module": "canny"}, input_image=girl_img, ).exec() @pytest.mark.parametrize("param_name", ("processor_res", "threshold_a", "threshold_b")) @pytest.mark.parametrize("gen_type", ["img2img", "txt2img"]) def test_invalid_param(gen_type, param_name): with console_log_context() as log_context: assert APITestTemplate( f"test_invalid_param{(gen_type, param_name)}", gen_type, payload_overrides={}, unit_overrides={param_name: -100}, input_image=girl_img, ).exec() number = "-100" if param_name == "processor_res" else "-100.0" assert log_context.is_in_console_logs( [ f"[canny.{param_name}] Invalid value({number}), using default value", ] ) @pytest.mark.parametrize("save_map", [True, False]) @pytest.mark.parametrize("gen_type", ["img2img", "txt2img"]) def test_save_map(gen_type, save_map): assert APITestTemplate( f"test_save_map{(gen_type, save_map)}", gen_type, payload_overrides={}, unit_overrides={"save_detected_map": save_map}, input_image=girl_img, ).exec(expected_output_num=2 if save_map else 1) def test_ip_adapter_face(): assert APITestTemplate( "test_ip_adapter_face", "txt2img", payload_overrides={}, unit_overrides={ "module": "ip-adapter_clip_sd15", "model": get_model("ip-adapter-plus-face_sd15"), }, input_image=girl_img, ).exec() def test_ip_adapter_fullface(): assert APITestTemplate( "test_ip_adapter_fullface", "txt2img", payload_overrides={}, unit_overrides={ "module": "ip-adapter_clip_sd15", "model": get_model("ip-adapter-full-face_sd15"), }, input_image=girl_img, ).exec() def test_control_lora(): assert APITestTemplate( "test_control_lora", "txt2img", payload_overrides={}, unit_overrides={ "module": "canny", "model": get_model("control_lora_rank128_v11p_sd15_canny"), }, input_image=girl_img, ).exec() def test_t2i_adapter(): assert APITestTemplate( "test_t2i_adapter", "txt2img", payload_overrides={}, unit_overrides={ "module": "canny", "model": get_model("t2iadapter_canny_sd15v2"), }, input_image=girl_img, ).exec() def test_reference(): assert APITestTemplate( "test_reference", "txt2img", payload_overrides={}, unit_overrides={ "module": "reference_only", "model": "None", }, input_image=girl_img, ).exec(result_only=False) def test_advanced_weighting(): assert APITestTemplate( "test_advanced_weighting", "txt2img", payload_overrides={}, unit_overrides={"advanced_weighting": [0.75] * 13}, # SD1.5 input_image=girl_img, ).exec() def test_hr_option(): assert APITestTemplate( "test_hr_option", "txt2img", payload_overrides={ "enable_hr": True, "denoising_strength": 0.75, }, unit_overrides={"hr_option": "Both"}, input_image=girl_img, ).exec(expected_output_num=3) def test_hr_option_default(): """In non-hr run, hr_option should be ignored.""" assert APITestTemplate( "test_hr_option_default", "txt2img", payload_overrides={"enable_hr": False}, unit_overrides={"hr_option": "Both"}, input_image=girl_img, ).exec(expected_output_num=2) @disable_in_cq def test_masked_controlnet_txt2img(): assert APITestTemplate( f"test_masked_controlnet_txt2img", "txt2img", payload_overrides={}, unit_overrides={ "image": girl_img, "mask_image": mask_img, }, ).exec() @disable_in_cq def test_masked_controlnet_img2img(): assert APITestTemplate( f"test_masked_controlnet_img2img", "img2img", payload_overrides={ "init_images": [girl_img], }, # Note: Currently you must give ControlNet unit input image to specify # mask. # TODO: Fix this for img2img. unit_overrides={ "image": girl_img, "mask_image": mask_img, }, ).exec() @disable_in_cq def test_txt2img_inpaint(): assert APITestTemplate( "txt2img_inpaint", "txt2img", payload_overrides={}, unit_overrides={ "image": girl_img, "mask_image": mask_img, "model": get_model("v11p_sd15_inpaint"), "module": "inpaint_only", }, ).exec() @disable_in_cq def test_img2img_inpaint(): assert APITestTemplate( "img2img_inpaint", "img2img", payload_overrides={ "init_images": [girl_img], "mask": mask_img, }, unit_overrides={ "model": get_model("v11p_sd15_inpaint"), "module": "inpaint_only", }, ).exec() @disable_in_cq def test_lama_outpaint(): assert APITestTemplate( "txt2img_lama_outpaint", "txt2img", payload_overrides={ "width": 768, "height": 768, }, # Outpaint should not need a mask. unit_overrides={ "image": girl_img, "model": get_model("v11p_sd15_inpaint"), "module": "inpaint_only+lama", "resize_mode": "Resize and Fill", # OUTER_FIT }, ).exec(result_only=False) @disable_in_cq def test_ip_adapter_auto(): with console_log_context() as log_context: assert APITestTemplate( "txt2img_ip_adapter_auto", "txt2img", payload_overrides={}, unit_overrides={ "image": girl_img, "model": get_model("ip-adapter_sd15"), "module": "ip-adapter-auto", }, ).exec() assert log_context.is_in_console_logs(["ip-adapter-auto => ip-adapter_clip_h"]) @disable_in_cq @pytest.mark.parametrize("img_index", [i for i, _ in enumerate(portrait_imgs)]) def test_pulid(img_index: int): """PuLID should not memory leak.""" assert APITestTemplate( f"txt2img_pulid_{img_index}", "txt2img", payload_overrides={ "width": 768, "height": 768, }, unit_overrides={ "image": portrait_imgs[img_index], "model": get_model("ip-adapter_pulid_sdxl_fp16"), "module": "ip-adapter_pulid", }, ).exec() ================================================ FILE: tests/web_api/ipadapter_advanced_weighting.py ================================================ from .template import ( APITestTemplate, realistic_girl_face_img, disable_in_cq, get_model, ) @disable_in_cq def test_ipadapter_advanced_weighting(): weights = [0.0] * 16 # 16 weights for SD15 / 11 weights for SDXL # SD15 composition weights[4] = 0.25 weights[5] = 1.0 APITestTemplate( "test_ipadapter_advanced_weighting", "txt2img", payload_overrides={ "width": 512, "height": 512, }, unit_overrides={ "image": realistic_girl_face_img, "module": "ip-adapter-auto", "model": get_model("ip-adapter_sd15"), "advanced_weighting": weights, }, ).exec() APITestTemplate( "test_ipadapter_advanced_weighting_ref", "txt2img", payload_overrides={ "width": 512, "height": 512, }, unit_overrides={ "image": realistic_girl_face_img, "module": "ip-adapter-auto", "model": get_model("ip-adapter_sd15"), }, ).exec() ================================================ FILE: tests/web_api/ipadapter_clip_api.py ================================================ import requests from .template import ( APITestTemplate, realistic_girl_face_img, girl_img, mask_img, disable_in_cq, get_model, ) def detect_template(payload, status: int = 200): url = APITestTemplate.BASE_URL + "controlnet/detect" resp = requests.post(url, json=payload) assert resp.status_code == status if status != 200: return resp_json = resp.json() assert "tensor" in resp_json assert len(resp_json["tensor"]) == len(payload["controlnet_input_images"]) return resp_json @disable_in_cq def test_ipadapter_clip_api(): """Use previously saved CLIP output in ipadapter run.""" resp = detect_template( dict( controlnet_input_images=[realistic_girl_face_img], controlnet_module="ip-adapter_clip_h", ) ) ipadapter_input = resp["tensor"] APITestTemplate( "test_ipadapter_clip_api", "txt2img", payload_overrides={}, unit_overrides={ "ipadapter_input": ipadapter_input, "model": get_model("ip-adapter_sd15"), }, ).exec() @disable_in_cq def test_ipadapter_clip_api_mask(): resp = detect_template( dict( controlnet_input_images=[girl_img], controlnet_masks=[mask_img], controlnet_module="ip-adapter_clip_h", ) ) ipadapter_input = resp["tensor"] APITestTemplate( "test_ipadapter_clip_api_mask", "txt2img", payload_overrides={}, unit_overrides={ "ipadapter_input": ipadapter_input, "model": get_model("ip-adapter_sd15"), }, ).exec() ================================================ FILE: tests/web_api/modules_test.py ================================================ import pytest import requests from .template import APITestTemplate expected_module_names = { "animal_openpose", "anime_face_segment", "blur_gaussian", "canny", "clip_vision", "color", "densepose", "densepose_parula", "depth", "depth_anything", "depth_anything_v2", "depth_hand_refiner", "depth_leres", "depth_leres++", "depth_zoe", "dw_openpose_full", "hed", "hed_safe", "inpaint", "inpaint_only", "inpaint_only+lama", "instant_id_face_embedding", "instant_id_face_keypoints", "invert", "ip-adapter-auto", "ip-adapter_clip_sd15", "ip-adapter_clip_sdxl", "ip-adapter_clip_sdxl_plus_vith", "ip-adapter_face_id", "ip-adapter_face_id_plus", "lineart", "lineart_anime", "lineart_anime_denoise", "lineart_coarse", "lineart_standard", "mediapipe_face", "mlsd", "none", "normal_bae", "normal_dsine", "normal_map", "oneformer_ade20k", "oneformer_coco", "openpose", "openpose_face", "openpose_faceonly", "openpose_full", "openpose_hand", "pidinet", "pidinet_safe", "pidinet_scribble", "pidinet_sketch", "recolor_intensity", "recolor_luminance", "reference_adain", "reference_adain+attn", "reference_only", "revision_clipvision", "revision_ignore_prompt", "scribble_hed", "scribble_xdog", "segmentation", "shuffle", "softedge_teed", "threshold", "tile_colorfix", "tile_colorfix+sharp", "tile_resample", } # Display name (label) expected_module_alias = { "animal_openpose", "blur_gaussian", "canny", "densepose (pruple bg & purple torso)", "densepose_parula (black bg & blue torso)", "depth_anything", "depth_anything_v2", "depth_hand_refiner", "depth_leres", "depth_leres++", "depth_midas", "depth_zoe", "dw_openpose_full", "inpaint_global_harmonious", "inpaint_only", "inpaint_only+lama", "instant_id_face_embedding", "instant_id_face_keypoints", "invert (from white bg & black line)", "ip-adapter-auto", "ip-adapter_clip_g", "ip-adapter_clip_h", "ip-adapter_clip_sdxl_plus_vith", "ip-adapter_face_id", "ip-adapter_face_id_plus", "lineart_anime", "lineart_anime_denoise", "lineart_coarse", "lineart_realistic", "lineart_standard (from white bg & black line)", "mediapipe_face", "mlsd", "none", "normal_bae", "normal_dsine", "normal_midas", "openpose", "openpose_face", "openpose_faceonly", "openpose_full", "openpose_hand", "recolor_intensity", "recolor_luminance", "reference_adain", "reference_adain+attn", "reference_only", "revision_clipvision", "revision_ignore_prompt", "scribble_hed", "scribble_pidinet", "scribble_xdog", "seg_anime_face", "seg_ofade20k", "seg_ofcoco", "seg_ufade20k", "shuffle", "softedge_hed", "softedge_hedsafe", "softedge_pidinet", "softedge_pidisafe", "softedge_teed", "t2ia_color_grid", "t2ia_sketch_pidi", "t2ia_style_clipvision", "threshold", "tile_colorfix", "tile_colorfix+sharp", "tile_resample", } @pytest.mark.parametrize("alias", ("true", "false")) def test_module_list(alias): json_resp = requests.get( APITestTemplate.BASE_URL + f"controlnet/module_list?alias_names={alias}" ).json() module_list = json_resp["module_list"] module_detail: dict = json_resp["module_detail"] expected_list = expected_module_alias if alias == "true" else expected_module_names assert set(module_list).issuperset(expected_list), expected_list - set(module_list) assert set(module_list) == set(module_detail.keys()) assert module_detail["canny"] == dict( model_free=False, sliders=[ { "name": "Resolution", "value": 512, "min": 64, "max": 2048, "step": 8, }, {"name": "Low Threshold", "value": 100, "min": 1, "max": 255, "step": 1}, {"name": "High Threshold", "value": 200, "min": 1, "max": 255, "step": 1}, ], ) def test_control_types(): json_resp = requests.get( APITestTemplate.BASE_URL + f"controlnet/control_types" ).json() assert "control_types" in json_resp actual_control_types = set(json_resp["control_types"].keys()) expected_control_types = { "Canny", "Revision", "Inpaint", "Instant-ID", "InstructP2P", "Shuffle", "Scribble", "SparseCtrl", "MLSD", "OpenPose", "Depth", "All", "Lineart", "SoftEdge", "NormalMap", "IP-Adapter", "Reference", "T2I-Adapter", "Segmentation", "Tile", "Recolor", } assert actual_control_types.issuperset(expected_control_types), ( expected_control_types - actual_control_types ) ================================================ FILE: tests/web_api/pose.json ================================================ { "people": [ { "pose_keypoints_2d": [ 275.2506064884899, 196.32469357280343, 1, 303.3188016469506, 272.70982071889466, 1, 244.98447950024644, 292.09994638829477, 1, 236.38292745027104, 517.7037729015278, 1, 168.3984479500246, 418.0022744632577, 1, 412.90526047751445, 257.2121425016039, 1, 403.17894535813576, 510.14290732520925, 1, 294.43481869004336, 376.4781345848482, 1, 265.25747216047955, 562.5137576822758, 1, 0, 0, 0, 0, 0, 0, 359.0078938961762, 562.0711206495608, 1, 0, 0, 0, 0, 0, 0, 240.097671925037, 184.513914838073, 1, 308.33409148775263, 161.22089906296208, 1, 204.7558201874076, 213.05887067565308, 1, 366.61934701298674, 148.9278832878512, 1 ], "hand_right_keypoints_2d": [ 168.39790150130915, 418.0005271461072, 1, 181.79401055357368, 399.3767976307846, 1, 184.8627576873498, 384.75709227716266, 1, 198.118414869015, 381.90007483819153, 1, 215.15048903799024, 386.8527024571639, 1, 180.1325238303079, 346.88417988394843, 1, 178.10795487568174, 321.1018085790239, 1, 190.70710955474613, 320.5669160600145, 1, 203.3062827659017, 325.5456061326966, 1, 172.3536896669116, 350.7525276652412, 1, 170.000622912838, 325.0336533262108, 1, 185.70972426755964, 323.476117950832, 1, 208.50724912413568, 333.4635128502484, 1, 163.50256975319706, 356.1325737292637, 1, 162.59147123450128, 335.0197021116338, 1, 183.9828354600188, 328.4553078224726, 1, 201.57171021013423, 337.94383954551654, 1, 152.9805889462092, 357.94403689402753, 1, 167.09651929267878, 341.7437601311937, 1, 180.9402668216081, 337.10207327446744, 1, 194.60028340222678, 343.0449246874465, 1 ], "hand_left_keypoints_2d": [ 294.4393772120137, 376.476024395234, 1, 271.70933825161165, 384.48117305399165, 1, 257.2452829806548, 374.58948859472207, 1, 238.26122936397638, 375.2887100029166, 1, 219.89983184668415, 382.69322630254595, 1, 263.0323651487124, 320.1279349241104, 1, 246.94602107917282, 309.8099960810156, 1, 233.73717716804694, 314.1485136789638, 1, 224.27755744411303, 322.7892154545116, 1, 264.97558037166135, 334.6319791090978, 1, 254.35598193615226, 315.5629746257517, 1, 238.25810853722876, 321.2812182403252, 1, 223.57727818251382, 328.39525394113423, 1, 278.15831452661644, 337.1533682086847, 1, 265.12624946042416, 323.3619430418993, 1, 250.5919197031302, 325.30525908324694, 1, 235.2500911122877, 332.6721359855453, 1, 285.9427695830851, 341.50671458478496, 1, 274.50497773130155, 333.1376809270594, 1, 261.49768784257105, 328.7012203257942, 1, 248.90495501067193, 332.0535195828255, 1 ] } ], "canvas_width": 512, "canvas_height": 512 } ================================================ FILE: tests/web_api/render_openpose_json.py ================================================ import requests import unittest import importlib import json from pathlib import Path utils = importlib.import_module("extensions.sd-webui-controlnet.tests.utils", "utils") def render(poses): return requests.post( utils.BASE_URL + "/controlnet/render_openpose_json", json=poses ).json() with open(Path(__file__).parent / "pose.json", "r") as f: pose = json.load(f) with open(Path(__file__).parent / "animal_pose.json", "r") as f: animal_pose = json.load(f) class TestDetectEndpointWorking(unittest.TestCase): def test_render_single(self): res = render([pose]) self.assertEqual(res["info"], "Success") self.assertEqual(len(res["images"]), 1) def test_render_multiple(self): res = render([pose, pose]) self.assertEqual(res["info"], "Success") self.assertEqual(len(res["images"]), 2) def test_render_no_pose(self): res = render([]) self.assertNotEqual(res["info"], "Success") def test_render_invalid_pose(self): res = render([{"foo": 10, "bar": 100}]) self.assertNotIn("info", res) self.assertNotIn("images", res) def test_render_animals(self): res = render([animal_pose]) self.assertEqual(res["info"], "Success") self.assertEqual(len(res["images"]), 1) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/web_api/template.py ================================================ import io import os import cv2 import base64 import functools from typing import Dict, Any, List, Union, Literal, Optional from pathlib import Path import datetime from enum import Enum import numpy as np import pytest from contextlib import contextmanager import requests from PIL import Image def disable_in_cq(func): """Skips the decorated test func in CQ run.""" @functools.wraps(func) def wrapped_func(*args, **kwargs): if APITestTemplate.is_cq_run: pytest.skip() return func(*args, **kwargs) return wrapped_func PayloadOverrideType = Dict[str, Any] timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") test_result_dir = Path(__file__).parent / "results" / f"test_result_{timestamp}" test_expectation_dir = Path(__file__).parent / "expectations" os.makedirs(test_expectation_dir, exist_ok=True) resource_dir = Path(__file__).parents[1] / "images" def get_dest_dir(): if APITestTemplate.is_set_expectation_run: return test_expectation_dir else: return test_result_dir def save_base64(base64img: str, dest: Path): Image.open(io.BytesIO(base64.b64decode(base64img.split(",", 1)[0]))).save(dest) def read_image(img_path: Path) -> str: img = cv2.imread(str(img_path)) _, bytes = cv2.imencode(".png", img) encoded_image = base64.b64encode(bytes).decode("utf-8") return encoded_image def read_image_dir( img_dir: Path, suffixes=(".png", ".jpg", ".jpeg", ".webp") ) -> List[str]: """Try read all images in given img_dir.""" img_dir = str(img_dir) images = [] for filename in os.listdir(img_dir): if filename.endswith(suffixes): img_path = os.path.join(img_dir, filename) try: images.append(read_image(img_path)) except IOError: print(f"Error opening {img_path}") return images girl_img = read_image(resource_dir / "1girl.png") mask_img = read_image(resource_dir / "mask.png") mask_small_img = read_image(resource_dir / "mask_small.png") mask_left = read_image(resource_dir / "mask_left.png") mask_right = read_image(resource_dir / "mask_right.png") portrait_imgs = read_image_dir(resource_dir / "portrait") realistic_girl_face_img = portrait_imgs[0] living_room_img = read_image(resource_dir / "living_room.webp") general_negative_prompt = """ (worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, backlight,(ugly:1.331), (duplicate:1.331), (morbid:1.21), (mutilated:1.21), (tranny:1.331), mutated hands, (poorly drawn hands:1.331), blurry, (bad anatomy:1.21), (bad proportions:1.331), extra limbs, (missing arms:1.331), (extra legs:1.331), (fused fingers:1.61051), (too many fingers:1.61051), (unclear eyes:1.331), bad hands, missing fingers, extra digit, bad body, easynegative, nsfw""" class StableDiffusionVersion(Enum): """The version family of stable diffusion model.""" UNKNOWN = 0 SD1x = 1 SD2x = 2 SDXL = 3 sd_version = StableDiffusionVersion( int(os.environ.get("CONTROLNET_TEST_SD_VERSION", StableDiffusionVersion.SD1x.value)) ) is_full_coverage = os.environ.get("CONTROLNET_TEST_FULL_COVERAGE", None) is not None class APITestTemplate: is_set_expectation_run = os.environ.get("CONTROLNET_SET_EXP", "True") == "True" is_cq_run = os.environ.get("FORGE_CQ_TEST", "False") == "True" BASE_URL = "http://localhost:7860/" def __init__( self, name: str, gen_type: Union[Literal["img2img"], Literal["txt2img"]], payload_overrides: PayloadOverrideType, unit_overrides: Union[PayloadOverrideType, List[PayloadOverrideType]], input_image: Optional[str] = None, ): self.name = name self.url = APITestTemplate.BASE_URL + "sdapi/v1/" + gen_type self.payload = { **(txt2img_payload if gen_type == "txt2img" else img2img_payload), **payload_overrides, } if gen_type == "img2img" and input_image is not None: self.payload["init_images"] = [input_image] # CQ runs on CPU. Reduce steps/width/height to increase test speed. if APITestTemplate.is_cq_run: if "steps" not in payload_overrides: self.payload["steps"] = 3 if "width" not in payload_overrides: self.payload["width"] = 64 if "height" not in payload_overrides: self.payload["height"] = 64 unit_overrides = ( unit_overrides if isinstance(unit_overrides, (list, tuple)) else [unit_overrides] ) self.payload["alwayson_scripts"]["ControlNet"]["args"] = [ { **default_unit, **unit_override, **( {"image": input_image} if gen_type == "txt2img" and input_image is not None else {} ), } for unit_override in unit_overrides ] self.active_unit_count = len(unit_overrides) def exec(self, *args, **kwargs) -> bool: if APITestTemplate.is_cq_run: return self.exec_cq(*args, **kwargs) else: return self.exec_local(*args, **kwargs) def exec_cq( self, expected_output_num: Optional[int] = None, *args, **kwargs ) -> bool: """Execute test in CQ environment.""" res = requests.post(url=self.url, json=self.payload) if res.status_code != 200: print(f"Unexpected status code {res.status_code}") return False response = res.json() if "images" not in response: print(response.keys()) return False if expected_output_num is None: expected_output_num = ( self.payload["n_iter"] * self.payload["batch_size"] + self.active_unit_count ) if len(response["images"]) != expected_output_num: print(f"{len(response['images'])} != {expected_output_num}") return False return True def exec_local(self, result_only: bool = True, *args, **kwargs) -> bool: """Execute test in local environment.""" if not APITestTemplate.is_set_expectation_run: os.makedirs(test_result_dir, exist_ok=True) failed = False response = requests.post(url=self.url, json=self.payload).json() if "images" not in response: print(response.keys()) return False dest_dir = get_dest_dir() results = response["images"][:1] if result_only else response["images"] for i, base64image in enumerate(results): img_file_name = f"{self.name}_{i}.png" save_base64(base64image, dest_dir / img_file_name) if not APITestTemplate.is_set_expectation_run: try: img1 = cv2.imread(os.path.join(test_expectation_dir, img_file_name)) img2 = cv2.imread(os.path.join(test_result_dir, img_file_name)) except Exception as e: print(f"Get exception reading imgs: {e}") failed = True continue if img1 is None: print(f"Warn: No expectation file found {img_file_name}.") continue if not expect_same_image( img1, img2, diff_img_path=str( test_result_dir / img_file_name.replace(".png", "_diff.png") ), ): failed = True return not failed def expect_same_image(img1, img2, diff_img_path: str) -> bool: # Calculate the difference between the two images diff = cv2.absdiff(img1, img2) # Set a threshold to highlight the different pixels threshold = 30 diff_highlighted = np.where(diff > threshold, 255, 0).astype(np.uint8) # Assert that the two images are similar within a tolerance similar = np.allclose(img1, img2, rtol=0.5, atol=1) if not similar: # Save the diff_highlighted image to inspect the differences cv2.imwrite(diff_img_path, diff_highlighted) matching_pixels = np.isclose(img1, img2, rtol=0.5, atol=1) similar_in_general = (matching_pixels.sum() / matching_pixels.size) >= 0.95 return similar_in_general @contextmanager def console_log_context(output_file="output.txt"): log_encoding = "utf-8" if APITestTemplate.is_cq_run else "utf-16" class Context: def __init__(self, output_file) -> None: self.output_file = output_file self.init_line_count = 0 with open(self.output_file, "r", encoding=log_encoding) as file: for _ in file: self.init_line_count += 1 def is_in_console_logs(self, expected_lines: List[str]) -> bool: with open(self.output_file, "r", encoding=log_encoding) as file: for i, line in enumerate(file): if not expected_lines: break if i < self.init_line_count: continue if expected_lines[0] in line: expected_lines.pop(0) return len(expected_lines) == 0 yield Context(output_file) def get_model(model_name: str) -> str: """Find an available model with specified model name.""" if model_name.lower() == "none": return "None" r = requests.get(APITestTemplate.BASE_URL + "controlnet/model_list") result = r.json() if "model_list" not in result: raise ValueError(f"No model available\n{result}") candidates = [ model for model in result["model_list"] if model_name.lower() in model.lower() ] if not candidates: raise ValueError("No suitable model available") return candidates[0] default_unit = { "control_mode": "Balanced", "enabled": True, "guidance_end": 1, "guidance_start": 0, "pixel_perfect": True, "processor_res": 512, "resize_mode": "Crop and Resize", "threshold_a": -1, "threshold_b": -1, "weight": 1, "module": "canny", "model": get_model("sd15_canny"), } img2img_payload = { "batch_size": 1, "cfg_scale": 7, "height": 768, "width": 512, "n_iter": 1, "steps": 10, "sampler_name": "Euler a", "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality,", "negative_prompt": "", "seed": 42, "seed_enable_extras": False, "seed_resize_from_h": 0, "seed_resize_from_w": 0, "subseed": -1, "subseed_strength": 0, "override_settings": {}, "override_settings_restore_afterwards": False, "do_not_save_grid": False, "do_not_save_samples": False, "s_churn": 0, "s_min_uncond": 0, "s_noise": 1, "s_tmax": None, "s_tmin": 0, "script_args": [], "script_name": None, "styles": [], "alwayson_scripts": {"ControlNet": {"args": [default_unit]}}, "denoising_strength": 0.75, "initial_noise_multiplier": 1, "inpaint_full_res": 0, "inpaint_full_res_padding": 32, "inpainting_fill": 1, "inpainting_mask_invert": 0, "mask_blur_x": 4, "mask_blur_y": 4, "mask_blur": 4, "resize_mode": 0, } txt2img_payload = { "alwayson_scripts": {"ControlNet": {"args": [default_unit]}}, "batch_size": 1, "cfg_scale": 7, "comments": {}, "disable_extra_networks": False, "do_not_save_grid": False, "do_not_save_samples": False, "enable_hr": False, "height": 768, "hr_negative_prompt": "", "hr_prompt": "", "hr_resize_x": 0, "hr_resize_y": 0, "hr_scale": 2, "hr_second_pass_steps": 0, "hr_upscaler": "Latent", "n_iter": 1, "negative_prompt": "", "override_settings": {}, "override_settings_restore_afterwards": True, "prompt": "(masterpiece: 1.3), (highres: 1.3), best quality,", "restore_faces": False, "s_churn": 0.0, "s_min_uncond": 0, "s_noise": 1.0, "s_tmax": None, "s_tmin": 0.0, "sampler_name": "Euler a", "script_args": [], "script_name": None, "seed": 42, "seed_enable_extras": True, "seed_resize_from_h": -1, "seed_resize_from_w": -1, "steps": 10, "styles": [], "subseed": -1, "subseed_strength": 0, "tiling": False, "width": 512, } ================================================ FILE: unit_tests/__init__.py ================================================ ================================================ FILE: unit_tests/args_test.py ================================================ import pytest import torch import numpy as np from copy import copy from dataclasses import dataclass from internal_controlnet.args import ControlNetUnit H = W = 128 img1 = np.ones(shape=[H, W, 3], dtype=np.uint8) img2 = np.ones(shape=[H, W, 3], dtype=np.uint8) * 2 mask_diff = np.ones(shape=[H - 1, W - 1, 3], dtype=np.uint8) * 2 mask_2d = np.ones(shape=[H, W], dtype=np.uint8) img_bad_channel = np.ones(shape=[H, W, 2], dtype=np.uint8) * 2 img_bad_dim = np.ones(shape=[1, H, W, 3], dtype=np.uint8) * 2 ui_img_diff = np.ones(shape=[H - 1, W - 1, 4], dtype=np.uint8) * 2 ui_img = np.ones(shape=[H, W, 4], dtype=np.uint8) tensor1 = torch.zeros(size=[1, 1], dtype=torch.float16) @pytest.fixture(scope="module") def set_cls_funcs(): ControlNetUnit.cls_match_model = lambda s: s in { "None", "model1", "model2", "control_v11p_sd15_inpaint [ebff9138]", } ControlNetUnit.cls_match_module = lambda s: s in { "none", "module1", "inpaint_only+lama", } ControlNetUnit.cls_decode_base64 = lambda s: { "b64img1": img1, "b64img2": img2, "b64mask_diff": mask_diff, }[s] ControlNetUnit.cls_torch_load_base64 = lambda s: { "b64tensor1": tensor1, }[s] ControlNetUnit.cls_get_preprocessor = lambda s: { "module1": MockPreprocessor(), "none": MockPreprocessor(), "inpaint_only+lama": MockPreprocessor(), }[s] def test_module_invalid(set_cls_funcs): with pytest.raises(ValueError) as excinfo: ControlNetUnit(module="foo") assert "module(foo) not found in supported modules." in str(excinfo.value) def test_module_valid(set_cls_funcs): ControlNetUnit(module="module1") def test_model_invalid(set_cls_funcs): with pytest.raises(ValueError) as excinfo: ControlNetUnit(model="foo") assert "model(foo) not found in supported models." in str(excinfo.value) def test_model_valid(set_cls_funcs): ControlNetUnit(model="model1") @pytest.mark.parametrize( "d", [ # API dict(image={"image": "b64img1"}), dict(image={"image": "b64img1", "mask": "b64img2"}), dict(image=["b64img1", "b64img2"]), dict(image=("b64img1", "b64img2")), dict(image=[{"image": "b64img1", "mask": "b64img2"}]), dict(image=[{"image": "b64img1"}]), dict(image=[{"image": "b64img1", "mask": None}]), dict( image=[ {"image": "b64img1", "mask": "b64img2"}, {"image": "b64img1", "mask": "b64img2"}, ] ), dict( image=[ {"image": "b64img1", "mask": None}, {"image": "b64img1", "mask": "b64img2"}, ] ), dict( image=[ {"image": "b64img1"}, {"image": "b64img1", "mask": "b64img2"}, ] ), dict(image="b64img1", mask="b64img2"), dict(image="b64img1"), dict(image="b64img1", mask_image="b64img2"), dict(image=None), # UI dict(image=dict(image=img1)), dict(image=dict(image=img1, mask=img2)), # Grey-scale mask/image should be accepted. dict(image=dict(image=img1, mask=mask_2d)), dict(image=img1, mask=mask_2d), dict(image=np.zeros(shape=[H, W, 1], dtype=np.uint8)), # RGBA image input should be accepted. dict(image=np.zeros(shape=[H, W, 4], dtype=np.uint8)), ], ) def test_valid_image_formats(set_cls_funcs, d): ControlNetUnit(**d) unit = ControlNetUnit.from_dict(d) unit.get_input_images_rgba() @pytest.mark.parametrize( "d", [ dict(image={"mask": "b64img1"}), dict(image={"foo": "b64img1", "bar": "b64img2"}), dict(image=["b64img1"]), dict(image=("b64img1", "b64img2", "b64img1")), dict(image=[]), dict(image=[{"mask": "b64img1"}]), dict(image=None, mask="b64img2"), # image & mask have different H x W dict(image="b64img1", mask="b64mask_diff"), ], ) def test_invalid_image_formats(set_cls_funcs, d): # Setting field will be fine. ControlNetUnit(**d) unit = ControlNetUnit.from_dict(d) # Error on eval. with pytest.raises((ValueError, AssertionError)): unit.get_input_images_rgba() def test_mask_alias_conflict(): with pytest.raises((ValueError, AssertionError)): ControlNetUnit.from_dict( dict( image="b64img1", mask="b64img1", mask_image="b64img1", ) ), def test_resize_mode(): ControlNetUnit(resize_mode="Just Resize") # Alias should also work. For deforum # See https://github.com/deforum-art/sd-webui-deforum/blob/322426851408ebca2cd49492bfeb1ec86e1dc869/scripts/deforum_helpers/deforum_controlnet.py#L150 ControlNetUnit(resize_mode="Inner Fit (Scale to Fit)") def test_weight(): ControlNetUnit(weight=0.5) ControlNetUnit(weight=0.0) with pytest.raises(ValueError): ControlNetUnit(weight=-1) with pytest.raises(ValueError): ControlNetUnit(weight=100) def test_start_end(): ControlNetUnit(guidance_start=0.0, guidance_end=1.0) ControlNetUnit(guidance_start=0.5, guidance_end=1.0) ControlNetUnit(guidance_start=0.5, guidance_end=0.5) with pytest.raises(ValueError): ControlNetUnit(guidance_start=1.0, guidance_end=0.0) with pytest.raises(ValueError): ControlNetUnit(guidance_start=11) with pytest.raises(ValueError): ControlNetUnit(guidance_end=11) def test_effective_region_mask(): ControlNetUnit(effective_region_mask="b64img1") ControlNetUnit(effective_region_mask=None) ControlNetUnit(effective_region_mask=img1) with pytest.raises(ValueError): ControlNetUnit(effective_region_mask=124) def test_ipadapter_input(): ControlNetUnit(ipadapter_input=["b64tensor1"]) ControlNetUnit(ipadapter_input="b64tensor1") ControlNetUnit(ipadapter_input=None) with pytest.raises(ValueError): ControlNetUnit(ipadapter_input=[]) @dataclass class MockSlider: value: float = 1 minimum: float = 0 maximum: float = 2 @dataclass class MockPreprocessor: slider_resolution = MockSlider() slider_1 = MockSlider() slider_2 = MockSlider() def test_preprocessor_sliders(): unit = ControlNetUnit(enabled=True, module="none") assert unit.processor_res == 1 assert unit.threshold_a == 1 assert unit.threshold_b == 1 def test_preprocessor_sliders_disabled(): unit = ControlNetUnit(enabled=False, module="none") assert unit.processor_res == -1 assert unit.threshold_a == -1 assert unit.threshold_b == -1 def test_infotext_parsing(): infotext = ( "Module: inpaint_only+lama, Model: control_v11p_sd15_inpaint [ebff9138], Weight: 1, " "Resize Mode: Resize and Fill, Low Vram: False, Guidance Start: 0, Guidance End: 1, " "Pixel Perfect: True, Control Mode: Balanced" ) assert ControlNetUnit( enabled=True, module="inpaint_only+lama", model="control_v11p_sd15_inpaint [ebff9138]", weight=1, resize_mode="Resize and Fill", low_vram=False, guidance_start=0, guidance_end=1, pixel_perfect=True, control_mode="Balanced", ) == ControlNetUnit.parse(infotext) def test_alias(): ControlNetUnit.from_dict({"lowvram": True}) def test_copy(): unit1 = ControlNetUnit(enabled=True, module="none") unit2 = copy(unit1) unit2.enabled = False assert unit1.enabled assert not unit2.enabled ================================================ FILE: web_tests/README.md ================================================ # Web Tests Web tests are selenium-based browser interaction tests, that fully simulate actual user's behaviours. # Preparation - Have Google Chrome (Any version) installed. - Install following python packages with `pip`: - `selenium` - `webdriver-manager` # Run Tests - Have WebUI with ControlNet installed running on `localhost:7860` - Run `python main.py --overwrite_expectation` for the first run to set a baseline. - Run `python main.py` later to verify the baseline still holds. ================================================ FILE: web_tests/main.py ================================================ import argparse import unittest import os import sys import time import datetime from enum import Enum from typing import List, Tuple import cv2 import requests import numpy as np from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.support import expected_conditions as EC from webdriver_manager.chrome import ChromeDriverManager TIMEOUT = 20 # seconds CWD = os.getcwd() SKI_IMAGE = os.path.join(CWD, "images/ski.jpg") timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") test_result_dir = os.path.join("results", f"test_result_{timestamp}") test_expectation_dir = "expectations" os.makedirs(test_result_dir, exist_ok=True) os.makedirs(test_expectation_dir, exist_ok=True) driver_path = ChromeDriverManager().install() class GenType(Enum): txt2img = "txt2img" img2img = "img2img" def _find_by_xpath(self, driver: webdriver.Chrome, xpath: str) -> "WebElement": return driver.find_element(By.XPATH, xpath) def tab(self, driver: webdriver.Chrome) -> "WebElement": return self._find_by_xpath( driver, f"//*[@id='tabs']/*[contains(@class, 'tab-nav')]//button[text()='{self.value}']", ) def controlnet_panel(self, driver: webdriver.Chrome) -> "WebElement": return self._find_by_xpath( driver, f"//*[@id='tab_{self.value}']//*[@id='controlnet']" ) def generate_button(self, driver: webdriver.Chrome) -> "WebElement": return self._find_by_xpath(driver, f"//*[@id='{self.value}_generate_box']") def prompt_textarea(self, driver: webdriver.Chrome) -> "WebElement": return self._find_by_xpath(driver, f"//*[@id='{self.value}_prompt']//textarea") class SeleniumTestCase(unittest.TestCase): def __init__(self, methodName: str = "runTest") -> None: super().__init__(methodName) self.driver = None self.gen_type = None def setUp(self) -> None: super().setUp() self.driver = webdriver.Chrome(driver_path) self.driver.get(webui_url) wait = WebDriverWait(self.driver, TIMEOUT) wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#controlnet"))) self.gen_type = GenType.txt2img def tearDown(self) -> None: self.driver.quit() super().tearDown() def select_gen_type(self, gen_type: GenType): gen_type.tab(self.driver).click() self.gen_type = gen_type def set_prompt(self, prompt: str): textarea = self.gen_type.prompt_textarea(self.driver) textarea.clear() textarea.send_keys(prompt) def expand_controlnet_panel(self): controlnet_panel = self.gen_type.controlnet_panel(self.driver) input_image_group = controlnet_panel.find_element( By.CSS_SELECTOR, ".cnet-input-image-group" ) if not input_image_group.is_displayed(): controlnet_panel.click() def enable_controlnet_unit(self): controlnet_panel = self.gen_type.controlnet_panel(self.driver) enable_checkbox = controlnet_panel.find_element( By.CSS_SELECTOR, ".cnet-unit-enabled input[type='checkbox']" ) if not enable_checkbox.is_selected(): enable_checkbox.click() def iterate_preprocessor_types(self, ignore_none: bool = True): dropdown = self.gen_type.controlnet_panel(self.driver).find_element( By.CSS_SELECTOR, f"#{self.gen_type.value}_controlnet_ControlNet-0_controlnet_preprocessor_dropdown", ) index = 0 while True: dropdown.click() options = dropdown.find_elements( By.XPATH, "//ul[contains(@class, 'options')]/li" ) input_element = dropdown.find_element(By.CSS_SELECTOR, "input") if index >= len(options): return option = options[index] index += 1 if "none" in option.text and ignore_none: continue option_text = option.text option.click() yield option_text def select_control_type(self, control_type: str): controlnet_panel = self.gen_type.controlnet_panel(self.driver) control_type_radio = controlnet_panel.find_element( By.CSS_SELECTOR, f'.controlnet_control_type input[value="{control_type}"]' ) control_type_radio.click() time.sleep(3) # Wait for gradio backend to update model/module def set_seed(self, seed: int): seed_input = self.driver.find_element( By.CSS_SELECTOR, f"#{self.gen_type.value}_seed input[type='number']" ) seed_input.clear() seed_input.send_keys(seed) def set_subseed(self, seed: int): show_button = self.driver.find_element( By.CSS_SELECTOR, f"#{self.gen_type.value}_subseed_show input[type='checkbox']", ) if not show_button.is_selected(): show_button.click() subseed_locator = ( By.CSS_SELECTOR, f"#{self.gen_type.value}_subseed input[type='number']", ) WebDriverWait(self.driver, TIMEOUT).until( EC.visibility_of_element_located(subseed_locator) ) subseed_input = self.driver.find_element(*subseed_locator) subseed_input.clear() subseed_input.send_keys(seed) def upload_controlnet_input(self, img_path: str): controlnet_panel = self.gen_type.controlnet_panel(self.driver) image_input = controlnet_panel.find_element( By.CSS_SELECTOR, '.cnet-input-image-group .cnet-image input[type="file"]' ) image_input.send_keys(img_path) def upload_img2img_input(self, img_path: str): image_input = self.driver.find_element( By.CSS_SELECTOR, '#img2img_image input[type="file"]' ) image_input.send_keys(img_path) def generate_image(self, name: str): self.gen_type.generate_button(self.driver).click() progress_bar_locator_visible = EC.visibility_of_element_located( (By.CSS_SELECTOR, f"#{self.gen_type.value}_results .progress") ) WebDriverWait(self.driver, TIMEOUT).until(progress_bar_locator_visible) WebDriverWait(self.driver, TIMEOUT * 10).until_not(progress_bar_locator_visible) generated_imgs = self.driver.find_elements( By.CSS_SELECTOR, f"#{self.gen_type.value}_results #{self.gen_type.value}_gallery img", ) for i, generated_img in enumerate(generated_imgs): # Use requests to get the image content img_content = requests.get(generated_img.get_attribute("src")).content # Save the image content to a file global overwrite_expectation dest_dir = ( test_expectation_dir if overwrite_expectation else test_result_dir ) img_file_name = f"{self.__class__.__name__}_{name}_{i}.png" with open( os.path.join(dest_dir, img_file_name), "wb", ) as img_file: img_file.write(img_content) if not overwrite_expectation: try: img1 = cv2.imread(os.path.join(test_expectation_dir, img_file_name)) img2 = cv2.imread(os.path.join(test_result_dir, img_file_name)) except Exception as e: self.assertTrue(False, f"Get exception reading imgs: {e}") continue self.expect_same_image( img1, img2, diff_img_path=os.path.join( test_result_dir, img_file_name.replace(".png", "_diff.png") ), ) def expect_same_image(self, img1, img2, diff_img_path: str): # Calculate the difference between the two images diff = cv2.absdiff(img1, img2) # Set a threshold to highlight the different pixels threshold = 30 diff_highlighted = np.where(diff > threshold, 255, 0).astype(np.uint8) # Assert that the two images are similar within a tolerance similar = np.allclose(img1, img2, rtol=0.5, atol=1) if not similar: # Save the diff_highlighted image to inspect the differences cv2.imwrite(diff_img_path, diff_highlighted) self.assertTrue(similar) simple_control_types = { "Canny": "canny", "Depth": "depth_midas", "Normal": "normal_bae", "OpenPose": "openpose_full", "MLSD": "mlsd", "Lineart": "lineart_standard (from white bg & black line)", "SoftEdge": "softedge_pidinet", "Scribble": "scribble_pidinet", "Seg": "seg_ofade20k", "Tile": "tile_resample", # Shuffle and Reference are not stable, and expected to fail. # The majority of pixels are same, but some outlier pixels can have big diff. "Shuffle": "shuffle", "Reference": "reference_only", }.keys() class SeleniumTxt2ImgTest(SeleniumTestCase): def setUp(self) -> None: super().setUp() self.select_gen_type(GenType.txt2img) self.set_seed(100) self.set_subseed(1000) def test_simple_control_types(self): """Test simple control types that only requires input image.""" for control_type in simple_control_types: with self.subTest(control_type=control_type): self.expand_controlnet_panel() self.select_control_type(control_type) self.upload_controlnet_input(SKI_IMAGE) self.generate_image(f"{control_type}_ski") class SeleniumImg2ImgTest(SeleniumTestCase): def setUp(self) -> None: super().setUp() self.select_gen_type(GenType.img2img) self.set_seed(100) self.set_subseed(1000) def test_simple_control_types(self): """Test simple control types that only requires input image.""" for control_type in simple_control_types: with self.subTest(control_type=control_type): self.expand_controlnet_panel() self.select_control_type(control_type) self.upload_img2img_input(SKI_IMAGE) self.upload_controlnet_input(SKI_IMAGE) self.generate_image(f"img2img_{control_type}_ski") class SeleniumInpaintTest(SeleniumTestCase): def setUp(self) -> None: super().setUp() def draw_inpaint_mask(self, target_canvas): size = target_canvas.size width = size["width"] height = size["height"] brush_radius = 5 repeat = int(width * 0.1 / brush_radius) trace: List[Tuple[int, int]] = [ (brush_radius, 0), (0, height * 0.2), (brush_radius, 0), (0, -height * 0.2), ] * repeat actions = ActionChains(self.driver) actions.move_to_element(target_canvas) # move to the canvas actions.move_by_offset(*trace[0]) actions.click_and_hold() # click and hold the left mouse button down for stop_point in trace[1:]: actions.move_by_offset(*stop_point) actions.release() # release the left mouse button actions.perform() # perform the action chain def draw_cn_mask(self): canvas = self.gen_type.controlnet_panel(self.driver).find_element( By.CSS_SELECTOR, ".cnet-input-image-group .cnet-image canvas" ) self.draw_inpaint_mask(canvas) def draw_a1111_mask(self): canvas = self.driver.find_element(By.CSS_SELECTOR, "#img2maskimg canvas") self.draw_inpaint_mask(canvas) def test_txt2img_inpaint(self): self.select_gen_type(GenType.txt2img) self.expand_controlnet_panel() self.select_control_type("Inpaint") self.upload_controlnet_input(SKI_IMAGE) self.draw_cn_mask() self.set_seed(100) self.set_subseed(1000) for option in self.iterate_preprocessor_types(): with self.subTest(option=option): self.generate_image(f"{option}_txt2img_ski") def test_img2img_inpaint(self): # Note: img2img inpaint can only use A1111 mask. # ControlNet input is disabled in img2img inpaint. self._test_img2img_inpaint(use_cn_mask=False, use_a1111_mask=True) def _test_img2img_inpaint(self, use_cn_mask: bool, use_a1111_mask: bool): self.select_gen_type(GenType.img2img) self.expand_controlnet_panel() self.select_control_type("Inpaint") self.upload_img2img_input(SKI_IMAGE) # Send to inpaint self.driver.find_element( By.XPATH, f"//*[@id='img2img_copy_to_img2img']//button[text()='inpaint']" ).click() time.sleep(3) # Select latent noise to make inpaint effect more visible. self.driver.find_element( By.XPATH, f"//input[@name='radio-img2img_inpainting_fill' and @value='latent noise']", ).click() self.set_prompt("(coca-cola:2.0)") self.enable_controlnet_unit() self.upload_controlnet_input(SKI_IMAGE) self.set_seed(100) self.set_subseed(1000) prefix = "" if use_cn_mask: self.draw_cn_mask() prefix += "controlnet" if use_a1111_mask: self.draw_a1111_mask() prefix += "A1111" for option in self.iterate_preprocessor_types(): with self.subTest(option=option, mask_prefix=prefix): self.generate_image(f"{option}_{prefix}_img2img_ski") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Your script description.") parser.add_argument( "--overwrite_expectation", action="store_true", help="overwrite expectation" ) parser.add_argument( "--target_url", type=str, default="http://localhost:7860", help="WebUI URL" ) args, unknown_args = parser.parse_known_args() overwrite_expectation = args.overwrite_expectation webui_url = args.target_url sys.argv = sys.argv[:1] + unknown_args unittest.main()