Repository: LYX0501/InstructNav
Branch: main
Commit: e58a5aac51cd
Files: 72
Total size: 565.1 KB

Directory structure:
gitextract_3pjq364l/
├── .gitignore
├── README.md
├── config_utils.py
├── constants.py
├── cv_utils/
│   ├── glee_detector.py
│   ├── image_percevior.py
│   └── object_list.py
├── llm_utils/
│   ├── gpt_request.py
│   └── nav_prompt.py
├── mapper.py
├── mapping_utils/
│   ├── geometry.py
│   ├── path_planning.py
│   ├── preprocess.py
│   ├── projection.py
│   └── transform.py
├── objnav_agent.py
├── objnav_benchmark.py
├── requirements.txt
└── thirdparty/
    └── GLEE/
        ├── configs/
        │   ├── R50.yaml
        │   └── SwinL.yaml
        └── glee/
            ├── __init__.py
            ├── backbone/
            │   ├── __init__.py
            │   ├── backbone.py
            │   ├── build.py
            │   ├── davit.py
            │   ├── eva01.py
            │   ├── eva02-dino.py
            │   ├── eva02.py
            │   ├── eva_01_utils.py
            │   ├── eva_02_utils.py
            │   ├── internimage.py
            │   ├── registry.py
            │   ├── resnet.py
            │   ├── swin.py
            │   ├── vit.py
            │   └── vit_utils.py
            ├── config.py
            ├── config_deeplab.py
            ├── models/
            │   ├── glee_model.py
            │   ├── pixel_decoder/
            │   │   ├── __init__.py
            │   │   ├── early_fusion.py
            │   │   ├── maskdino_encoder.py
            │   │   ├── ops/
            │   │   │   ├── functions/
            │   │   │   │   ├── __init__.py
            │   │   │   │   └── ms_deform_attn_func.py
            │   │   │   ├── make.sh
            │   │   │   ├── modules/
            │   │   │   │   ├── __init__.py
            │   │   │   │   └── ms_deform_attn.py
            │   │   │   ├── setup.py
            │   │   │   ├── src/
            │   │   │   │   ├── cpu/
            │   │   │   │   │   ├── ms_deform_attn_cpu.cpp
            │   │   │   │   │   └── ms_deform_attn_cpu.h
            │   │   │   │   ├── cuda/
            │   │   │   │   │   ├── ms_deform_attn_cuda.cu
            │   │   │   │   │   ├── ms_deform_attn_cuda.h
            │   │   │   │   │   └── ms_deform_im2col_cuda.cuh
            │   │   │   │   ├── ms_deform_attn.h
            │   │   │   │   └── vision.cpp
            │   │   │   └── test.py
            │   │   └── position_encoding.py
            │   ├── transformer_decoder/
            │   │   ├── __init__.py
            │   │   ├── dino_decoder.py
            │   │   └── maskdino_decoder.py
            │   └── vos_utils.py
            ├── modules/
            │   ├── __init__.py
            │   ├── attention.py
            │   ├── point_features.py
            │   ├── position_encoding.py
            │   └── postprocessing.py
            └── utils/
                ├── __init__.py
                ├── box_ops.py
                ├── config.py
                ├── it_contrastive.py
                ├── misc.py
                └── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
./tmp
*.pth
*.pyc
**/__pycache__

================================================
FILE: README.md
================================================
# InstructNav

Enabling robots to navigate following diverse language instructions in unexplored environments is an attractive goal for human-robot interaction. In this work, we propose InstructNav, a generic instruction navigation system. InstructNav makes the first endeavor to handle various instruction navigation tasks without any navigation training or pre-built maps. To reach this goal, we introduce **Dynamic Chain-of-Navigation (DCoN)** to unify the planning process for different types of navigation instructions. Furthermore, we propose **Multi-sourced Value Maps** to model key elements in instruction navigation so that linguistic DCoN planning can be converted into robot-actionable trajectories.

![InstructNav](https://github.com/LYX0501/InstructNav/blob/main/InstructNav.png)

With InstructNav, we complete the R2R-CE task in a zero-shot way for the first time and outperform many task-training methods. Besides, InstructNav surpasses the previous SOTA method by 10.48% on zero-shot Habitat ObjNav and by 86.34% on demand-driven navigation (DDN). Real-robot experiments in diverse indoor scenes further demonstrate our method's robustness to environment and instruction variations.
Please refer to our paper for more details: [InstructNav: Zero-shot System for Generic Instruction Navigation in Unexplored Environment](https://arxiv.org/abs/2406.04882).

## 🔥 News
- 2024.9.11: The HM3D ObjNav benchmark code is released.
- 2024.9.5: Our paper is accepted by CoRL 2024. Code will be released soon.

### Dependency ###
Our project is based on [habitat-sim](https://github.com/facebookresearch/habitat-sim?tab=readme-ov-file), [habitat-lab](https://github.com/facebookresearch/habitat-lab), and [Detectron2](https://github.com/facebookresearch/detectron2). Please follow their guides to install them in your Python environment. You can directly install the latest versions of habitat-lab and habitat-sim. Also make sure you have properly downloaded the navigation scenes [(HM3D, MP3D)](https://github.com/facebookresearch/habitat-lab/blob/main/DATASETS.md) and the episode datasets for both visual-language navigation (VLN-CE) and object navigation.

### Installation ###
First, clone our repo:
```
git clone https://github.com/LYX0501/InstructNav.git
cd InstructNav
pip install -r requirements.txt
```
Our method depends on the open-vocabulary detection and segmentation model [GLEE](https://github.com/FoundationVision/GLEE). Please check the original repo, or use the copy located in the ./thirdparty/ directory.

### Prepare your GPT4 and GPT4V API Keys ###
Please prepare your keys for calling the large-language-model and large vision-language-model APIs. We use GPT4 and GPT4V for inference, and our code follows the AzureOpenAI calling convention. Before running the benchmark, set your own API keys, endpoints, deployments, and versions. You can check ./llm_utils/gpt_request.py for usage details.
```
export GPT4_API_BASE=
export GPT4_API_KEY=
export GPT4_API_DEPLOY=
export GPT4_API_VERSION=
export GPT4V_API_BASE=
export GPT4V_API_KEY=
export GPT4V_API_DEPLOY=
export GPT4V_API_VERSION=
```

### Running our Benchmark Code ###
If everything goes well, you can directly run the evaluation code for different navigation tasks. For example,
```
python objnav_benchmark.py
```
All episode results and intermediate results, such as GPT4 input/output and value maps, will be saved in the /tmp/ directory. The real-time agent first-person-view RGB, depth, and segmentation images will be saved in the project root directory.
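For reference, the variables exported above feed the Azure clients roughly as follows (a minimal sketch mirroring ./llm_utils/gpt_request.py; the values themselves are placeholders you must supply):
```
import os
from openai import AzureOpenAI

# Assumes the GPT4_* variables exported above are set in the environment.
client = AzureOpenAI(
    api_key=os.environ["GPT4_API_KEY"],
    api_version=os.environ["GPT4_API_VERSION"],
    base_url=f"{os.environ['GPT4_API_BASE']}/openai/deployments/{os.environ['GPT4_API_DEPLOY']}",
)
response = client.chat.completions.create(
    model=os.environ["GPT4_API_DEPLOY"],
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=16,
)
print(response.choices[0].message.content)
```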
Examples are shown below:

![test](https://github.com/user-attachments/assets/51a65b07-70e2-49f3-a850-815b0ec151d0)

https://github.com/user-attachments/assets/04e37b91-c524-4c51-86d1-8fb72325f612

## BibTex
Please cite our paper if you find it helpful :)
```
@misc{InstructNav,
      title={InstructNav: Zero-shot System for Generic Instruction Navigation in Unexplored Environment},
      author={Yuxing Long and Wenzhe Cai and Hongcheng Wang and Guanqi Zhan and Hao Dong},
      year={2024},
      eprint={2406.04882},
      archivePrefix={arXiv},
      primaryClass={cs.RO},
}
```

================================================
FILE: config_utils.py
================================================
import habitat
from habitat.config.read_write import read_write
from habitat.config.default_structured_configs import (
    CollisionsMeasurementConfig,
    FogOfWarConfig,
    TopDownMapMeasurementConfig,
)

HM3D_CONFIG_PATH = "/habitat-lab/habitat-lab/habitat/config/benchmark/nav/objectnav/objectnav_hm3d.yaml"
MP3D_CONFIG_PATH = "/habitat-lab/habitat-lab/habitat/config/benchmark/nav/objectnav/objectnav_mp3d.yaml"
R2R_CONFIG_PATH = "/habitat-lab/habitat-lab/habitat/config/benchmark/nav/vln_r2r.yaml"

def hm3d_config(path:str=HM3D_CONFIG_PATH,stage:str='val',episodes=200):
    habitat_config = habitat.get_config(path)
    with read_write(habitat_config):
        habitat_config.habitat.dataset.split = stage
        habitat_config.habitat.dataset.scenes_dir = "/home/PJLAB/caiwenzhe/Desktop/dataset/scenes"
        habitat_config.habitat.dataset.data_path = "/home/PJLAB/caiwenzhe/Desktop/dataset/habitat_task/objectnav/hm3d/v2/{split}/{split}.json.gz"
        habitat_config.habitat.simulator.scene_dataset = "/home/PJLAB/caiwenzhe/Desktop/dataset/scenes/hm3d_v0.2/hm3d_annotated_basis.scene_dataset_config.json"
        habitat_config.habitat.environment.iterator_options.num_episode_sample = episodes
        habitat_config.habitat.task.measurements.update(
            {
                "top_down_map": TopDownMapMeasurementConfig(
                    map_padding=3,
                    map_resolution=1024,
                    draw_source=True,
                    draw_border=True,
                    draw_shortest_path=False,
                    draw_view_points=True,
                    draw_goal_positions=True,
                    draw_goal_aabbs=True,
                    fog_of_war=FogOfWarConfig(
                        draw=True,
                        visibility_dist=5.0,
                        fov=90,
                    ),
                ),
                "collisions": CollisionsMeasurementConfig(),
            })
        habitat_config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.max_depth = 5.0
        habitat_config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.normalize_depth = False
        habitat_config.habitat.task.measurements.success.success_distance = 0.25
    return habitat_config

def mp3d_config(path:str=MP3D_CONFIG_PATH,stage:str='val',episodes=200):
    habitat_config = habitat.get_config(path)
    with read_write(habitat_config):
        habitat_config.habitat.dataset.split = stage
        habitat_config.habitat.dataset.scenes_dir = "/home/PJLAB/caiwenzhe/Desktop/dataset/scenes"
        habitat_config.habitat.dataset.data_path = "/home/PJLAB/caiwenzhe/Desktop/dataset/habitat_task/objectnav/mp3d/v1/{split}/{split}.json.gz"
        habitat_config.habitat.simulator.scene_dataset = "/home/PJLAB/caiwenzhe/Desktop/dataset/scenes/mp3d/mp3d.scene_dataset_config.json"
        habitat_config.habitat.environment.iterator_options.num_episode_sample = episodes
        habitat_config.habitat.task.measurements.update(
            {
                "top_down_map": TopDownMapMeasurementConfig(
                    map_padding=3,
                    map_resolution=1024,
                    draw_source=True,
                    draw_border=True,
                    draw_shortest_path=False,
                    draw_view_points=True,
                    draw_goal_positions=True,
                    draw_goal_aabbs=True,
                    fog_of_war=FogOfWarConfig(
                        draw=True,
                        visibility_dist=5.0,
                        fov=79,
                    ),
                ),
                "collisions": CollisionsMeasurementConfig(),
            })
        habitat_config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.max_depth = 5.0
        habitat_config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.normalize_depth = False
        habitat_config.habitat.task.measurements.success.success_distance = 0.25
    return habitat_config

def r2r_config(path:str=R2R_CONFIG_PATH,stage:str='val_seen',episodes=200):
    habitat_config = habitat.get_config(path)
    with read_write(habitat_config):
        habitat_config.habitat.dataset.split = stage
        habitat_config.habitat.dataset.scenes_dir = "/home/PJLAB/caiwenzhe/Desktop/dataset/scenes"
        habitat_config.habitat.dataset.data_path = "/home/PJLAB/caiwenzhe/Desktop/dataset/habitat_task/vln/r2r/{split}/{split}.json.gz"
        habitat_config.habitat.simulator.scene_dataset = "/home/PJLAB/caiwenzhe/Desktop/dataset/scenes/mp3d/mp3d.scene_dataset_config.json"
        habitat_config.habitat.environment.iterator_options.num_episode_sample = episodes
        habitat_config.habitat.task.measurements.update(
            {
                "top_down_map": TopDownMapMeasurementConfig(
                    map_padding=3,
                    map_resolution=1024,
                    draw_source=True,
                    draw_border=True,
                    draw_shortest_path=False,
                    draw_view_points=True,
                    draw_goal_positions=True,
                    draw_goal_aabbs=True,
                    fog_of_war=FogOfWarConfig(
                        draw=True,
                        visibility_dist=5.0,
                        fov=79,
                    ),
                ),
                "collisions": CollisionsMeasurementConfig(),
            })
        habitat_config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.max_depth = 5.0
        habitat_config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.normalize_depth = False
        habitat_config.habitat.task.measurements.success.success_distance = 0.25
    return habitat_config

================================================
FILE: constants.py
================================================
from cv_utils.object_list import categories

GLEE_CONFIG_PATH = "./thirdparty/GLEE/configs/SwinL.yaml"
GLEE_CHECKPOINT_PATH = "./thirdparty/GLEE/GLEE_SwinL_Scaleup10m.pth"
DETECT_OBJECTS = [[cat['name'].lower()] for cat in categories]
INTEREST_OBJECTS = ['bed','chair','toilet','potted_plant','television_set','sofa']

================================================
FILE: cv_utils/glee_detector.py
================================================
from thirdparty.GLEE.glee.models.glee_model import GLEE_Model
from thirdparty.GLEE.glee.config_deeplab import add_deeplab_config
from thirdparty.GLEE.glee.config import add_glee_config
from habitat_sim.utils.common import d3_40_colors_rgb
from constants import *
from detectron2.config import get_cfg
from .object_list import categories as CATEGORIES
import torch
import torch.nn.functional as F
import torchvision
import cv2
import numpy as np

CATEGORIES = [cat['name'].lower() for cat in CATEGORIES]

def initialize_glee(glee_config=GLEE_CONFIG_PATH, glee_checkpoint=GLEE_CHECKPOINT_PATH, device="cuda:0"):
    cfg_swin = get_cfg()
    add_deeplab_config(cfg_swin)
    add_glee_config(cfg_swin)
    conf_files_swin = glee_config
    checkpoints_swin = torch.load(glee_checkpoint)
    cfg_swin.merge_from_file(conf_files_swin)
    GLEEmodel_swin = GLEE_Model(cfg_swin, None, device, None, True).to(device)
    GLEEmodel_swin.load_state_dict(checkpoints_swin, strict=False)
    GLEEmodel_swin.eval()
    return GLEEmodel_swin

# prompt_mode="categories",
# results_select=["box", "mask", "name", "score"],
def glee_segmentation(img, GLEEmodel, custom_category=CATEGORIES, num_inst_select=15, threshold_select=0.2, device="cuda:0"):
    pixel_mean = torch.Tensor([123.675, 116.28, 103.53]).to(device).view(3, 1, 1)
    pixel_std = torch.Tensor([58.395, 57.12, 57.375]).to(device).view(3, 1, 1)
    normalizer = lambda x: (x - pixel_mean) / pixel_std
    ori_image = torch.as_tensor(np.ascontiguousarray(img.transpose(2, 0, 1)))
    ori_image = normalizer(ori_image.to(device))[None,]
    _, _, ori_height, ori_width = ori_image.shape
    resizer = torchvision.transforms.Resize(800)
    resize_image = resizer(ori_image)
    image_size = torch.as_tensor((resize_image.shape[-2],resize_image.shape[-1]))
    re_size = resize_image.shape[-2:]
    stride = 32  # the last two dims are H,W, both subject to divisibility requirement
    padding_size = ((image_size + (stride - 1)).div(stride, rounding_mode="floor") * stride).tolist()
    infer_image = torch.zeros(1,3,padding_size[0],padding_size[1]).to(resize_image)
    infer_image[0,:,:image_size[0],:image_size[1]] = resize_image
    batch_category_name = custom_category
    prompt_list = []
    with torch.no_grad():
        (outputs,_) = GLEEmodel(infer_image, prompt_list, task="coco", batch_name_list=batch_category_name, is_train=False)
    topK_instance = max(num_inst_select,1)
    bbox_pred = outputs['pred_boxes'][0]
    bbox_pred[:,0],bbox_pred[:,2] = bbox_pred[:,0]*img.shape[1] - bbox_pred[:,2]*img.shape[1]*0.5, bbox_pred[:,0]*img.shape[1] + bbox_pred[:,2]*img.shape[1]*0.5
    bbox_pred[:,1],bbox_pred[:,3] = bbox_pred[:,1]*img.shape[0] - bbox_pred[:,3]*img.shape[0]*0.5, bbox_pred[:,1]*img.shape[0] + bbox_pred[:,3]*img.shape[0]*0.5
    mask_pred = outputs['pred_masks'][0]
    mask_cls = outputs['pred_logits'][0]
    scores = mask_cls.sigmoid().max(-1)[0]
    scores_per_image, topk_indices = scores.topk(topK_instance, sorted=True)
    valid = scores_per_image > threshold_select
    topk_indices = topk_indices[valid]
    scores_per_image = scores_per_image[valid]
    pred_class = mask_cls[topk_indices].max(-1)[1].tolist()
    if len(pred_class) == 0:
        return [], [], [], []
    mask_pred = mask_pred[topk_indices]
    bbox_pred = bbox_pred[topk_indices].cpu().numpy()
    pred_masks = F.interpolate(mask_pred[None,], size=(padding_size[0], padding_size[1]), mode="bilinear", align_corners=False)
    pred_masks = pred_masks[:,:,:re_size[0],:re_size[1]]
    pred_masks = F.interpolate(pred_masks, size=(ori_height,ori_width), mode="bilinear", align_corners=False)
    pred_masks = (pred_masks > 0).detach().cpu().numpy()[0]
    return bbox_pred, pred_masks, np.array(batch_category_name)[pred_class], scores_per_image

def visualize_segmentation(image,classes,masks):
    copy_image = image.copy()
    label_classes = np.unique(classes)
    for cls,mask in zip(classes,masks):
        if len(np.unique(mask)) != 2:
            continue
        copy_image[np.where(mask == 1)] = d3_40_colors_rgb[label_classes.tolist().index(cls)]
        x, y = int(np.mean(np.where(mask)[1])), int(np.mean(np.where(mask)[0]))
        cv2.putText(copy_image, str(cls), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 2)
    ret_image = cv2.addWeighted(image,0.2,copy_image,0.8,0)
    return ret_image

def visualize_detection(image,classes,bboxes):
    copy_image = image.copy()
    label_classes = np.unique(classes)
    for cls,bbox in zip(classes,bboxes):
        color = d3_40_colors_rgb[label_classes.tolist().index(cls)%40]
        copy_image = cv2.rectangle(copy_image,(int(bbox[0]),int(bbox[1])),(int(bbox[2]),int(bbox[3])),color.tolist(),2)
        x, y = int(bbox[0]*0.5+bbox[2]*0.5), int(bbox[1]*0.5+bbox[3]*0.5)
        cv2.putText(copy_image, str(cls), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 2)
    return copy_image

================================================
FILE: cv_utils/image_percevior.py
================================================
from constants import *
from .glee_detector import *

class GLEE_Percevior:
    def __init__(self, glee_config=GLEE_CONFIG_PATH, glee_checkpoint=GLEE_CHECKPOINT_PATH, device="cuda:0"):
        self.device = device
        self.glee_model = initialize_glee(glee_config,glee_checkpoint,device)

    def perceive(self,image,confidence_threshold=0.25,area_threshold=2500):
        pred_bboxes, pred_masks, pred_class, pred_confidence = glee_segmentation(image,self.glee_model,threshold_select=confidence_threshold,device=self.device)
        try:
            mask_area = np.array([mask.sum() for mask in pred_masks])
            bbox_trust = np.array([(bbox[0] > 20) & (bbox[2] < image.shape[1] - 20) for bbox in pred_bboxes])
            visualization = visualize_segmentation(image,pred_class[(mask_area>area_threshold) & bbox_trust],pred_masks[(mask_area>area_threshold) & bbox_trust])
            return pred_class[(mask_area>area_threshold) & bbox_trust],pred_masks[(mask_area>area_threshold) & bbox_trust],pred_confidence[(mask_area>area_threshold) & bbox_trust],[visualization]
        except:
            return [],[],[],[image]

================================================
FILE: cv_utils/object_list.py
================================================
categories = [
    {"id": 1, "name": "bed"}, {"id": 2, "name": "sofa"}, {"id": 3, "name": "chair"}, {"id": 4, "name": "television_set"}, {"id": 5, "name": "potted_plant"},
    {"id": 6, "name": "toilet"}, {"id": 7, "name": "lamp"}, {"id": 8, "name": "desk"}, {"id": 9, "name": "bookshelf"}, {"id": 10, "name": "cupboard"},
    {"id": 11, "name": "drawer"}, {"id": 12, "name": "refrigerator"}, {"id": 13, "name": "oven"}, {"id": 14, "name": "microwave"}, {"id": 15, "name": "toaster"},
    {"id": 16, "name": "sink"}, {"id": 17, "name": "dishwasher"}, {"id": 18, "name": "coffee_machine"}, {"id": 19, "name": "kettle"}, {"id": 20, "name": "stove"},
    {"id": 21, "name": "washing_machine"}, {"id": 22, "name": "dryer"}, {"id": 23, "name": "mirror"}, {"id": 24, "name": "clock"}, {"id": 25, "name": "curtains"},
    {"id": 26, "name": "blinds"}, {"id": 27, "name": "bathtub"}, {"id": 28, "name": "shower"}, {"id": 29, "name": "table"}, {"id": 30, "name": "towel"},
    {"id": 31, "name": "soap_dispenser"}, {"id": 32, "name": "toothbrush"}, {"id": 33, "name": "toothpaste"}, {"id": 34, "name": "shampoo"}, {"id": 35, "name": "conditioner"},
    {"id": 36, "name": "hair_dryer"}, {"id": 37, "name": "razor"}, {"id": 38, "name": "makeup"}, {"id": 39, "name": "tissue_box"}, {"id": 40, "name": "trash_can"},
    {"id": 41, "name": "vacuum_cleaner"}, {"id": 42, "name": "mop"}, {"id": 43, "name": "broom"}, {"id": 44, "name": "bucket"}, {"id": 45, "name": "sponge"},
    {"id": 46, "name": "detergent"}, {"id": 47, "name": "iron"}, {"id": 48, "name": "ironing_board"}, {"id": 49, "name": "laundry_basket"}, {"id": 50, "name": "clothes_hanger"},
    {"id": 51, "name": "coat_rack"}, {"id": 52, "name": "shoe_rack"}, {"id": 53, "name": "umbrella"}, {"id": 54, "name": "fire_extinguisher"}, {"id": 55, "name": "first_aid_kit"},
    {"id": 56, "name": "thermometer"}, {"id": 57, "name": "scale"}, {"id": 58, "name": "fan"}, {"id": 59, "name": "heater"}, {"id": 60, "name": "air_conditioner"},
    {"id": 61, "name": "humidifier"}, {"id": 62, "name": "dehumidifier"}, {"id": 63, "name": "light_switch"}, {"id": 64, "name": "electrical_outlet"}, {"id": 65, "name": "extension_cord"},
    {"id": 66, "name": "remote_control"}, {"id": 67, "name": "game_console"}, {"id": 68, "name": "router"}, {"id": 69, "name": "modem"}, {"id": 70, "name": "computer"},
    {"id": 71, "name": "laptop"}, {"id": 72, "name": "printer"}, {"id": 73, "name": "scanner"}, {"id": 74, "name": "fax_machine"}, {"id": 75, "name": "telephone"},
    {"id": 76, "name": "smartphone"}, {"id": 77, "name": "tablet"}, {"id": 78, "name": "keyboard"}, {"id": 79, "name": "mouse"}, {"id": 80, "name": "monitor"},
{"id": 81, "name": "notebook"}, {"id": 82, "name": "pen"}, {"id": 83, "name": "pencil"}, {"id": 84, "name": "eraser"}, {"id": 85, "name": "stapler"}, {"id": 86, "name": "scissors"}, {"id": 87, "name": "tape_dispenser"}, {"id": 88, "name": "paper_clip"}, {"id": 89, "name": "envelope"}, {"id": 90, "name": "letter_opener"}, {"id": 91, "name": "cabinet"}, {"id": 92, "name": "whiteboard"}, {"id": 93, "name": "calendar"}, {"id": 94, "name": "photo_frame"}, {"id": 95, "name": "vase"}, {"id": 96, "name": "candle"}, {"id": 97, "name": "incense"}, {"id": 98, "name": "book"}, {"id": 99, "name": "magazine"}, {"id": 100, "name": "newspaper"}, {"id": 101, "name": "album"}, {"id": 102, "name": "record_player"}, {"id": 103, "name": "cd_player"}, {"id": 104, "name": "dvd_player"}, {"id": 105, "name": "blu_ray_player"}, {"id": 106, "name": "speaker"}, {"id": 107, "name": "headphones"}, {"id": 108, "name": "microphone"}, {"id": 109, "name": "camera"}, {"id": 110, "name": "camcorder"}, {"id": 111, "name": "tripod"}, {"id": 112, "name": "flashlight"}, {"id": 113, "name": "batteries"}, {"id": 114, "name": "charger"}, {"id": 115, "name": "cable"}, {"id": 116, "name": "usb_drive"}, {"id": 117, "name": "hard_drive"}, {"id": 118, "name": "router"}, {"id": 119, "name": "switch"}, {"id": 120, "name": "firewall"}, {"id": 121, "name": "server"}, {"id": 122, "name": "keyboard_tray"}, {"id": 123, "name": "mouse_pad"}, {"id": 124, "name": "speaker_stand"}, {"id": 125, "name": "monitor_stand"}, {"id": 126, "name": "file_folder"}, {"id": 127, "name": "binder"}, {"id": 128, "name": "clipboard"}, {"id": 129, "name": "calculator"}, {"id": 130, "name": "label_maker"}, {"id": 131, "name": "hole_punch"}, {"id": 132, "name": "paper_shredder"}, {"id": 133, "name": "post_it_note"}, {"id": 134, "name": "thumbtack"}, {"id": 135, "name": "magnet"}, {"id": 136, "name": "ruler"}, {"id": 137, "name": "protractor"}, {"id": 138, "name": "compass"}, {"id": 139, "name": "glue"}, {"id": 140, "name": "white_out"}, {"id": 141, "name": "marker"}, {"id": 142, "name": "highlighter"}, {"id": 143, "name": "crayon"}, {"id": 144, "name": "paint"}, {"id": 145, "name": "paintbrush"}, {"id": 146, "name": "easel"}, {"id": 147, "name": "canvas"}, {"id": 148, "name": "palette"}, {"id": 149, "name": "sculpting_tools"}, {"id": 150, "name": "clay"}, {"id": 151, "name": "sewing_machine"}, {"id": 152, "name": "thread"}, {"id": 153, "name": "needle"}, {"id": 154, "name": "scissors"}, {"id": 155, "name": "fabric"}, {"id": 156, "name": "measuring_tape"}, {"id": 157, "name": "pin_cushion"}, {"id": 158, "name": "thimble"}, {"id": 159, "name": "seam_ripper"}, {"id": 160, "name": "iron"}, {"id": 161, "name": "pattern"}, {"id": 162, "name": "ribbon"}, {"id": 163, "name": "button"}, {"id": 164, "name": "zipper"}, {"id": 165, "name": "hook"}, {"id": 166, "name": "stairs"}, {"id": 167, "name": "snap"}, {"id": 168, "name": "velcro"}, {"id": 169, "name": "elastic"}, {"id": 170, "name": "lace"}, {"id": 171, "name": "trim"}, {"id": 172, "name": "bead"}, {"id": 173, "name": "sequin"}, {"id": 174, "name": "glue_gun"}, {"id": 175, "name": "glue_stick"}, {"id": 176, "name": "craft_knife"}, {"id": 177, "name": "cutting_mat"}, {"id": 178, "name": "ruler"}, {"id": 179, "name": "scalpel"}, {"id": 180, "name": "tweezers"}, {"id": 181, "name": "pliers"}, {"id": 182, "name": "hammer"}, {"id": 183, "name": "screwdriver"}, {"id": 184, "name": "wrench"}, {"id": 185, "name": "drill"}, {"id": 186, "name": "saw"}, {"id": 187, "name": "chisel"}, {"id": 188, "name": "level"}, {"id": 189, "name": 
"tape_measure"}, {"id": 190, "name": "toolbox"}, {"id": 191, "name": "nail"}, {"id": 192, "name": "screw"}, {"id": 193, "name": "bolt"}, {"id": 194, "name": "nut"}, {"id": 195, "name": "washer"}, {"id": 196, "name": "sandpaper"}, {"id": 197, "name": "wood_glue"}, {"id": 198, "name": "clamp"}, {"id": 199, "name": "vise"}, {"id": 200, "name": "workbench"} ] ================================================ FILE: llm_utils/gpt_request.py ================================================ import os from openai import AzureOpenAI,OpenAI import requests import base64 import cv2 import numpy as np from mimetypes import guess_type gpt4_api_base = os.environ['GPT4_API_BASE'] gpt4_api_key = os.environ['GPT4_API_KEY'] gpt4v_api_base = os.environ['GPT4V_API_BASE'] gpt4v_api_key = os.environ['GPT4V_API_KEY'] deployment_name = os.environ['GPT4_API_DEPLOY'] api_version = os.environ['GPT4_API_VERSION'] gpt4_client = AzureOpenAI( api_key=gpt4_api_key, api_version=api_version, base_url=f"{gpt4_api_base}/openai/deployments/{deployment_name}" ) deployment_name = os.environ['GPT4V_API_DEPLOY'] api_version = os.environ['GPT4V_API_VERSION'] gpt4v_client = AzureOpenAI( api_key=gpt4v_api_key, api_version=api_version, base_url=f"{gpt4v_api_base}/openai/deployments/{deployment_name}") def local_image_to_data_url(image): if isinstance(image,str): mime_type, _ = guess_type(image) with open(image, "rb") as image_file: base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8') return f"data:{mime_type};base64,{base64_encoded_data}" elif isinstance(image,np.ndarray): base64_encoded_data = base64.b64encode(cv2.imencode('.jpg',image)[1]).decode('utf-8') return f"data:image/jpeg;base64,{base64_encoded_data}" def gptv_response(text_prompt,image_prompt,system_prompt=""): prompt = [{'role':'system','content':system_prompt}, {'role':'user','content':[{'type':'text','text':text_prompt}, {'type':'image_url','image_url':{'url':local_image_to_data_url(image_prompt)}}]}] response = gpt4v_client.chat.completions.create(model=deployment_name, messages=prompt, max_tokens=1000) return response.choices[0].message.content def gpt_response(text_prompt,system_prompt=""): prompt = [{'role':'system','content':system_prompt}, {'role':'user','content':[{'type':'text','text':text_prompt}]}] response = gpt4_client.chat.completions.create(model=deployment_name, messages=prompt, max_tokens=1000) return response.choices[0].message.content ================================================ FILE: llm_utils/nav_prompt.py ================================================ CHAINON_PROMPT = "You are a wheeled mobile robot working in an indoor environment.\ And you are required to finish a navigation task indicated by a human instruction in a new house.\ Your task is to make a navigation plan for finishing the task as soon as possible.\ The navigation plan should be formulated as a chain as { - - - ...}.\ To make the plan, I will provide you the following elements:\ (1) : The navigation instruction given by the human.\ (2) : The completed steps in the plan recording your history trajectory.\ (3) : The list recording all the observed rooms and objects in this house from your perspective.\ The allowed in the plan contains ['Explore','Approach','Move Forward','Turn Left','Turn Right','Turn Around'].\ The action 'Explore' will lead you to the exploration frontiers to help unlock new areas.\ The action 'Approach' will lead you close to a specific object or room for more detailed observations.\ The allowed should be one appeared semantic instance in the 
Do not output an imagined instance as <Landmark> which has not been observed in <Observed Objects> or mentioned in the <Instruction>.\
To select the landmark, you should consider the common house layout, the human's habit of object placement, and the navigation instruction of the task.\
For example, the sofa is often close to a television; therefore, sofa is a good landmark for finding the television and satisfying the human entertainment demand.\
If the action and landmark are clearly specified in the instruction, like 'walk forward to the television', then you can directly decompose the instruction into 'Move_Forward' - 'Television' without the 'Explore' action.\
You only need to plan one <Action> and one <Landmark> ahead; besides, you should output a flag to indicate whether you have finished the navigation task.\
Therefore, your output answer should be formatted as Answer={'Reason':<Reason>, 'Action':<Action>, 'Landmark':<Landmark>, 'Flag':<Flag>}.\
If you find a specific instance of the target object or a synonymous object, the output 'Flag' should be True.\
Try to select the <Landmark> that is closely related to the <Instruction> according to the human habit.\
Try not to repeatedly select the same <Landmark> as the <Completed Plan>."

GPT4V_PROMPT = f"You are an indoor navigation agent. I give you a panoramic observation image, the complete navigation instruction and the sub-instruction you should execute now. \
Direction 1 and 11 are ahead, Direction 5 and 7 are back, Direction 3 is to the right, and Direction 9 is to the left. Please carefully analyze the visual information in each direction \
and judge which direction is most suitable for the next movement according to the act and landmark mentioned in the sub-instruction. \
Your answer should follow \"Thinking Process\" and \"Judgement\". In the \"Judgement: \" field, you should only write down the direction ID you choose. \
If you think you have arrived at the destination, you can answer \"Stop\" in the \"Judgement: \" field. Note that \"Direction 5\" and \"Direction 7\" are the directions you just came from. \
Generally, the direction with more navigation landmarks from the complete navigation instruction is better."
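The DCoN planner consumes replies in the Answer={'Reason':..., 'Action':..., 'Landmark':..., 'Flag':...} format defined above. A minimal sketch of how such a reply could be parsed (parse_dcon_answer is a hypothetical helper, not part of this repo; objnav_agent.py below imports ast, which supports this style of literal parsing):
```
import ast

def parse_dcon_answer(reply: str) -> dict:
    """Extract the Answer={...} dict from a raw LLM reply (hypothetical helper)."""
    start = reply.find("{")           # first brace after "Answer="
    end = reply.rfind("}") + 1        # last closing brace
    answer = ast.literal_eval(reply[start:end])  # safe literal parsing, no eval()
    assert {'Reason', 'Action', 'Landmark', 'Flag'} <= set(answer)
    return answer

# Example:
# parse_dcon_answer("Answer={'Reason':'TV is near sofa','Action':'Approach','Landmark':'sofa','Flag':False}")
# -> {'Reason': 'TV is near sofa', 'Action': 'Approach', 'Landmark': 'sofa', 'Flag': False}
```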
================================================
FILE: mapper.py
================================================
from mapping_utils.geometry import *
from mapping_utils.preprocess import *
from mapping_utils.projection import *
from mapping_utils.transform import *
from mapping_utils.path_planning import *
from cv_utils.image_percevior import GLEE_Percevior
from matplotlib import colormaps
from habitat_sim.utils.common import d3_40_colors_rgb
from constants import *
import open3d as o3d
from lavis.models import load_model_and_preprocess
from PIL import Image

class Instruct_Mapper:
    def __init__(self,
                 camera_intrinsic,
                 pcd_resolution=0.05,
                 grid_resolution=0.1,
                 grid_size=5,
                 floor_height=-0.8,
                 ceiling_height=0.8,
                 translation_func=habitat_translation,
                 rotation_func=habitat_rotation,
                 rotate_axis=[0,1,0],
                 device='cuda:0'):
        self.device = device
        self.camera_intrinsic = camera_intrinsic
        self.pcd_resolution = pcd_resolution
        self.grid_resolution = grid_resolution
        self.grid_size = grid_size
        self.floor_height = floor_height
        self.ceiling_height = ceiling_height
        self.translation_func = translation_func
        self.rotation_func = rotation_func
        self.rotate_axis = np.array(rotate_axis)
        self.object_percevior = GLEE_Percevior(device=device)
        self.pcd_device = o3d.core.Device(device.upper())

    def reset(self,position,rotation):
        self.update_iterations = 0
        self.initial_position = self.translation_func(position)
        self.current_position = self.translation_func(position) - self.initial_position
        self.current_rotation = self.rotation_func(rotation)
        self.scene_pcd = o3d.t.geometry.PointCloud(self.pcd_device)
        self.navigable_pcd = o3d.t.geometry.PointCloud(self.pcd_device)
        self.object_pcd = o3d.t.geometry.PointCloud(self.pcd_device)
        self.object_entities = []
        self.trajectory_position = []

    def update(self,rgb,depth,position,rotation):
        self.current_position = self.translation_func(position) - self.initial_position
        self.current_rotation = self.rotation_func(rotation)
        self.current_depth = preprocess_depth(depth)
        self.current_rgb = preprocess_image(rgb)
        self.trajectory_position.append(self.current_position)
        # to avoid there is no valid depth value (especially in real-world)
        if np.sum(self.current_depth) > 0:
            camera_points,camera_colors = get_pointcloud_from_depth(self.current_rgb,self.current_depth,self.camera_intrinsic)
            world_points = translate_to_world(camera_points,self.current_position,self.current_rotation)
            self.current_pcd = gpu_pointcloud_from_array(world_points,camera_colors,self.pcd_device).voxel_down_sample(self.pcd_resolution)
        else:
            return
        # semantic masking and project object mask to pointcloud
        classes,masks,confidences,visualization = self.object_percevior.perceive(self.current_rgb)
        self.segmentation = visualization[0]
        current_object_entities = self.get_object_entities(self.current_depth,classes,masks,confidences)
        self.object_entities = self.associate_object_entities(self.object_entities,current_object_entities)
        self.object_pcd = self.update_object_pcd()
        # pointcloud update
        self.scene_pcd = gpu_merge_pointcloud(self.current_pcd,self.scene_pcd).voxel_down_sample(self.pcd_resolution)
        self.scene_pcd = self.scene_pcd.select_by_index((self.scene_pcd.point.positions[:,2]>self.floor_height-0.2).nonzero()[0])
        self.useful_pcd = self.scene_pcd.select_by_index((self.scene_pcd.point.positions[:,2]<self.ceiling_height).nonzero()[0])
        # NOTE: reconstructed; assumes navigable candidates are current-view points
        # lying inside a band around the floor height
        interpolate_points = self.current_pcd.point.positions.cpu().numpy()
        interpolate_points = interpolate_points[(interpolate_points[:,2] > self.floor_height-0.2) & (interpolate_points[:,2] < self.floor_height+0.2)]
        interpolate_colors = np.ones_like(interpolate_points) * 100
        try:
            current_navigable_pcd = gpu_pointcloud_from_array(interpolate_points,interpolate_colors,self.pcd_device).voxel_down_sample(self.grid_resolution)
            self.navigable_pcd = gpu_merge_pointcloud(self.navigable_pcd,current_navigable_pcd).voxel_down_sample(self.pcd_resolution)
        except:
            self.navigable_pcd = self.useful_pcd.select_by_index((self.useful_pcd.point.positions[:,2]<self.floor_height+0.1).nonzero()[0])
        # NOTE: reconstructed; obstacles are the useful points above the floor band
        self.obstacle_pcd = self.useful_pcd.select_by_index((self.useful_pcd.point.positions[:,2]>self.floor_height+0.1).nonzero()[0])
        self.trajectory_pcd = gpu_pointcloud_from_array(np.array(self.trajectory_position),np.zeros((len(self.trajectory_position),3)),self.pcd_device)
        self.frontier_pcd = project_frontier(self.obstacle_pcd,self.navigable_pcd,self.floor_height+0.2,self.grid_resolution)
        self.frontier_pcd[:,2] = self.navigable_pcd.point.positions.cpu().numpy()[:,2].mean()
        self.frontier_pcd = gpu_pointcloud_from_array(self.frontier_pcd,np.ones((self.frontier_pcd.shape[0],3))*np.array([[255,0,0]]),self.pcd_device)
        self.update_iterations += 1

    def update_object_pcd(self):
        object_pcd = o3d.geometry.PointCloud()
        for entity in self.object_entities:
            points = entity['pcd'].point.positions.cpu().numpy()
            colors = entity['pcd'].point.colors.cpu().numpy()
            new_pcd = o3d.geometry.PointCloud()
            new_pcd.points = o3d.utility.Vector3dVector(points)
            new_pcd.colors = o3d.utility.Vector3dVector(colors)
            object_pcd = object_pcd + new_pcd
        try:
            return gpu_pointcloud(object_pcd,self.pcd_device)
        except:
            return self.scene_pcd

    def get_view_pointcloud(self,rgb,depth,translation,rotation):
        current_position = self.translation_func(translation) - self.initial_position
        current_rotation = self.rotation_func(rotation)
        current_depth = preprocess_depth(depth)
        current_rgb = preprocess_image(rgb)
        camera_points,camera_colors = get_pointcloud_from_depth(current_rgb,current_depth,self.camera_intrinsic)
        world_points = translate_to_world(camera_points,current_position,current_rotation)
        current_pcd = gpu_pointcloud_from_array(world_points,camera_colors,self.pcd_device).voxel_down_sample(self.pcd_resolution)
        return current_pcd

    def get_object_entities(self,depth,classes,masks,confidences):
        entities = []
        exist_objects = np.unique([ent['class'] for ent in self.object_entities]).tolist()
        for cls,mask,score in zip(classes,masks,confidences):
            if depth[mask>0].min() < 1.0 and score < 0.5:
                continue
            if cls not in exist_objects:
                exist_objects.append(cls)
            camera_points = get_pointcloud_from_depth_mask(depth,mask,self.camera_intrinsic)
            world_points = translate_to_world(camera_points,self.current_position,self.current_rotation)
            point_colors = np.array([d3_40_colors_rgb[exist_objects.index(cls)%40]]*world_points.shape[0])
            if world_points.shape[0] < 10:
                continue
            object_pcd = gpu_pointcloud_from_array(world_points,point_colors,self.pcd_device).voxel_down_sample(self.pcd_resolution)
            object_pcd = gpu_cluster_filter(object_pcd)
            if object_pcd.point.positions.shape[0] < 10:
                continue
            entity = {'class':cls,'pcd':object_pcd,'confidence':score}
            entities.append(entity)
        return entities

    def associate_object_entities(self,ref_entities,eval_entities):
        for entity in eval_entities:
            if len(ref_entities) == 0:
                ref_entities.append(entity)
                continue
            overlap_score = []
            eval_pcd = entity['pcd']
            for ref_entity in ref_entities:
                if eval_pcd.point.positions.shape[0] == 0:
                    break
                cdist = pointcloud_distance(eval_pcd,ref_entity['pcd'])
                overlap_condition = (cdist < 0.1)
                nonoverlap_condition = overlap_condition.logical_not()
                eval_pcd = eval_pcd.select_by_index(o3d.core.Tensor(nonoverlap_condition.cpu().numpy(),device=self.pcd_device).nonzero()[0])
                overlap_score.append((overlap_condition.sum()/(overlap_condition.shape[0]+1e-6)).cpu().numpy())
            max_overlap_score = np.max(overlap_score)
            arg_overlap_index = np.argmax(overlap_score)
            if max_overlap_score < 0.25:
                entity['pcd'] = eval_pcd
                ref_entities.append(entity)
            else:
                argmax_entity = ref_entities[arg_overlap_index]
                argmax_entity['pcd'] = gpu_merge_pointcloud(argmax_entity['pcd'],eval_pcd)
                if argmax_entity['pcd'].point.positions.shape[0] < entity['pcd'].point.positions.shape[0] or entity['class'] in INTEREST_OBJECTS:
                    argmax_entity['class'] = entity['class']
                ref_entities[arg_overlap_index] = argmax_entity
        return ref_entities

    def get_obstacle_affordance(self):
        try:
            distance = pointcloud_distance(self.navigable_pcd,self.obstacle_pcd)
            affordance = (distance - distance.min())/(distance.max() - distance.min() + 1e-6)
            affordance[distance < 0.25] = 0
            return affordance.cpu().numpy()
        except:
            return np.zeros((self.navigable_pcd.point.positions.shape[0],),dtype=np.float32)

    def get_trajectory_affordance(self):
        try:
            distance = pointcloud_distance(self.navigable_pcd,self.trajectory_pcd)
            affordance = (distance - distance.min()) / (distance.max() - distance.min() + 1e-6)
            return affordance.cpu().numpy()
        except:
            return np.zeros((self.navigable_pcd.point.positions.shape[0],),dtype=np.float32)

    def get_semantic_affordance(self,target_class,threshold=0.1):
        semantic_pointcloud = o3d.t.geometry.PointCloud()
        for entity in self.object_entities:
            if entity['class'] in target_class:
                semantic_pointcloud = gpu_merge_pointcloud(semantic_pointcloud,entity['pcd'])
        try:
            distance = pointcloud_2d_distance(self.navigable_pcd,semantic_pointcloud)
            affordance = 1 - (distance - distance.min()) / (distance.max() - distance.min() + 1e-6)
            affordance[distance > threshold] = 0
            affordance = affordance.cpu().numpy()
            return affordance
        except:
            return np.zeros((self.navigable_pcd.point.positions.shape[0],),dtype=np.float32)

    def get_gpt4v_affordance(self,gpt4v_pcd):
        try:
            distance = pointcloud_distance(self.navigable_pcd,gpt4v_pcd)
            affordance = 1 - (distance - distance.min()) / (distance.max() - distance.min() + 1e-6)
            affordance[distance > 0.1] = 0
            return affordance.cpu().numpy()
        except:
            return np.zeros((self.navigable_pcd.point.positions.shape[0],),dtype=np.float32)

    def get_action_affordance(self,action):
        try:
            if action == 'Explore':
                distance = pointcloud_2d_distance(self.navigable_pcd,self.frontier_pcd)
                affordance = 1 - (distance - distance.min()) / (distance.max() - distance.min() + 1e-6)
                affordance[distance > 0.2] = 0
                return affordance.cpu().numpy()
            elif action == 'Move_Forward':
                pixel_x,pixel_z,depth_values = project_to_camera(self.navigable_pcd,self.camera_intrinsic,self.current_position,self.current_rotation)
                filter_condition = (pixel_x >= 0) & (pixel_x < self.camera_intrinsic[0][2]*2) & (pixel_z >= 0) & (pixel_z < self.camera_intrinsic[1][2]*2) & (depth_values > 1.5) & (depth_values < 2.5)
                filter_pcd = self.navigable_pcd.select_by_index(o3d.core.Tensor(np.where(filter_condition==1)[0],device=self.navigable_pcd.device))
                distance = pointcloud_distance(self.navigable_pcd,filter_pcd)
                affordance = 1 - (distance - distance.min()) / (distance.max() - distance.min() + 1e-6)
                affordance[distance > 0.1] = 0
                return affordance.cpu().numpy()
            elif action == 'Turn_Around':
                R = np.array([np.pi,np.pi,np.pi]) * self.rotate_axis
                turn_extrinsic = np.matmul(self.current_rotation,quaternion.as_rotation_matrix(quaternion.from_euler_angles(R)))
                pixel_x,pixel_z,depth_values = project_to_camera(self.navigable_pcd,self.camera_intrinsic,self.current_position,turn_extrinsic)
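                # All turn branches share one pattern: re-project the navigable points into a
                # camera rotated about rotate_axis, keep points landing in-frame 1.5-2.5 m ahead,
                # and reward navigable points near that footprint.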
                filter_condition = (pixel_x >= 0) & (pixel_x < self.camera_intrinsic[0][2]*2) & (pixel_z >= 0) & (pixel_z < self.camera_intrinsic[1][2]*2) & (depth_values > 1.5) & (depth_values < 2.5)
                filter_pcd = self.navigable_pcd.select_by_index(o3d.core.Tensor(np.where(filter_condition==1)[0],device=self.navigable_pcd.device))
                distance = pointcloud_distance(self.navigable_pcd,filter_pcd)
                affordance = 1 - (distance - distance.min()) / (distance.max() - distance.min() + 1e-6)
                affordance[distance > 0.1] = 0
                return affordance.cpu().numpy()
            elif action == 'Turn_Left':
                R = np.array([np.pi/2,np.pi/2,np.pi/2]) * self.rotate_axis
                turn_extrinsic = np.matmul(self.current_rotation,quaternion.as_rotation_matrix(quaternion.from_euler_angles(R)))
                pixel_x,pixel_z,depth_values = project_to_camera(self.navigable_pcd,self.camera_intrinsic,self.current_position,turn_extrinsic)
                filter_condition = (pixel_x >= 0) & (pixel_x < self.camera_intrinsic[0][2]*2) & (pixel_z >= 0) & (pixel_z < self.camera_intrinsic[1][2]*2) & (depth_values > 1.5) & (depth_values < 2.5)
                filter_pcd = self.navigable_pcd.select_by_index(o3d.core.Tensor(np.where(filter_condition==1)[0],device=self.navigable_pcd.device))
                distance = pointcloud_distance(self.navigable_pcd,filter_pcd)
                affordance = 1 - (distance - distance.min()) / (distance.max() - distance.min() + 1e-6)
                affordance[distance > 0.1] = 0
                return affordance.cpu().numpy()
            elif action == 'Turn_Right':
                R = np.array([-np.pi/2,-np.pi/2,-np.pi/2]) * self.rotate_axis
                turn_extrinsic = np.matmul(self.current_rotation,quaternion.as_rotation_matrix(quaternion.from_euler_angles(R)))
                pixel_x,pixel_z,depth_values = project_to_camera(self.navigable_pcd,self.camera_intrinsic,self.current_position,turn_extrinsic)
                filter_condition = (pixel_x >= 0) & (pixel_x < self.camera_intrinsic[0][2]*2) & (pixel_z >= 0) & (pixel_z < self.camera_intrinsic[1][2]*2) & (depth_values > 1.5) & (depth_values < 2.5)
                filter_pcd = self.navigable_pcd.select_by_index(o3d.core.Tensor(np.where(filter_condition==1)[0],device=self.navigable_pcd.device))
                distance = pointcloud_distance(self.navigable_pcd,filter_pcd)
                affordance = 1 - (distance - distance.min()) / (distance.max() - distance.min() + 1e-6)
                affordance[distance > 0.1] = 0
                return affordance.cpu().numpy()
            elif action == 'Enter':
                return self.get_semantic_affordance(['doorway','door','entrance','exit'])
            elif action == 'Exit':
                return self.get_semantic_affordance(['doorway','door','entrance','exit'])
            else:
                return np.zeros((self.navigable_pcd.point.positions.shape[0],),dtype=np.float32)
        except:
            return np.zeros((self.navigable_pcd.point.positions.shape[0],),dtype=np.float32)

    def get_objnav_affordance_map(self,action,target_class,gpt4v_pcd,complete_flag=False,failure_mode=False):
        if failure_mode:
            obstacle_affordance = self.get_obstacle_affordance()
            affordance = self.get_action_affordance('Explore')
            affordance = np.clip(affordance,0.1,1.0)
            affordance[obstacle_affordance == 0] = 0
            return affordance,self.visualize_affordance(affordance)
        elif complete_flag:
            affordance = self.get_semantic_affordance([target_class],threshold=0.1)
            return affordance,self.visualize_affordance(affordance)
        else:
            obstacle_affordance = self.get_obstacle_affordance()
            semantic_affordance = self.get_semantic_affordance([target_class],threshold=1.5)
            action_affordance = self.get_action_affordance(action)
            gpt4v_affordance = self.get_gpt4v_affordance(gpt4v_pcd)
            history_affordance = self.get_trajectory_affordance()
            affordance = 0.25*semantic_affordance + 0.25*action_affordance + 0.25*gpt4v_affordance + 0.25*history_affordance
            affordance = np.clip(affordance,0.1,1.0)
            affordance[obstacle_affordance == 0] = 0
            return affordance,self.visualize_affordance(affordance/(affordance.max()+1e-6))

    def get_debug_affordance_map(self,action,target_class,gpt4v_pcd):
        obstacle_affordance = self.get_obstacle_affordance()
        semantic_affordance = self.get_semantic_affordance([target_class],threshold=1.5)
        action_affordance = self.get_action_affordance(action)
        gpt4v_affordance = self.get_gpt4v_affordance(gpt4v_pcd)
        history_affordance = self.get_trajectory_affordance()
        return self.visualize_affordance(semantic_affordance/(semantic_affordance.max()+1e-6)),\
               self.visualize_affordance(history_affordance/(history_affordance.max()+1e-6)),\
               self.visualize_affordance(action_affordance/(action_affordance.max()+1e-6)),\
               self.visualize_affordance(gpt4v_affordance/(gpt4v_affordance.max()+1e-6)),\
               self.visualize_affordance(obstacle_affordance/(obstacle_affordance.max()+1e-6))

    def visualize_affordance(self,affordance):
        cmap = colormaps.get('jet')
        color_affordance = cmap(affordance)[:,0:3]
        color_affordance = cpu_pointcloud_from_array(self.navigable_pcd.point.positions.cpu().numpy(),color_affordance)
        return color_affordance

    def get_appeared_objects(self):
        return [entity['class'] for entity in self.object_entities]

    def save_pointcloud_debug(self,path="./"):
        save_pcd = o3d.geometry.PointCloud()
        try:
            assert self.useful_pcd.point.positions.shape[0] > 0
            save_pcd.points = o3d.utility.Vector3dVector(self.useful_pcd.point.positions.cpu().numpy())
            save_pcd.colors = o3d.utility.Vector3dVector(self.useful_pcd.point.colors.cpu().numpy())
            o3d.io.write_point_cloud(path + "scene.ply",save_pcd)
        except:
            pass
        try:
            assert self.navigable_pcd.point.positions.shape[0] > 0
            save_pcd.points = o3d.utility.Vector3dVector(self.navigable_pcd.point.positions.cpu().numpy())
            save_pcd.colors = o3d.utility.Vector3dVector(self.navigable_pcd.point.colors.cpu().numpy())
            o3d.io.write_point_cloud(path + "navigable.ply",save_pcd)
        except:
            pass
        try:
            assert self.obstacle_pcd.point.positions.shape[0] > 0
            save_pcd.points = o3d.utility.Vector3dVector(self.obstacle_pcd.point.positions.cpu().numpy())
            save_pcd.colors = o3d.utility.Vector3dVector(self.obstacle_pcd.point.colors.cpu().numpy())
            o3d.io.write_point_cloud(path + "obstacle.ply",save_pcd)
        except:
            pass
        object_pcd = o3d.geometry.PointCloud()
        for entity in self.object_entities:
            points = entity['pcd'].point.positions.cpu().numpy()
            colors = entity['pcd'].point.colors.cpu().numpy()
            new_pcd = o3d.geometry.PointCloud()
            new_pcd.points = o3d.utility.Vector3dVector(points)
            new_pcd.colors = o3d.utility.Vector3dVector(colors)
            object_pcd = object_pcd + new_pcd
        if len(object_pcd.points) > 0:
            o3d.io.write_point_cloud(path + "object.ply",object_pcd)

================================================
FILE: mapping_utils/geometry.py
================================================
import numpy as np
import open3d as o3d
import quaternion
import time
import torch
import cv2

def get_pointcloud_from_depth(rgb:np.ndarray,depth:np.ndarray,intrinsic:np.ndarray):
    if len(depth.shape) == 3:
        depth = depth[:,:,0]
    filter_z,filter_x = np.where(depth>0)
    depth_values = depth[filter_z,filter_x]
    pixel_z = (depth.shape[0] - 1 - filter_z - intrinsic[1][2]) * depth_values / intrinsic[1][1]
    pixel_x = (filter_x - intrinsic[0][2])*depth_values / intrinsic[0][0]
    pixel_y = depth_values
    color_values = rgb[filter_z,filter_x]
    point_values = np.stack([pixel_x,pixel_z,-pixel_y],axis=-1)
    return point_values,color_values
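# Back-projection convention used in this module: for pixel (u,v) with depth d and
# intrinsics fx,fy,cx,cz, x = (u - cx)*d/fx, z = (H - 1 - v - cz)*d/fy, y = d,
# and points are stacked as (x, z, -y), matching the OpenGL-style camera frame
# (x right, y up, -z forward) that Habitat uses.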
def get_pointcloud_from_depth_mask(depth:np.ndarray,mask:np.ndarray,intrinsic:np.ndarray):
    if len(depth.shape) == 3:
        depth = depth[:,:,0]
    if len(mask.shape) == 3:
        mask = mask[:,:,0]
    filter_z,filter_x = np.where((depth>0) & (mask>0))
    depth_values = depth[filter_z,filter_x]
    pixel_z = (depth.shape[0] - 1 - filter_z - intrinsic[1][2]) * depth_values / intrinsic[1][1]
    pixel_x = (filter_x - intrinsic[0][2])*depth_values / intrinsic[0][0]
    pixel_y = depth_values
    point_values = np.stack([pixel_x,pixel_z,-pixel_y],axis=-1)
    return point_values

def translate_to_world(pointcloud,position,rotation):
    extrinsic = np.eye(4)
    extrinsic[0:3,0:3] = rotation
    extrinsic[0:3,3] = position
    world_points = np.matmul(extrinsic,np.concatenate((pointcloud,np.ones((pointcloud.shape[0],1))),axis=-1).T).T
    return world_points[:,0:3]

def project_to_camera(pcd,intrinsic,position,rotation):
    extrinsic = np.eye(4)
    extrinsic[0:3,0:3] = rotation
    extrinsic[0:3,3] = position
    extrinsic = np.linalg.inv(extrinsic)
    try:
        camera_points = np.concatenate((pcd.point.positions.cpu().numpy(),np.ones((pcd.point.positions.shape[0],1))),axis=-1)
    except:
        camera_points = np.concatenate((pcd.points,np.ones((np.array(pcd.points).shape[0],1))),axis=-1)
    camera_points = np.matmul(extrinsic,camera_points.T).T[:,0:3]
    depth_values = -camera_points[:,2]
    filter_x = (camera_points[:,0] * intrinsic[0][0] / depth_values + intrinsic[0][2]).astype(np.int32)
    filter_z = (-camera_points[:,1] * intrinsic[1][1] / depth_values - intrinsic[1][2] + intrinsic[1][2]*2 - 1).astype(np.int32)
    return filter_x,filter_z,depth_values

def pointcloud_distance(pcdA,pcdB,device='cpu'):
    try:
        pointsA = torch.tensor(pcdA.point.positions.cpu().numpy(),device=device)
        pointsB = torch.tensor(pcdB.point.positions.cpu().numpy(),device=device)
    except:
        pointsA = torch.tensor(np.array(pcdA.points),device=device)
        pointsB = torch.tensor(np.array(pcdB.points),device=device)
    cdist = torch.cdist(pointsA,pointsB)
    min_distances1, _ = cdist.min(dim=1)
    return min_distances1

def pointcloud_2d_distance(pcdA,pcdB,device='cpu'):
    pointsA = torch.tensor(pcdA.point.positions.cpu().numpy(),device=device)
    pointsA[:,2] = 0
    pointsB = torch.tensor(pcdB.point.positions.cpu().numpy(),device=device)
    pointsB[:,2] = 0
    cdist = torch.cdist(pointsA,pointsB)
    min_distances1, _ = cdist.min(dim=1)
    return min_distances1

def cpu_pointcloud_from_array(points,colors):
    pointcloud = o3d.geometry.PointCloud()
    pointcloud.points = o3d.utility.Vector3dVector(points)
    pointcloud.colors = o3d.utility.Vector3dVector(colors)
    return pointcloud

def gpu_pointcloud_from_array(points,colors,device):
    pointcloud = o3d.t.geometry.PointCloud(device)
    pointcloud.point.positions = o3d.core.Tensor(points,dtype=o3d.core.Dtype.Float32,device=device)
    pointcloud.point.colors = o3d.core.Tensor(colors.astype(np.float32)/255.0,dtype=o3d.core.Dtype.Float32,device=device)
    return pointcloud

def gpu_pointcloud(pointcloud,device):
    new_pointcloud = o3d.t.geometry.PointCloud(device)
    new_pointcloud.point.positions = o3d.core.Tensor(np.asarray(pointcloud.points),device=device)
    new_pointcloud.point.colors = o3d.core.Tensor(np.asarray(pointcloud.colors),device=device)
    return new_pointcloud

def cpu_pointcloud(pointcloud):
    new_pointcloud = o3d.geometry.PointCloud()
    new_pointcloud.points = o3d.utility.Vector3dVector(pointcloud.point.positions.cpu().numpy())
    new_pointcloud.colors = o3d.utility.Vector3dVector(pointcloud.point.colors.cpu().numpy())
    return new_pointcloud

def cpu_merge_pointcloud(pcdA,pcdB):
    return pcdA + pcdB
def gpu_merge_pointcloud(pcdA,pcdB):
    if pcdA.is_empty():
        return pcdB
    if pcdB.is_empty():
        return pcdA
    return pcdA + pcdB

def gpu_cluster_filter(pointcloud,eps=0.3,min_points=20):
    labels = pointcloud.cluster_dbscan(eps=eps, min_points=min_points, print_progress=False)
    numpy_labels = labels.cpu().numpy()
    unique_labels = np.unique(numpy_labels)
    largest_cluster_label = max(unique_labels, key=lambda x: np.sum(numpy_labels == x))
    largest_cluster_pc = pointcloud.select_by_index((labels == largest_cluster_label).nonzero()[0])
    return largest_cluster_pc

def cpu_cluster_filter(pointcloud,eps=0.3,min_points=20):
    labels = pointcloud.cluster_dbscan(eps=eps, min_points=min_points, print_progress=False)
    unique_labels = np.unique(labels)
    largest_cluster_label = max(unique_labels, key=lambda x: np.sum(labels == x))
    largest_cluster_pc = pointcloud.select_by_index((labels == largest_cluster_label).nonzero()[0])
    return largest_cluster_pc

def quat2array(quat):
    return np.array([quat.w,quat.x,quat.y,quat.z],np.float32)

def quaternion_distance(quatA,quatB):
    # M*4, N*4
    dot = np.dot(quatA,quatB.T)
    dot[dot<0] = -dot[dot<0]
    angle = 2*np.arccos(dot)
    return angle/np.pi*180

def eculidean_distance(posA,posB):
    posA_reshaped = posA[:, np.newaxis, :]
    posB_reshaped = posB[np.newaxis, :, :]
    pairwise_distance = np.sqrt(np.sum((posA_reshaped - posB_reshaped)**2, axis=2))
    return pairwise_distance

================================================
FILE: mapping_utils/path_planning.py
================================================
import numpy as np
import cv2
from pathfinding.core.diagonal_movement import DiagonalMovement
from pathfinding.core.grid import Grid
from pathfinding.finder.a_star import AStarFinder
from .projection import *

def path_planning(costmap,start_index,goal_index):
    planmap = costmap.copy()
    planmap[planmap == 1] = 10
    grid = Grid(matrix=(planmap*100).astype(np.int32))
    finder = AStarFinder(diagonal_movement=DiagonalMovement.always)
    start_index[0][1] = np.clip(start_index[0][1],0,costmap.shape[1]-1)
    start_index[0][0] = np.clip(start_index[0][0],0,costmap.shape[0]-1)
    goal_index[0][1] = np.clip(goal_index[0][1],0,costmap.shape[1]-1)
    goal_index[0][0] = np.clip(goal_index[0][0],0,costmap.shape[0]-1)
    start = grid.node(start_index[0][1],start_index[0][0])
    goal = grid.node(goal_index[0][1],goal_index[0][0])
    path,_ = finder.find_path(start,goal,grid)
    return path

def visualize_path(costmap,path):
    visualize_costmap = costmap.copy()
    for waypoint in path:
        x = waypoint.y
        y = waypoint.x
        visualize_costmap[x,y] = 10
    visualize_costmap = cv2.resize(visualize_costmap,(0,0),fx=10,fy=10,interpolation=cv2.INTER_NEAREST)
    visualize_costmap = cv2.applyColorMap((255*visualize_costmap/10).astype(np.uint8),cv2.COLORMAP_JET)
    return visualize_costmap

================================================
FILE: mapping_utils/preprocess.py
================================================
import numpy as np

def preprocess_depth(depth:np.ndarray,lower_bound:float=0.1,upper_bound:float=4.9):
    # zero out depth readings outside the trusted sensing range
    depth[np.where((depth < lower_bound) | (depth > upper_bound))] = 0
    return depth

def preprocess_image(image:np.ndarray):
    return image

================================================
FILE: mapping_utils/projection.py
================================================
import numpy as np
import open3d as o3d
import cv2

# grid cell semantics:
# obstacle = 0
# unknown = 1
# position = 2
# navigable = 3
# frontier = 4
def project_frontier(obstacle_pcd,navigable_pcd,obstacle_height=-0.7,grid_resolution=0.25):
    np_obstacle_points = obstacle_pcd.point.positions.cpu().numpy()
    np_navigable_points = navigable_pcd.point.positions.cpu().numpy()
    np_all_points = np.concatenate((np_obstacle_points,np_navigable_points),axis=0)
    max_bound = np.max(np_all_points,axis=0)
    min_bound = np.min(np_all_points,axis=0)
    grid_dimensions = np.ceil((max_bound - min_bound) / grid_resolution).astype(int)
    grid_map = np.ones((grid_dimensions[0],grid_dimensions[1]),dtype=np.int32)
    # get navigable occupancy
    navigable_points = np_navigable_points
    navigable_indices = np.floor((navigable_points - min_bound) / grid_resolution).astype(int)
    navigable_indices[:,0] = np.clip(navigable_indices[:,0],0,grid_dimensions[0]-1)
    navigable_indices[:,1] = np.clip(navigable_indices[:,1],0,grid_dimensions[1]-1)
    navigable_indices[:,2] = np.clip(navigable_indices[:,2],0,grid_dimensions[2]-1)
    navigable_voxels = np.zeros(grid_dimensions,dtype=np.int32)
    navigable_voxels[navigable_indices[:,0],navigable_indices[:,1],navigable_indices[:,2]] = 1
    navigable_map = (navigable_voxels.sum(axis=2) > 0)
    grid_map[np.where(navigable_map>0)] = 3
    # get obstacle occupancy
    obstacle_points = np_obstacle_points
    obstacle_indices = np.floor((obstacle_points - min_bound) / grid_resolution).astype(int)
    obstacle_indices[:,0] = np.clip(obstacle_indices[:,0],0,grid_dimensions[0]-1)
    obstacle_indices[:,1] = np.clip(obstacle_indices[:,1],0,grid_dimensions[1]-1)
    obstacle_indices[:,2] = np.clip(obstacle_indices[:,2],0,grid_dimensions[2]-1)
    obstacle_voxels = np.zeros(grid_dimensions,dtype=np.int32)
    obstacle_voxels[obstacle_indices[:,0],obstacle_indices[:,1],obstacle_indices[:,2]] = 1
    obstacle_map = (obstacle_voxels.sum(axis=2) > 0)
    grid_map[np.where(obstacle_map>0)] = 0
    # get outer-border of navigable areas
    outer_border_navigable = ((grid_map == 3)*255).astype(np.uint8)
    contours,hierarchy = cv2.findContours(outer_border_navigable,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)
    outer_border_navigable = cv2.drawContours(np.zeros((grid_map.shape[0],grid_map.shape[1])),contours,-1,(255,255,255),1).astype(np.float32)
    obstacles = ((grid_map == 0)*255).astype(np.float32)
    obstacles = cv2.dilate(obstacles.astype(np.uint8),np.ones((3,3)))
    outer_border_navigable = ((outer_border_navigable - obstacles) > 0)
    grid_map_x,grid_map_y = np.where(outer_border_navigable>0)
    grid_indexes = np.stack((grid_map_x,grid_map_y,obstacle_height*np.ones((grid_map_x.shape[0],))),axis=1)
    frontier_points = grid_indexes * grid_resolution + min_bound
    return frontier_points

def translate_grid_to_point(pointcloud,grid_indexes,grid_resolution=0.25):
    np_all_points = pointcloud.point.positions.cpu().numpy()
    min_bound = np.min(np_all_points,axis=0)
    translate_points = grid_indexes * grid_resolution + min_bound
    return translate_points

def translate_point_to_grid(pointcloud,point_poses,grid_resolution=0.25):
    if len(point_poses.shape) == 1:
        point_poses = point_poses[np.newaxis,:]
    np_all_points = pointcloud.point.positions.cpu().numpy()
    min_bound = np.min(np_all_points,axis=0)
    grid_index = np.floor((point_poses - min_bound) / grid_resolution).astype(int)
    return grid_index[:,0:2]

def project_costmap(navigable_pcd,affordance_value,grid_resolution=0.25):
    navigable_points = navigable_pcd.point.positions.cpu().numpy()
    max_bound = np.max(navigable_points,axis=0)
    min_bound = np.min(navigable_points,axis=0)
    grid_dimensions = np.ceil((max_bound - min_bound) / grid_resolution).astype(int)
    navigable_voxels = np.zeros(grid_dimensions,dtype=np.float32)
    navigable_indices = np.floor((navigable_points - min_bound) / grid_resolution).astype(int)
    navigable_indices[:,0] = np.clip(navigable_indices[:,0],0,grid_dimensions[0]-1)
navigable_indices[:,1] = np.clip(navigable_indices[:,1],0,grid_dimensions[1]-1) navigable_indices[:,2] = np.clip(navigable_indices[:,2],0,grid_dimensions[2]-1) navigable_voxels[navigable_indices[:,0],navigable_indices[:,1],navigable_indices[:,2]] = affordance_value navigable_costmap = navigable_voxels.max(axis=2) navigable_costmap = 1 - navigable_costmap color_navigable_costmap = cv2.applyColorMap((navigable_costmap*255).astype(np.uint8),cv2.COLORMAP_JET) color_navigable_costmap = cv2.resize(color_navigable_costmap,(0,0),fx=5,fy=5,interpolation=cv2.INTER_NEAREST) return navigable_costmap,color_navigable_costmap ================================================ FILE: mapping_utils/transform.py ================================================ import numpy as np import quaternion def habitat_camera_intrinsic(config): assert config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.width == config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor.width, 'The configuration of the depth camera should be the same as rgb camera.' assert config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.height == config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor.height, 'The configuration of the depth camera should be the same as rgb camera.' assert config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.hfov == config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor.hfov, 'The configuration of the depth camera should be the same as rgb camera.' width = config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.width height = config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.height hfov = config.habitat.simulator.agents.main_agent.sim_sensors.depth_sensor.hfov xc = (width - 1.) / 2. zc = (height - 1.) / 2. f = (width / 2.) 
/ np.tan(np.deg2rad(hfov / 2.)) intrinsic_matrix = np.array([[f,0,xc], [0,f,zc], [0,0,1]],np.float32) return intrinsic_matrix def habitat_translation(position): return np.array([position[0],position[2],position[1]]) def habitat_rotation(rotation): rotation_matrix = quaternion.as_rotation_matrix(rotation) transform_matrix = np.array([[1,0,0], [0,0,1], [0,1,0]]) rotation_matrix = np.matmul(transform_matrix,rotation_matrix) return rotation_matrix ================================================ FILE: objnav_agent.py ================================================ import habitat import numpy as np import cv2 import ast import open3d as o3d from mapping_utils.geometry import * from mapping_utils.projection import * from mapping_utils.path_planning import * from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower from mapper import Instruct_Mapper from habitat.utils.visualizations.maps import colorize_draw_agent_and_fit_to_height from llm_utils.nav_prompt import CHAINON_PROMPT,GPT4V_PROMPT from llm_utils.gpt_request import gpt_response,gptv_response class HM3D_Objnav_Agent: def __init__(self,env:habitat.Env,mapper:Instruct_Mapper): self.env = env self.mapper = mapper self.episode_samples = 0 self.planner = ShortestPathFollower(env.sim,0.5,False) def translate_objnav(self,object_goal): if object_goal.lower() == 'plant': return "Find the <%s>."%"potted_plant" elif object_goal.lower() == "tv_monitor": return "Find the <%s>."%"television_set" else: return "Find the <%s>."%object_goal def reset_debug_probes(self): self.rgb_trajectory = [] self.depth_trajectory = [] self.topdown_trajectory = [] self.segmentation_trajectory = [] self.gpt_trajectory = [] self.gptv_trajectory = [] self.panoramic_trajectory = [] self.obstacle_affordance_trajectory = [] self.semantic_affordance_trajectory = [] self.history_affordance_trajectory = [] self.action_affordance_trajectory = [] self.gpt4v_affordance_trajectory = [] self.affordance_trajectory = [] def reset(self): self.episode_samples += 1 self.episode_steps = 0 self.obs = self.env.reset() self.mapper.reset(self.env.sim.get_agent_state().sensor_states['rgb'].position,self.env.sim.get_agent_state().sensor_states['rgb'].rotation) self.instruct_goal = self.translate_objnav(self.env.current_episode.object_category) self.trajectory_summary = "" self.reset_debug_probes() def rotate_panoramic(self,rotate_times = 12): self.temporary_pcd = [] self.temporary_images = [] for i in range(rotate_times): if self.env.episode_over: break self.update_trajectory() self.temporary_pcd.append(self.mapper.current_pcd) self.temporary_images.append(self.rgb_trajectory[-1]) self.obs = self.env.step(3) def concat_panoramic(self,images): try: height,width = images[0].shape[0],images[0].shape[1] except: height,width = 480,640 background_image = np.zeros((2*height + 3*10, 3*width + 4*10, 3),np.uint8) copy_images = np.array(images,dtype=np.uint8) for i in range(len(copy_images)): if i % 2 != 0: row = (i//6) col = ((i%6)//2) copy_images[i] = cv2.putText(copy_images[i],"Direction %d"%i,(100,100),cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 6, cv2.LINE_AA) background_image[10*(row+1)+row*height:10*(row+1)+row*height+height:,col*width + col * 10:col*width+col*10+width,:] = copy_images[i] return background_image def update_trajectory(self): self.episode_steps += 1 self.metrics = self.env.get_metrics() self.rgb_trajectory.append(cv2.cvtColor(self.obs['rgb'],cv2.COLOR_BGR2RGB)) self.depth_trajectory.append((self.obs['depth']/5.0 * 255.0).astype(np.uint8)) topdown_image = 
cv2.cvtColor(colorize_draw_agent_and_fit_to_height(self.metrics['top_down_map'],1024),cv2.COLOR_BGR2RGB)
        topdown_image = cv2.putText(topdown_image,'Success:%.2f,SPL:%.2f,SoftSPL:%.2f,DTS:%.2f'%(self.metrics['success'],self.metrics['spl'],self.metrics['soft_spl'],self.metrics['distance_to_goal']),(0,100),cv2.FONT_HERSHEY_SIMPLEX,2,(0,0,0),2,cv2.LINE_AA)
        self.topdown_trajectory.append(topdown_image)
        self.position = self.env.sim.get_agent_state().sensor_states['rgb'].position
        self.rotation = self.env.sim.get_agent_state().sensor_states['rgb'].rotation
        self.mapper.update(self.rgb_trajectory[-1],self.obs['depth'],self.position,self.rotation)
        self.segmentation_trajectory.append(self.mapper.segmentation)
        self.observed_objects = self.mapper.get_appeared_objects()
        # dump the latest first-person views for live monitoring
        cv2.imwrite("monitor-rgb.jpg",self.rgb_trajectory[-1])
        cv2.imwrite("monitor-depth.jpg",self.depth_trajectory[-1])
        cv2.imwrite("monitor-segmentation.jpg",self.segmentation_trajectory[-1])

    def save_trajectory(self,dir="./tmp_objnav/"):
        import imageio
        import os
        # exist_ok avoids crashing when the episode directory was already created
        os.makedirs(dir,exist_ok=True)
        self.mapper.save_pointcloud_debug(dir)
        fps_writer = imageio.get_writer(dir+"fps.mp4", fps=4)
        dps_writer = imageio.get_writer(dir+"depth.mp4", fps=4)
        seg_writer = imageio.get_writer(dir+"segmentation.mp4", fps=4)
        metric_writer = imageio.get_writer(dir+"metrics.mp4",fps=4)
        for i,img,dep,seg,met in zip(np.arange(len(self.rgb_trajectory)),self.rgb_trajectory,self.depth_trajectory,self.segmentation_trajectory,self.topdown_trajectory):
            fps_writer.append_data(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))
            dps_writer.append_data(dep)
            seg_writer.append_data(cv2.cvtColor(seg,cv2.COLOR_BGR2RGB))
            metric_writer.append_data(cv2.cvtColor(met,cv2.COLOR_BGR2RGB))
        for index,pano_img in enumerate(self.panoramic_trajectory):
            cv2.imwrite(dir+"%d-pano.jpg"%index,pano_img)
        with open(dir+"gpt4_history.txt",'w') as file:
            file.write("".join(self.gpt_trajectory))
        with open(dir+"gpt4v_history.txt",'w') as file:
            file.write("".join(self.gptv_trajectory))
        for i,afford,safford,hafford,cafford,gafford,oafford in zip(np.arange(len(self.affordance_trajectory)),self.affordance_trajectory,self.semantic_affordance_trajectory,self.history_affordance_trajectory,self.action_affordance_trajectory,self.gpt4v_affordance_trajectory,self.obstacle_affordance_trajectory):
            o3d.io.write_point_cloud(dir+"afford-%d-plan.ply"%i,afford)
            o3d.io.write_point_cloud(dir+"semantic-afford-%d-plan.ply"%i,safford)
            o3d.io.write_point_cloud(dir+"history-afford-%d-plan.ply"%i,hafford)
            o3d.io.write_point_cloud(dir+"action-afford-%d-plan.ply"%i,cafford)
            o3d.io.write_point_cloud(dir+"gpt4v-afford-%d-plan.ply"%i,gafford)
            o3d.io.write_point_cloud(dir+"obstacle-afford-%d-plan.ply"%i,oafford)
        fps_writer.close()
        dps_writer.close()
        seg_writer.close()
        metric_writer.close()

    def query_chainon(self):
        semantic_clue = {'observed object':self.observed_objects}
        query_content = ":{}, :{}, :{}".format(self.instruct_goal,"{" + self.trajectory_summary + "}",semantic_clue)
        self.gpt_trajectory.append("Input:\n%s \n"%query_content)
        # retry until GPT-4 returns a parsable DCoN dict with all required keys
        for i in range(10):
            try:
                raw_answer = gpt_response(query_content,CHAINON_PROMPT)
                print("GPT-4 Output Response: %s"%raw_answer)
                answer = raw_answer.replace(" ","")
                answer = answer[answer.index("{"):answer.index("}")+1]
                answer = ast.literal_eval(answer)
                if 'Action' in answer.keys() and 'Landmark' in answer.keys() and 'Flag' in answer.keys():
                    break
            except:
                continue
        self.gpt_trajectory.append("\nGPT-4 Answer:\n%s"%raw_answer)
        if self.trajectory_summary == "":
            self.trajectory_summary = self.trajectory_summary + str(answer['Action']) + '-' +
str(answer['Landmark']) else: self.trajectory_summary = self.trajectory_summary + '-' + str(answer['Action']) + '-' + str(answer['Landmark']) return answer def query_gpt4v(self): images = self.temporary_images inference_image = self.concat_panoramic(images) cv2.imwrite("monitor-panoramic.jpg",inference_image) text_content = ":{}\n :{}".format(self.instruct_goal,self.trajectory_summary.split("-")[-2] + "-" + self.trajectory_summary.split("-")[-1]) self.gptv_trajectory.append("\nInput:\n%s \n"%text_content) for i in range(10): try: raw_answer = gptv_response(text_content,inference_image,GPT4V_PROMPT) print("GPT-4V Output Response: %s"%raw_answer) answer = raw_answer[raw_answer.index("Judgement: Direction"):] answer = answer.replace(" ","") answer = int(answer.split("Direction")[-1]) break except: continue self.gptv_trajectory.append("GPT-4V Answer:\n%s"%raw_answer) self.panoramic_trajectory.append(inference_image) try: return answer except: return np.random.randint(0,12) def make_plan(self,rotate=True,failed=False): if rotate == True: self.rotate_panoramic() self.chainon_answer = self.query_chainon() self.gpt4v_answer = self.query_gpt4v() self.gpt4v_pcd = o3d.t.geometry.PointCloud(self.mapper.pcd_device) self.gpt4v_pcd = gpu_merge_pointcloud(self.gpt4v_pcd,self.temporary_pcd[self.gpt4v_answer]) self.found_goal = bool(self.chainon_answer['Flag']) self.affordance_pcd,self.colored_affordance_pcd = self.mapper.get_objnav_affordance_map(self.chainon_answer['Action'],self.chainon_answer['Landmark'],self.gpt4v_pcd,self.chainon_answer['Flag'],failure_mode=failed) self.semantic_afford,self.history_afford,self.action_afford,self.gpt4v_afford,self.obs_afford = self.mapper.get_debug_affordance_map(self.chainon_answer['Action'],self.chainon_answer['Landmark'],self.gpt4v_pcd) if self.affordance_pcd.max() == 0: self.affordance_pcd,self.colored_affordance_pcd = self.mapper.get_objnav_affordance_map(self.chainon_answer['Action'],self.chainon_answer['Landmark'],self.gpt4v_pcd,False,failure_mode=failed) self.found_goal = False self.affordance_map,self.colored_affordance_map = project_costmap(self.mapper.navigable_pcd,self.affordance_pcd,self.mapper.grid_resolution) self.target_point = self.mapper.navigable_pcd.point.positions[self.affordance_pcd.argmax()].cpu().numpy() self.plan_position = self.mapper.current_position.copy() target_index = translate_point_to_grid(self.mapper.navigable_pcd,self.target_point,self.mapper.grid_resolution) start_index = translate_point_to_grid(self.mapper.navigable_pcd,self.mapper.current_position,self.mapper.grid_resolution) self.path = path_planning(self.affordance_map,start_index,target_index) self.path = [translate_grid_to_point(self.mapper.navigable_pcd,np.array([[waypoint.y,waypoint.x,0]]),self.mapper.grid_resolution)[0] for waypoint in self.path] if len(self.path) == 0: self.waypoint = self.mapper.navigable_pcd.point.positions.cpu().numpy()[np.argmax(self.affordance_pcd)] self.waypoint[2] = self.mapper.current_position[2] elif len(self.path) < 5: self.waypoint = self.path[-1] self.waypoint[2] = self.mapper.current_position[2] else: self.waypoint = self.path[4] self.waypoint[2] = self.mapper.current_position[2] self.affordance_trajectory.append(self.colored_affordance_pcd) self.obstacle_affordance_trajectory.append(self.obs_afford) self.semantic_affordance_trajectory.append(self.semantic_afford) self.history_affordance_trajectory.append(self.history_afford) self.action_affordance_trajectory.append(self.action_afford) self.gpt4v_affordance_trajectory.append(self.gpt4v_afford) 
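To make the grid round trip in make_plan concrete, here is a minimal, self-contained sketch (not part of the repository) of the same pipeline: quantize metric points into costmap indices the way translate_point_to_grid does, run the weighted A* configuration used by path_planning, and map the returned grid nodes back to metric waypoints as translate_grid_to_point does. The helpers point_to_grid/grid_to_point, the map origin min_bound, the toy costmap, and the start/goal points are all invented for this demo.

```
import numpy as np
from pathfinding.core.diagonal_movement import DiagonalMovement
from pathfinding.core.grid import Grid
from pathfinding.finder.a_star import AStarFinder

grid_resolution = 0.25                    # default grid resolution of the projection utilities
min_bound = np.array([-2.0, -2.0, 0.0])   # invented map origin for this demo

def point_to_grid(point):
    # same floor-quantization as translate_point_to_grid
    return np.floor((point - min_bound) / grid_resolution).astype(int)[:2]

def grid_to_point(row, col):
    # inverse mapping, as in translate_grid_to_point (returns the cell corner)
    return np.array([row, col, 0.0]) * grid_resolution + min_bound

costmap = 0.5 * np.ones((16, 16), np.float32)         # uniform traversal cost
costmap[4:12, 8] = 1.0                                # a band of expensive cells to route around
grid = Grid(matrix=(costmap * 100).astype(np.int32))  # positive entries act as A* weights
finder = AStarFinder(diagonal_movement=DiagonalMovement.always)

start = point_to_grid(np.array([-1.5, -1.5, 0.0]))
goal = point_to_grid(np.array([1.5, 1.5, 0.0]))
# Grid.node takes (column, row); the returned GridNode stores .x = column, .y = row
path, _ = finder.find_path(grid.node(start[1], start[0]), grid.node(goal[1], goal[0]), grid)
waypoints = [grid_to_point(node.y, node.x) for node in path]
print(len(waypoints), waypoints[0], waypoints[-1])
```

As in step above, a waypoint a few cells along this path (rather than the final goal) would then be handed to the low-level controller as the short-horizon target.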
def step(self): to_target_distance = np.sqrt(np.sum(np.square(self.mapper.current_position - self.waypoint))) if to_target_distance < 0.6 and len(self.path) > 0: self.path = self.path[min(5,len(self.path)-1):] if len(self.path) < 3: self.waypoint = self.path[-1] self.waypoint[2] = self.mapper.current_position[2] else: self.waypoint = self.path[2] self.waypoint[2] = self.mapper.current_position[2] pid_waypoint = self.waypoint + self.mapper.initial_position pid_waypoint = np.array([pid_waypoint[0],self.env.sim.get_agent_state().position[1],pid_waypoint[1]]) act = self.planner.get_next_action(pid_waypoint) move_distance = np.sqrt(np.sum(np.square(self.mapper.current_position - self.plan_position))) if (act == 0 or move_distance > 3.0) and not self.found_goal: self.make_plan(rotate=True) pid_waypoint = self.waypoint + self.mapper.initial_position pid_waypoint = np.array([pid_waypoint[0],self.env.sim.get_agent_state().position[1],pid_waypoint[1]]) act = self.planner.get_next_action(pid_waypoint) if act == 0 and not self.found_goal: self.make_plan(False,True) pid_waypoint = self.waypoint + self.mapper.initial_position pid_waypoint = np.array([pid_waypoint[0],self.env.sim.get_agent_state().position[1],pid_waypoint[1]]) act = self.planner.get_next_action(pid_waypoint) print("Warning: Failure locomotion and action = %d"%act) if not self.env.episode_over: self.obs = self.env.step(act) self.update_trajectory() ================================================ FILE: objnav_benchmark.py ================================================ import habitat import os import argparse import csv from tqdm import tqdm from config_utils import hm3d_config,mp3d_config from mapping_utils.transform import habitat_camera_intrinsic from mapper import Instruct_Mapper from objnav_agent import HM3D_Objnav_Agent os.environ['CUDA_VISIBLE_DEVICES'] = '0' os.environ["MAGNUM_LOG"] = "quiet" os.environ["HABITAT_SIM_LOG"] = "quiet" def write_metrics(metrics,path="objnav_hm3d.csv"): with open(path, mode="w", newline="") as csv_file: fieldnames = metrics[0].keys() writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() writer.writerows(metrics) def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--eval_episodes",type=int,default=500) parser.add_argument("--mapper_resolution",type=float,default=0.05) parser.add_argument("--path_resolution",type=float,default=0.2) parser.add_argument("--path_scale",type=int,default=5) return parser.parse_known_args()[0] if __name__ == "__main__": args = get_args() habitat_config = hm3d_config(stage='val',episodes=args.eval_episodes) habitat_env = habitat.Env(habitat_config) habitat_mapper = Instruct_Mapper(habitat_camera_intrinsic(habitat_config), pcd_resolution=args.mapper_resolution, grid_resolution=args.path_resolution, grid_size=args.path_scale) habitat_agent = HM3D_Objnav_Agent(habitat_env,habitat_mapper) evaluation_metrics = [] for i in tqdm(range(args.eval_episodes)): habitat_agent.reset() habitat_agent.make_plan() while not habitat_env.episode_over and habitat_agent.episode_steps < 495: habitat_agent.step() habitat_agent.save_trajectory("./tmp/episode-%d/"%i) evaluation_metrics.append({'success':habitat_agent.metrics['success'], 'spl':habitat_agent.metrics['spl'], 'distance_to_goal':habitat_agent.metrics['distance_to_goal'], 'object_goal':habitat_agent.instruct_goal}) write_metrics(evaluation_metrics) ================================================ FILE: requirements.txt ================================================ apex==0.9.10dev einops==0.8.0 
fairscale==0.4.4 fvcore==0.1.5.post20221221 imageio==2.34.1 matplotlib==3.8.4 MultiScaleDeformableAttention==1.0 numpy==1.23.5 numpy_quaternion==2023.0.3 omegaconf==2.3.0 open3d==0.18.0 openai==1.45.0 opencv_python==4.4.0.46 opencv_python_headless==4.5.5.64 pathfinding==1.0.9 Pillow==10.4.0 Requests==2.32.3 salesforce_lavis==1.0.2 scipy==1.14.1 setuptools==60.2.0 timm==0.4.12 torch==2.2.2+cu121 torchvision==0.17.2+cu121 tqdm==4.65.2 transformers==4.26.1 xformers==0.0.28.post1 ================================================ FILE: thirdparty/GLEE/configs/R50.yaml ================================================ MODEL: META_ARCHITECTURE: "GLEE" MASK_ON: True BACKBONE: FREEZE_AT: 0 NAME: "build_resnet_backbone" PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used SEM_SEG_HEAD: NAME: "MaskDINOHead" IGNORE_VALUE: 255 NUM_CLASSES: 80 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MaskDINOEncoder" DIM_FEEDFORWARD: 2048 NUM_FEATURE_LEVELS: 3 TOTAL_NUM_FEATURE_LEVELS: 4 IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 FEATURE_ORDER: "low2high" MaskDINO: TRANSFORMER_DECODER_NAME: "MaskDINODecoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 4.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 BOX_WEIGHT: 5.0 GIOU_WEIGHT: 2.0 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 300 NHEADS: 8 DROPOUT: 0.0 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 INITIAL_PRED: True TWO_STAGE: True DN: "standard" DN_NUM: 100 INITIALIZE_BOX_TYPE: "no" TEST: SEMANTIC_ON: False INSTANCE_ON: True PANOPTIC_ON: False OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.25 TEXT: ARCH: clip_teacher LANGUAGE_BACKBONE: LANG_DIM: 512 ================================================ FILE: thirdparty/GLEE/configs/SwinL.yaml ================================================ MODEL: META_ARCHITECTURE: "GLEE" MASK_ON: True BACKBONE: NAME: "D2SwinTransformer" SWIN: EMBED_DIM: 192 DEPTHS: [2, 2, 18, 2] NUM_HEADS: [6, 12, 24, 48] WINDOW_SIZE: 12 APE: False DROP_PATH_RATE: 0.3 PATCH_NORM: True PRETRAIN_IMG_SIZE: 384 PIXEL_MEAN: [123.675, 116.280, 103.530] PIXEL_STD: [58.395, 57.120, 57.375] RESNETS: DEPTH: 50 STEM_TYPE: "basic" # not used STEM_OUT_CHANNELS: 64 STRIDE_IN_1X1: False OUT_FEATURES: ["res2", "res3", "res4", "res5"] # NORM: "SyncBN" RES5_MULTI_GRID: [1, 1, 1] # not used SEM_SEG_HEAD: NAME: "MaskDINOHead" IGNORE_VALUE: 255 NUM_CLASSES: 80 LOSS_WEIGHT: 1.0 CONVS_DIM: 256 MASK_DIM: 256 NORM: "GN" # pixel decoder PIXEL_DECODER_NAME: "MaskDINOEncoder" DIM_FEEDFORWARD: 2048 NUM_FEATURE_LEVELS: 3 TOTAL_NUM_FEATURE_LEVELS: 4 IN_FEATURES: ["res2", "res3", "res4", "res5"] DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] COMMON_STRIDE: 4 TRANSFORMER_ENC_LAYERS: 6 FEATURE_ORDER: "low2high" MaskDINO: TRANSFORMER_DECODER_NAME: "MaskDINODecoder" DEEP_SUPERVISION: True NO_OBJECT_WEIGHT: 0.1 CLASS_WEIGHT: 4.0 MASK_WEIGHT: 5.0 DICE_WEIGHT: 5.0 BOX_WEIGHT: 5.0 GIOU_WEIGHT: 2.0 HIDDEN_DIM: 256 NUM_OBJECT_QUERIES: 300 NHEADS: 8 DROPOUT: 0.0 DIM_FEEDFORWARD: 2048 ENC_LAYERS: 0 
PRE_NORM: False ENFORCE_INPUT_PROJ: False SIZE_DIVISIBILITY: 32 DEC_LAYERS: 9 # 9+1, 9 decoder layers, add one for the loss on learnable query TRAIN_NUM_POINTS: 12544 OVERSAMPLE_RATIO: 3.0 IMPORTANCE_SAMPLE_RATIO: 0.75 INITIAL_PRED: True TWO_STAGE: True DN: "standard" DN_NUM: 100 INITIALIZE_BOX_TYPE: "no" TEST: SEMANTIC_ON: False INSTANCE_ON: True PANOPTIC_ON: False OVERLAP_THRESHOLD: 0.8 OBJECT_MASK_THRESHOLD: 0.25 TEXT: ARCH: clip_teacher LANGUAGE_BACKBONE: LANG_DIM: 512 ================================================ FILE: thirdparty/GLEE/glee/__init__.py ================================================ from __future__ import absolute_import from __future__ import division from __future__ import print_function from .config import add_glee_config from .config_deeplab import add_deeplab_config # from .GLEE import GLEE # from .data import build_detection_train_loader, build_detection_test_loader from .backbone.swin import D2SwinTransformer from .backbone.eva02 import D2_EVA02 ================================================ FILE: thirdparty/GLEE/glee/backbone/__init__.py ================================================ from .build import build_backbone from .resnet import * from .swin import * # from .focal import * # from .focal_dw import * from .backbone import * ================================================ FILE: thirdparty/GLEE/glee/backbone/backbone.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch.nn as nn from detectron2.modeling import ShapeSpec __all__ = ["Backbone"] class Backbone(nn.Module): """ Abstract base class for network backbones. """ def __init__(self): """ The `__init__` method of any subclass can specify its own set of arguments. """ super().__init__() def forward(self): """ Subclasses must override this method, but adhere to the same return type. Returns: dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor """ pass @property def size_divisibility(self) -> int: """ Some backbones require the input height and width to be divisible by a specific integer. This is typically true for encoder / decoder type networks with lateral connection (e.g., FPN) for which feature maps need to match dimension in the "bottom up" and "top down" paths. Set to 0 if no specific input size divisibility is required. 
""" return 0 def output_shape(self): """ Returns: dict[str->ShapeSpec] """ # this is a backward-compatible default return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } ================================================ FILE: thirdparty/GLEE/glee/backbone/build.py ================================================ from .registry import model_entrypoints from .registry import is_model from .backbone import * def build_backbone(config, **kwargs): model_name = config['MODEL']['BACKBONE']['NAME'] if not is_model(model_name): raise ValueError(f'Unkown model: {model_name}') model = model_entrypoints(model_name)(config, **kwargs) return model ================================================ FILE: thirdparty/GLEE/glee/backbone/davit.py ================================================ import os import itertools import logging import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint from collections import OrderedDict from einops import rearrange from timm.models.layers import DropPath, trunc_normal_ from detectron2.utils.file_io import PathManager from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec from .registry import register_backbone logger = logging.getLogger(__name__) class MySequential(nn.Sequential): def forward(self, *inputs): for module in self._modules.values(): if type(inputs) == tuple: inputs = module(*inputs) else: inputs = module(inputs) return inputs class PreNorm(nn.Module): def __init__(self, norm, fn, drop_path=None): super().__init__() self.norm = norm self.fn = fn self.drop_path = drop_path def forward(self, x, *args, **kwargs): shortcut = x if self.norm != None: x, size = self.fn(self.norm(x), *args, **kwargs) else: x, size = self.fn(x, *args, **kwargs) if self.drop_path: x = self.drop_path(x) x = shortcut + x return x, size class Mlp(nn.Module): def __init__( self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.net = nn.Sequential(OrderedDict([ ("fc1", nn.Linear(in_features, hidden_features)), ("act", act_layer()), ("fc2", nn.Linear(hidden_features, out_features)) ])) def forward(self, x, size): return self.net(x), size class DepthWiseConv2d(nn.Module): def __init__( self, dim_in, kernel_size, padding, stride, bias=True, ): super().__init__() self.dw = nn.Conv2d( dim_in, dim_in, kernel_size=kernel_size, padding=padding, groups=dim_in, stride=stride, bias=bias ) def forward(self, x, size): B, N, C = x.shape H, W = size assert N == H * W x = self.dw(x.transpose(1, 2).view(B, C, H, W)) size = (x.size(-2), x.size(-1)) x = x.flatten(2).transpose(1, 2) return x, size class ConvEmbed(nn.Module): """ Image to Patch Embedding """ def __init__( self, patch_size=7, in_chans=3, embed_dim=64, stride=4, padding=2, norm_layer=None, pre_norm=True ): super().__init__() self.patch_size = patch_size self.proj = nn.Conv2d( in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=padding ) dim_norm = in_chans if pre_norm else embed_dim self.norm = norm_layer(dim_norm) if norm_layer else None self.pre_norm = pre_norm def forward(self, x, size): H, W = size if len(x.size()) == 3: if self.norm and self.pre_norm: x = self.norm(x) x = rearrange( x, 'b (h w) c -> b c h w', h=H, w=W ) x = self.proj(x) _, _, H, W = x.shape x = rearrange(x, 'b c h w -> b (h w) c') if self.norm and not 
self.pre_norm: x = self.norm(x) return x, (H, W) class ChannelAttention(nn.Module): def __init__(self, dim, groups=8, qkv_bias=True): super().__init__() self.groups = groups self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.proj = nn.Linear(dim, dim) def forward(self, x, size): B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] q = q * (N ** -0.5) attention = q.transpose(-1, -2) @ k attention = attention.softmax(dim=-1) x = (attention @ v.transpose(-1, -2)).transpose(-1, -2) x = x.transpose(1, 2).reshape(B, N, C) x = self.proj(x) return x, size class ChannelBlock(nn.Module): def __init__(self, dim, groups, mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True): super().__init__() drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None self.channel_attn = PreNorm( norm_layer(dim), ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias), drop_path ) self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None self.ffn = PreNorm( norm_layer(dim), Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer), drop_path ) def forward(self, x, size): if self.conv1: x, size = self.conv1(x, size) x, size = self.channel_attn(x, size) if self.conv2: x, size = self.conv2(x, size) x, size = self.ffn(x, size) return x, size def window_partition(x, window_size: int): B, H, W, C = x.shape x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows def window_reverse(windows, window_size: int, H: int, W: int): B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x class WindowAttention(nn.Module): def __init__(self, dim, num_heads, window_size, qkv_bias=True): super().__init__() self.dim = dim self.window_size = window_size self.num_heads = num_heads head_dim = dim // num_heads self.scale = head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.proj = nn.Linear(dim, dim) self.softmax = nn.Softmax(dim=-1) def forward(self, x, size): H, W = size B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.view(B, H, W, C) pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) _, Hp, Wp, _ = x.shape x = window_partition(x, self.window_size) x = x.view(-1, self.window_size * self.window_size, C) # W-MSA/SW-MSA # attn_windows = self.attn(x_windows) B_, N, C = x.shape qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] q = q * self.scale attn = (q @ k.transpose(-2, -1)) attn = self.softmax(attn) x = (attn @ v).transpose(1, 2).reshape(B_, N, C) x = self.proj(x) # merge windows x = x.view( -1, self.window_size, self.window_size, C ) x = window_reverse(x, self.window_size, Hp, Wp) if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :].contiguous() x = x.view(B, H * W, C) return x, size class SpatialBlock(nn.Module): def __init__(self, dim, num_heads, window_size, mlp_ratio=4., 
qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True): super().__init__() drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None self.window_attn = PreNorm( norm_layer(dim), WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias), drop_path ) self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None self.ffn = PreNorm( norm_layer(dim), Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer), drop_path ) def forward(self, x, size): if self.conv1: x, size = self.conv1(x, size) x, size = self.window_attn(x, size) if self.conv2: x, size = self.conv2(x, size) x, size = self.ffn(x, size) return x, size class DaViT(nn.Module): """ DaViT: Dual-Attention Transformer Args: img_size (int): Image size, Default: 224. in_chans (int): Number of input image channels. Default: 3. num_classes (int): Number of classes for classification head. Default: 1000. patch_size (tuple(int)): Patch size of convolution in different stages. Default: (7, 2, 2, 2). patch_stride (tuple(int)): Patch stride of convolution in different stages. Default: (4, 2, 2, 2). patch_padding (tuple(int)): Patch padding of convolution in different stages. Default: (3, 0, 0, 0). patch_prenorm (tuple(bool)): If True, perform norm before convlution layer. Default: (True, False, False, False). embed_dims (tuple(int)): Patch embedding dimension in different stages. Default: (64, 128, 192, 256). num_heads (tuple(int)): Number of spatial attention heads in different stages. Default: (4, 8, 12, 16). num_groups (tuple(int)): Number of channel groups in different stages. Default: (4, 8, 12, 16). window_size (int): Window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True. drop_path_rate (float): Stochastic depth rate. Default: 0.1. norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. enable_checkpoint (bool): If True, enable checkpointing. Default: False. conv_at_attn (bool): If True, performe depthwise convolution before attention layer. Default: True. conv_at_ffn (bool): If True, performe depthwise convolution before ffn layer. Default: True. 
""" def __init__( self, img_size=224, in_chans=3, num_classes=1000, depths=(1, 1, 3, 1), patch_size=(7, 2, 2, 2), patch_stride=(4, 2, 2, 2), patch_padding=(3, 0, 0, 0), patch_prenorm=(False, False, False, False), embed_dims=(64, 128, 192, 256), num_heads=(3, 6, 12, 24), num_groups=(3, 6, 12, 24), window_size=7, mlp_ratio=4., qkv_bias=True, drop_path_rate=0.1, norm_layer=nn.LayerNorm, enable_checkpoint=False, conv_at_attn=True, conv_at_ffn=True, out_indices=[], ): super().__init__() self.num_classes = num_classes self.embed_dims = embed_dims self.num_heads = num_heads self.num_groups = num_groups self.num_stages = len(self.embed_dims) self.enable_checkpoint = enable_checkpoint assert self.num_stages == len(self.num_heads) == len(self.num_groups) num_stages = len(embed_dims) self.img_size = img_size dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)] depth_offset = 0 convs = [] blocks = [] for i in range(num_stages): conv_embed = ConvEmbed( patch_size=patch_size[i], stride=patch_stride[i], padding=patch_padding[i], in_chans=in_chans if i == 0 else self.embed_dims[i - 1], embed_dim=self.embed_dims[i], norm_layer=norm_layer, pre_norm=patch_prenorm[i] ) convs.append(conv_embed) print(f'=> Depth offset in stage {i}: {depth_offset}') block = MySequential( *[ MySequential(OrderedDict([ ( 'spatial_block', SpatialBlock( embed_dims[i], num_heads[i], window_size, drop_path_rate=dpr[depth_offset+j*2], qkv_bias=qkv_bias, mlp_ratio=mlp_ratio, conv_at_attn=conv_at_attn, conv_at_ffn=conv_at_ffn, ) ), ( 'channel_block', ChannelBlock( embed_dims[i], num_groups[i], drop_path_rate=dpr[depth_offset+j*2+1], qkv_bias=qkv_bias, mlp_ratio=mlp_ratio, conv_at_attn=conv_at_attn, conv_at_ffn=conv_at_ffn, ) ) ])) for j in range(depths[i]) ] ) blocks.append(block) depth_offset += depths[i]*2 self.convs = nn.ModuleList(convs) self.blocks = nn.ModuleList(blocks) self.out_indices = out_indices # self.norms = norm_layer(self.embed_dims[-1]) # self.avgpool = nn.AdaptiveAvgPool1d(1) # self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity() self.apply(self._init_weights) @property def dim_out(self): return self.embed_dims[-1] def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=0.02) if m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Conv2d): nn.init.normal_(m.weight, std=0.02) for name, _ in m.named_parameters(): if name in ['bias']: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.weight, 1.0) nn.init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1.0) nn.init.constant_(m.bias, 0) def _try_remap_keys(self, pretrained_dict): remap_keys = { "conv_embeds": "convs", "main_blocks": "blocks", "0.cpe.0.proj": "spatial_block.conv1.fn.dw", "0.attn": "spatial_block.window_attn.fn", "0.cpe.1.proj": "spatial_block.conv2.fn.dw", "0.mlp": "spatial_block.ffn.fn.net", "1.cpe.0.proj": "channel_block.conv1.fn.dw", "1.attn": "channel_block.channel_attn.fn", "1.cpe.1.proj": "channel_block.conv2.fn.dw", "1.mlp": "channel_block.ffn.fn.net", "0.norm1": "spatial_block.window_attn.norm", "0.norm2": "spatial_block.ffn.norm", "1.norm1": "channel_block.channel_attn.norm", "1.norm2": "channel_block.ffn.norm" } full_key_mappings = {} for k in pretrained_dict.keys(): old_k = k for remap_key in remap_keys.keys(): if remap_key in k: print(f'=> Repace {remap_key} with {remap_keys[remap_key]}') k = k.replace(remap_key, remap_keys[remap_key]) full_key_mappings[old_k] = k 
return full_key_mappings def from_state_dict(self, pretrained_dict, pretrained_layers=[], verbose=True): model_dict = self.state_dict() stripped_key = lambda x: x[14:] if x.startswith('image_encoder.') else x full_key_mappings = self._try_remap_keys(pretrained_dict) pretrained_dict = { stripped_key(full_key_mappings[k]): v for k, v in pretrained_dict.items() if stripped_key(full_key_mappings[k]) in model_dict.keys() } need_init_state_dict = {} for k, v in pretrained_dict.items(): need_init = ( k.split('.')[0] in pretrained_layers or pretrained_layers[0] == '*' ) if need_init: if verbose: print(f'=> init {k} from pretrained state dict') need_init_state_dict[k] = v self.load_state_dict(need_init_state_dict, strict=False) def from_pretrained(self, pretrained='', pretrained_layers=[], verbose=True): if os.path.isfile(pretrained): print(f'=> loading pretrained model {pretrained}') pretrained_dict = torch.load(pretrained, map_location='cpu') self.from_state_dict(pretrained_dict, pretrained_layers, verbose) def forward_features(self, x): input_size = (x.size(2), x.size(3)) outs = {} for i, (conv, block) in enumerate(zip(self.convs, self.blocks)): x, input_size = conv(x, input_size) if self.enable_checkpoint: x, input_size = checkpoint.checkpoint(block, x, input_size) else: x, input_size = block(x, input_size) if i in self.out_indices: out = x.view(-1, *input_size, self.embed_dims[i]).permute(0, 3, 1, 2).contiguous() outs["res{}".format(i + 2)] = out if len(self.out_indices) == 0: outs["res5"] = x.view(-1, *input_size, self.embed_dims[-1]).permute(0, 3, 1, 2).contiguous() return outs def forward(self, x): x = self.forward_features(x) # x = self.head(x) return x class D2DaViT(DaViT, Backbone): def __init__(self, cfg, input_shape): spec = cfg['BACKBONE']['DAVIT'] super().__init__( num_classes=0, depths=spec['DEPTHS'], embed_dims=spec['DIM_EMBED'], num_heads=spec['NUM_HEADS'], num_groups=spec['NUM_GROUPS'], patch_size=spec['PATCH_SIZE'], patch_stride=spec['PATCH_STRIDE'], patch_padding=spec['PATCH_PADDING'], patch_prenorm=spec['PATCH_PRENORM'], drop_path_rate=spec['DROP_PATH_RATE'], img_size=input_shape, window_size=spec.get('WINDOW_SIZE', 7), enable_checkpoint=spec.get('ENABLE_CHECKPOINT', False), conv_at_attn=spec.get('CONV_AT_ATTN', True), conv_at_ffn=spec.get('CONV_AT_FFN', True), out_indices=spec.get('OUT_INDICES', []), ) self._out_features = cfg['BACKBONE']['DAVIT']['OUT_FEATURES'] self._out_feature_strides = { "res2": 4, "res3": 8, "res4": 16, "res5": 32, } self._out_feature_channels = { "res2": self.embed_dims[0], "res3": self.embed_dims[1], "res4": self.embed_dims[2], "res5": self.embed_dims[3], } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert ( x.dim() == 4 ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
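# run the DaViT trunk, then keep only the feature maps requested in OUT_FEATURES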
outputs = {} y = super().forward(x) for k in y.keys(): if k in self._out_features: outputs[k] = y[k] return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } @property def size_divisibility(self): return 32 @register_backbone def get_davit_backbone(cfg): davit = D2DaViT(cfg['MODEL'], 224) if cfg['MODEL']['BACKBONE']['LOAD_PRETRAINED'] is True: filename = cfg['MODEL']['BACKBONE']['PRETRAINED'] logger.info(f'=> init from {filename}') davit.from_pretrained( filename, cfg['MODEL']['BACKBONE']['DAVIT'].get('PRETRAINED_LAYERS', ['*']), cfg['VERBOSE']) return davit ================================================ FILE: thirdparty/GLEE/glee/backbone/eva01.py ================================================ import logging import math from functools import partial import fvcore.nn.weight_init as weight_init import torch import torch.nn as nn import torch.nn.functional as F from torch import Tensor, Size from typing import Union, List from torch.nn.parameter import Parameter import numbers from detectron2.layers import CNNBlockBase, Conv2d, get_norm from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous from fairscale.nn.checkpoint import checkpoint_wrapper from timm.models.layers import DropPath, Mlp, trunc_normal_ # from detectron2.modeling.backbone import Backbone from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec from .eva_01_utils import ( PatchEmbed, add_decomposed_rel_pos, get_abs_pos, window_partition, window_unpartition, ) from detectron2.modeling.backbone.fpn import LastLevelMaxPool logger = logging.getLogger(__name__) __all__ = ["EVAViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"] _shape_t = Union[int, List[int], Size] # steal from beit https://github.com/microsoft/unilm/tree/master/beit class LayerNormWithForceFP32(nn.Module): __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] normalized_shape: _shape_t eps: float elementwise_affine: bool def __init__(self, normalized_shape: _shape_t, eps: float = 1e-5, elementwise_affine: bool = True) -> None: super(LayerNormWithForceFP32, self).__init__() if isinstance(normalized_shape, numbers.Integral): normalized_shape = (normalized_shape,) self.normalized_shape = tuple(normalized_shape) self.eps = eps self.elementwise_affine = elementwise_affine if self.elementwise_affine: self.weight = Parameter(torch.Tensor(*normalized_shape)) self.bias = Parameter(torch.Tensor(*normalized_shape)) else: self.register_parameter('weight', None) self.register_parameter('bias', None) self.reset_parameters() def reset_parameters(self) -> None: if self.elementwise_affine: nn.init.ones_(self.weight) nn.init.zeros_(self.bias) def forward(self, input: Tensor) -> Tensor: return F.layer_norm( input.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps).type_as(input) def extra_repr(self) -> Tensor: return '{normalized_shape}, eps={eps}, ' \ 'elementwise_affine={elementwise_affine}'.format(**self.__dict__) class Attention(nn.Module): """Multi-head Attention block with relative position embeddings.""" def __init__( self, dim, num_heads=8, qkv_bias=True, beit_like_qkv_bias=False, use_rel_pos=False, rel_pos_zero_init=True, input_size=None, interp_type="vitdet", ): """ Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. qkv_bias (bool: If True, add a learnable bias to query, key, value. 
rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. input_size (int or None): Input resolution for calculating the relative positional parameter size. """ super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = head_dim**-0.5 self.beit_like_qkv_bias = beit_like_qkv_bias if beit_like_qkv_bias: self.q_bias = nn.Parameter(torch.zeros(dim)) self.v_bias = nn.Parameter(torch.zeros(dim)) self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.proj = nn.Linear(dim, dim) self.use_rel_pos = use_rel_pos self.interp_type = interp_type if self.use_rel_pos: # initialize relative positional embeddings self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) if not rel_pos_zero_init: trunc_normal_(self.rel_pos_h, std=0.02) trunc_normal_(self.rel_pos_w, std=0.02) self.qk_float = False def forward(self, x): B, H, W, _ = x.shape # qkv with shape (3, B, nHead, H * W, C) if self.beit_like_qkv_bias: qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) qkv = torch.nn.functional.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) qkv = qkv.reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) else: qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) # q, k, v with shape (B * nHead, H * W, C) q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) if self.qk_float: attn = (q.float() * self.scale) @ k.float().transpose(-2, -1) if self.use_rel_pos: attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W), self.interp_type) attn = attn.softmax(dim=-1).type_as(x) else: attn = (q * self.scale) @ k.transpose(-2, -1) if self.use_rel_pos: attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W), self.interp_type) attn = attn.softmax(dim=-1) x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) x = self.proj(x) return x class ResBottleneckBlock(CNNBlockBase): """ The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels 1x1, 3x3, 1x1. """ def __init__( self, in_channels, out_channels, bottleneck_channels, norm="LN", act_layer=nn.GELU, ): """ Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. bottleneck_channels (int): number of output channels for the 3x3 "bottleneck" conv layers. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. act_layer (callable): activation for all conv layers. """ super().__init__(in_channels, out_channels, 1) self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False) self.norm1 = get_norm(norm, bottleneck_channels) self.act1 = act_layer() self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False, ) self.norm2 = get_norm(norm, bottleneck_channels) self.act2 = act_layer() self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False) self.norm3 = get_norm(norm, out_channels) for layer in [self.conv1, self.conv2, self.conv3]: weight_init.c2_msra_fill(layer) for layer in [self.norm1, self.norm2]: layer.weight.data.fill_(1.0) layer.bias.data.zero_() # zero init last norm layer. 
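# (with norm3 zeroed out, the residual block starts as an identity mapping)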
self.norm3.weight.data.zero_() self.norm3.bias.data.zero_() def forward(self, x): out = x for layer in self.children(): out = layer(out) out = x + out return out class Block(nn.Module): """Transformer blocks with support of window attention and residual propagation blocks""" def __init__( self, dim, num_heads, mlp_ratio=4.0, qkv_bias=True, drop_path=0.0, norm_layer=LayerNormWithForceFP32, act_layer=nn.GELU, use_rel_pos=False, rel_pos_zero_init=True, window_size=0, use_residual_block=False, input_size=None, beit_like_qkv_bias=False, beit_like_gamma=False, interp_type="vitdet", ): """ Args: dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. If it equals 0, then not use window attention. use_residual_block (bool): If True, use a residual block after the MLP block. input_size (int or None): Input resolution for calculating the relative positional parameter size. beit_like_qkv_bias (bool) beit_like_gamma (bool) """ super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, input_size=input_size if window_size == 0 else (window_size, window_size), beit_like_qkv_bias=beit_like_qkv_bias, interp_type=interp_type, ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer) self.window_size = window_size self.use_residual_block = use_residual_block if use_residual_block: # Use a residual block with bottleneck channel as dim // 2 self.residual = ResBottleneckBlock( in_channels=dim, out_channels=dim, bottleneck_channels=dim // 2, norm="LN", act_layer=act_layer, ) self.beit_like_gamma = beit_like_gamma if beit_like_gamma: self.gamma_1 = nn.Parameter(torch.ones((dim)), requires_grad=True) self.gamma_2 = nn.Parameter(torch.ones((dim)), requires_grad=True) def forward(self, x): shortcut = x x = self.norm1(x) # Window partition if self.window_size > 0: H, W = x.shape[1], x.shape[2] x, pad_hw = window_partition(x, self.window_size) x = self.attn(x) # Reverse window partition if self.window_size > 0: x = window_unpartition(x, self.window_size, pad_hw, (H, W)) if self.beit_like_gamma: x = shortcut + self.drop_path(self.gamma_1 * x) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) else: x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) if self.use_residual_block: x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) return x class EVAViT(Backbone): """ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. 
"Exploring Plain Vision Transformer Backbones for Object Detection", https://arxiv.org/abs/2203.16527 """ def __init__( self, img_size=1024, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, qkv_bias=True, drop_path_rate=0.0, norm_layer=LayerNormWithForceFP32, act_layer=nn.GELU, use_abs_pos=True, use_rel_pos=False, rel_pos_zero_init=True, window_size=0, window_block_indexes=(), residual_block_indexes=(), use_act_checkpoint=False, pretrain_img_size=224, pretrain_use_cls_token=True, out_feature="last_feat", beit_like_qkv_bias=True, beit_like_gamma=False, freeze_patch_embed=False, interp_type="vitdet", ): """ Args: img_size (int): Input image size. patch_size (int): Patch size. in_chans (int): Number of input image channels. embed_dim (int): Patch embedding dimension. depth (int): Depth of ViT. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path_rate (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. window_block_indexes (list): Indexes for blocks using window attention. residual_block_indexes (list): Indexes for blocks using conv propagation. use_act_checkpoint (bool): If True, use activation checkpointing. pretrain_img_size (int): input image size for pretraining models. pretrain_use_cls_token (bool): If True, pretrainig models use class token. out_feature (str): name of the feature from the last block. beit_like_qkv_bias (bool): beit_like_model that has gamma_1 and gamma_2 in blocks and qkv_bias=False beit_like_gamma (bool) freeze_patch_embed (bool) interp_type: "vitdet" for training / fine-ting, "beit" for eval (slightly improvement at a higher res) """ super().__init__() self.pretrain_use_cls_token = pretrain_use_cls_token self.patch_embed = PatchEmbed( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, embed_dim=embed_dim, ) if use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size) num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim)) else: self.pos_embed = None # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] self.blocks = nn.ModuleList() if beit_like_qkv_bias: qkv_bias = False for i in range(depth): block = Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=window_size if i in window_block_indexes else 0, use_residual_block=i in residual_block_indexes, input_size=(img_size // patch_size, img_size // patch_size), beit_like_qkv_bias=beit_like_qkv_bias, beit_like_gamma=beit_like_gamma, interp_type=interp_type, ) if use_act_checkpoint: block = checkpoint_wrapper(block) self.blocks.append(block) self._out_feature_channels = {out_feature: embed_dim} self._out_feature_strides = {out_feature: patch_size} self._out_features = [out_feature] if self.pos_embed is not None: trunc_normal_(self.pos_embed, std=0.02) self.freeze_patch_embed = freeze_patch_embed self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, LayerNormWithForceFP32): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) if self.freeze_patch_embed: for n, p in self.patch_embed.named_parameters(): p.requires_grad = False def forward(self, x): x = self.patch_embed(x) if self.pos_embed is not None: x = x + get_abs_pos( self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2]) ) for blk in self.blocks: x = blk(x) outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)} return outputs class SimpleFeaturePyramid(Backbone): """ This module implements SimpleFeaturePyramid in :paper:`vitdet`. It creates pyramid features built on top of the input feature map. """ def __init__( self, net, in_feature, out_channels, scale_factors, top_block=None, norm="LN", square_pad=0, ): """ Args: net (Backbone): module representing the subnetwork backbone. Must be a subclass of :class:`Backbone`. in_feature (str): names of the input feature maps coming from the net. out_channels (int): number of channels in the output feature maps. scale_factors (list[float]): list of scaling factors to upsample or downsample the input features for creating pyramid features. top_block (nn.Module or None): if provided, an extra operation will be performed on the output of the last (smallest resolution) pyramid output, and the result will extend the result list. The top_block further downsamples the feature map. It must have an attribute "num_levels", meaning the number of extra pyramid levels added by this block, and "in_feature", which is a string representing its input feature (e.g., p5). norm (str): the normalization to use. square_pad (int): If > 0, require input images to be padded to specific square size. 
""" super(SimpleFeaturePyramid, self).__init__() assert isinstance(net, Backbone) self.scale_factors = scale_factors input_shapes = net.output_shape() strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors] _assert_strides_are_log2_contiguous(strides) dim = input_shapes[in_feature].channels self.stages = [] use_bias = norm == "" for idx, scale in enumerate(scale_factors): out_dim = dim if scale == 4.0: layers = [ nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2), get_norm(norm, dim // 2), nn.GELU(), nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2), ] out_dim = dim // 4 elif scale == 2.0: layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)] out_dim = dim // 2 elif scale == 1.0: layers = [] elif scale == 0.5: layers = [nn.MaxPool2d(kernel_size=2, stride=2)] else: raise NotImplementedError(f"scale_factor={scale} is not supported yet.") layers.extend( [ Conv2d( out_dim, out_channels, kernel_size=1, bias=use_bias, norm=get_norm(norm, out_channels), ), Conv2d( out_channels, out_channels, kernel_size=3, padding=1, bias=use_bias, norm=get_norm(norm, out_channels), ), ] ) layers = nn.Sequential(*layers) stage = int(math.log2(strides[idx])) self.add_module(f"simfp_{stage}", layers) self.stages.append(layers) self.net = net self.in_feature = in_feature self.top_block = top_block # Return feature names are "p", like ["p2", "p3", ..., "p6"] self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} # top block output feature maps. if self.top_block is not None: for s in range(stage, stage + self.top_block.num_levels): self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) self._out_features = list(self._out_feature_strides.keys()) self._out_feature_channels = {k: out_channels for k in self._out_features} self._size_divisibility = strides[-1] self._square_pad = square_pad @property def padding_constraints(self): return { "size_divisiblity": self._size_divisibility, "square_size": self._square_pad, } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: mapping from feature map name to pyramid feature map tensor in high to low resolution order. Returned feature names follow the FPN convention: "p", where stage has stride = 2 ** stage e.g., ["p2", "p3", ..., "p6"]. 
""" bottom_up_features = self.net(x) features = bottom_up_features[self.in_feature] results = [] for stage in self.stages: results.append(stage(features)) if self.top_block is not None: if self.top_block.in_feature in bottom_up_features: top_block_in_feature = bottom_up_features[self.top_block.in_feature] else: top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] results.extend(self.top_block(top_block_in_feature)) assert len(self._out_features) == len(results) return {f: res for f, res in zip(self._out_features, results)} @BACKBONE_REGISTRY.register() class D2_EVA01(SimpleFeaturePyramid): def __init__(self, cfg, input_shape): super().__init__( net = EVAViT( img_size= cfg.MODEL.EVA01.IMAGE_SIZE, patch_size=cfg.MODEL.EVA01.PATCH_SIZE, window_size= cfg.MODEL.EVA01.WINDOW_SIZE, embed_dim= cfg.MODEL.EVA01.DMBED_DIM, depth= cfg.MODEL.EVA01.DEPTH, num_heads= cfg.MODEL.EVA01.NUM_HEADS , drop_path_rate= cfg.MODEL.EVA01.DROP_PATH_RATE, mlp_ratio= cfg.MODEL.EVA01.MLP_RATIO, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), window_block_indexes= cfg.MODEL.EVA01.WINDOW_BLOCK_INDEXES, residual_block_indexes=[], use_act_checkpoint = True, use_rel_pos = True, out_feature="last_feat", beit_like_qkv_bias=cfg.MODEL.EVA01.BEIT_LIKE_QKV_BIAS , beit_like_gamma= cfg.MODEL.EVA01.BEIT_LIKE_GAMMA, freeze_patch_embed= cfg.MODEL.EVA01.FREEZE_PATH_EMBED, ), in_feature = "last_feat", out_channels=256, scale_factors=(2.0, 1.0, 0.5), # (4.0, 2.0, 1.0, 0.5) in ViTDet top_block=LastLevelMaxPool(), norm="LN", square_pad=cfg.MODEL.EVA01.IMAGE_SIZE, ) pretrained_weight = cfg.MODEL.EVA01.PRETRAINED_WEIGHT if pretrained_weight: checkpoint = torch.load(pretrained_weight, map_location='cpu') print(f'\nload pretrain weight from {pretrained_weight} \n') self.load_state_dict(checkpoint['model'], strict=False) def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } @property def size_divisibility(self): return 32 def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): """ Calculate lr decay rate for different ViT blocks. Args: name (string): parameter name. lr_decay_rate (float): base lr decay rate. num_layers (int): number of ViT blocks. Returns: lr decay rate for the given parameter. """ layer_id = num_layers + 1 if 'backbone' in name: #name.startswith("backbone"): if ".pos_embed" in name or ".patch_embed" in name: layer_id = 0 elif ".blocks." in name and ".residual." 
not in name: layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 return lr_decay_rate ** (num_layers + 1 - layer_id) ================================================ FILE: thirdparty/GLEE/glee/backbone/eva02-dino.py ================================================ import logging import math from functools import partial import fvcore.nn.weight_init as weight_init import torch import torch.nn as nn import torch.nn.functional as F from detectron2.layers import CNNBlockBase, Conv2d, get_norm from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous from detectron2.modeling.backbone import Backbone from .eva_02_utils import ( PatchEmbed, add_decomposed_rel_pos, get_abs_pos, window_partition, window_unpartition, VisionRotaryEmbeddingFast, ) try: import xformers.ops as xops HAS_XFORMER=True except: HAS_XFORMER=False pass logger = logging.getLogger(__name__) __all__ = ["EVA02_ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"] class SwiGLU(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0., norm_layer=nn.LayerNorm, subln=False ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.w1 = nn.Linear(in_features, hidden_features) self.w2 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() self.w3 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x1 = self.w1(x) x2 = self.w2(x) hidden = self.act(x1) * x2 x = self.ffn_ln(hidden) x = self.w3(x) x = self.drop(x) return x class Attention(nn.Module): def __init__( self, dim, num_heads=8, qkv_bias=True, qk_scale=None, attn_head_dim=None, rope=None, xattn=True, ): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads if attn_head_dim is not None: head_dim = attn_head_dim all_head_dim = head_dim * self.num_heads self.scale = qk_scale or head_dim ** -0.5 self.q_proj = nn.Linear(dim, all_head_dim, bias=False) self.k_proj = nn.Linear(dim, all_head_dim, bias=False) self.v_proj = nn.Linear(dim, all_head_dim, bias=False) if qkv_bias: self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) else: self.q_bias = None self.v_bias = None self.rope = rope self.xattn = xattn self.proj = nn.Linear(all_head_dim, dim) if not HAS_XFORMER: self.xattn = False def forward(self, x): B, H, W, C = x.shape x = x.view(B, -1, C) N = H * W q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) k = F.linear(input=x, weight=self.k_proj.weight, bias=None) v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) ## rope q = self.rope(q).type_as(v) k = self.rope(k).type_as(v) if self.xattn: q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C k = k.permute(0, 2, 1, 3) v = v.permute(0, 2, 1, 3) x = xops.memory_efficient_attention(q, k, v) x = x.reshape(B, N, -1) else: q = q * self.scale attn = (q @ k.transpose(-2, -1)) attn = attn.softmax(dim=-1).type_as(x) x = (attn @ v).transpose(1, 2).reshape(B, N, -1) x = self.proj(x) x = x.view(B, H, W, C) return x class ResBottleneckBlock(CNNBlockBase): """ The standard bottleneck residual block without the last activation layer. 
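Picking up the `get_vit_lr_decay_rate` helper that closes eva01.py above: a standalone sketch of the layer-wise schedule it yields. The parameter names and the 0.7 decay rate below are illustrative, not repo defaults.

```python
# Standalone sketch of the schedule get_vit_lr_decay_rate (eva01.py above)
# produces: embeddings get the strongest decay, each later block a weaker
# one, and non-backbone parameters none at all.
def lr_decay(name, rate=0.7, num_layers=12):
    layer_id = num_layers + 1
    if "backbone" in name:
        if ".pos_embed" in name or ".patch_embed" in name:
            layer_id = 0
        elif ".blocks." in name and ".residual." not in name:
            layer_id = int(name[name.find(".blocks."):].split(".")[2]) + 1
    return rate ** (num_layers + 1 - layer_id)

for n in ["backbone.net.pos_embed",             # layer 0 -> 0.7**13
          "backbone.net.blocks.0.attn.q_proj.weight",   # layer 1
          "backbone.net.blocks.11.mlp.w3.weight",       # layer 12
          "sem_seg_head.predictor.query_embed"]:        # not backbone -> 1.0
    print(f"{n:45s} -> x{lr_decay(n):.4f}")
```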
It contains 3 conv layers with kernels 1x1, 3x3, 1x1. """ def __init__( self, in_channels, out_channels, bottleneck_channels, norm="LN", act_layer=nn.GELU, ): """ Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. bottleneck_channels (int): number of output channels for the 3x3 "bottleneck" conv layers. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. act_layer (callable): activation for all conv layers. """ super().__init__(in_channels, out_channels, 1) self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False) self.norm1 = get_norm(norm, bottleneck_channels) self.act1 = act_layer() self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False, ) self.norm2 = get_norm(norm, bottleneck_channels) self.act2 = act_layer() self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False) self.norm3 = get_norm(norm, out_channels) for layer in [self.conv1, self.conv2, self.conv3]: weight_init.c2_msra_fill(layer) for layer in [self.norm1, self.norm2]: layer.weight.data.fill_(1.0) layer.bias.data.zero_() # zero init last norm layer. self.norm3.weight.data.zero_() self.norm3.bias.data.zero_() def forward(self, x): out = x for layer in self.children(): out = layer(out) out = x + out return out class Block(nn.Module): """Transformer blocks with support of window attention and residual propagation blocks""" def __init__( self, dim, num_heads, mlp_ratio=4*2/3, qkv_bias=True, drop_path=0.0, norm_layer=partial(nn.LayerNorm, eps=1e-6), window_size=0, use_residual_block=False, rope=None, xattn=True, ): """ Args: dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. If it equals 0, then not use window attention. use_residual_block (bool): If True, use a residual block after the MLP block. input_size (int or None): Input resolution for calculating the relative positional parameter size. 
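The `SwiGLU` feed-forward defined earlier in this file gates one linear branch with another; below is a minimal sketch of the same computation with plain tensors (dropout and the optional sub-LN omitted). The `mlp_ratio=4*2/3` default used throughout keeps its parameter count close to a conventional 4x GELU MLP.

```python
# Standalone sketch (assumptions: no dropout, subln=False) of what the
# SwiGLU module earlier in this file computes: w3(SiLU(w1 x) * w2 x),
# i.e. a gated feed-forward where one linear branch gates the other.
import torch
import torch.nn.functional as F

dim, hidden = 8, 16
x = torch.randn(2, dim)
w1 = torch.nn.Linear(dim, hidden)
w2 = torch.nn.Linear(dim, hidden)
w3 = torch.nn.Linear(hidden, dim)

out = w3(F.silu(w1(x)) * w2(x))  # matches SwiGLU.forward with subln=False
print(out.shape)                 # torch.Size([2, 8])
```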
""" super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, rope=rope, xattn=xattn, ) from timm.models.layers import DropPath self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) self.mlp = SwiGLU( in_features=dim, hidden_features=int(dim * mlp_ratio), subln=True, norm_layer=norm_layer, ) self.window_size = window_size self.use_residual_block = use_residual_block if use_residual_block: # Use a residual block with bottleneck channel as dim // 2 self.residual = ResBottleneckBlock( in_channels=dim, out_channels=dim, bottleneck_channels=dim // 2, norm="LN", ) def forward(self, x): shortcut = x x = self.norm1(x) # Window partition if self.window_size > 0: H, W = x.shape[1], x.shape[2] x, pad_hw = window_partition(x, self.window_size) x = self.attn(x) # Reverse window partition if self.window_size > 0: x = window_unpartition(x, self.window_size, pad_hw, (H, W)) x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) if self.use_residual_block: x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) return x class EVA02_ViT(Backbone): """ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. "Exploring Plain Vision Transformer Backbones for Object Detection", https://arxiv.org/abs/2203.16527 """ def __init__( self, img_size=1024, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4*2/3, qkv_bias=True, drop_path_rate=0.0, norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, use_abs_pos=True, use_rel_pos=False, rope=True, pt_hw_seq_len=16, intp_freq=True, window_size=0, window_block_indexes=(), residual_block_indexes=(), use_act_checkpoint=False, pretrain_img_size=224, pretrain_use_cls_token=True, out_feature="last_feat", xattn=True, ): """ Args: img_size (int): Input image size. patch_size (int): Patch size. in_chans (int): Number of input image channels. embed_dim (int): Patch embedding dimension. depth (int): Depth of ViT. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path_rate (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. window_block_indexes (list): Indexes for blocks using window attention. residual_block_indexes (list): Indexes for blocks using conv propagation. use_act_checkpoint (bool): If True, use activation checkpointing. pretrain_img_size (int): input image size for pretraining models. pretrain_use_cls_token (bool): If True, pretrainig models use class token. out_feature (str): name of the feature from the last block. """ super().__init__() self.pretrain_use_cls_token = pretrain_use_cls_token self.patch_embed = PatchEmbed( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, embed_dim=embed_dim, ) if use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size) num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim)) else: self.pos_embed = None half_head_dim = embed_dim // num_heads // 2 hw_seq_len = img_size // patch_size self.rope_win = VisionRotaryEmbeddingFast( dim=half_head_dim, pt_seq_len=pt_hw_seq_len, ft_seq_len=window_size if intp_freq else None, ) self.rope_glb = VisionRotaryEmbeddingFast( dim=half_head_dim, pt_seq_len=pt_hw_seq_len, ft_seq_len=hw_seq_len if intp_freq else None, ) # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] self.blocks = nn.ModuleList() for i in range(depth): block = Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop_path=dpr[i], norm_layer=norm_layer, window_size=window_size if i in window_block_indexes else 0, use_residual_block=i in residual_block_indexes, rope=self.rope_win if i in window_block_indexes else self.rope_glb, xattn=xattn ) if use_act_checkpoint: # TODO: use torch.utils.checkpoint from fairscale.nn.checkpoint import checkpoint_wrapper block = checkpoint_wrapper(block) self.blocks.append(block) self._out_feature_channels = {out_feature: embed_dim} self._out_feature_strides = {out_feature: patch_size} self._out_features = [out_feature] if self.pos_embed is not None: nn.init.trunc_normal_(self.pos_embed, std=0.02) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): nn.init.trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, x): x = self.patch_embed(x) if self.pos_embed is not None: x = x + get_abs_pos( self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2]) ) for blk in self.blocks: x = blk(x) outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)} return outputs class SimpleFeaturePyramid(Backbone): """ This module implements SimpleFeaturePyramid in :paper:`vitdet`. It creates pyramid features built on top of the input feature map. """ def __init__( self, net, in_feature, out_channels, scale_factors, top_block=None, norm="LN", square_pad=0, ): """ Args: net (Backbone): module representing the subnetwork backbone. Must be a subclass of :class:`Backbone`. in_feature (str): names of the input feature maps coming from the net. out_channels (int): number of channels in the output feature maps. scale_factors (list[float]): list of scaling factors to upsample or downsample the input features for creating pyramid features. top_block (nn.Module or None): if provided, an extra operation will be performed on the output of the last (smallest resolution) pyramid output, and the result will extend the result list. The top_block further downsamples the feature map. It must have an attribute "num_levels", meaning the number of extra pyramid levels added by this block, and "in_feature", which is a string representing its input feature (e.g., p5). norm (str): the normalization to use. square_pad (int): If > 0, require input images to be padded to specific square size. 
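One detail of the `EVA02_ViT` constructor above worth spelling out: drop-path rates follow a linear stochastic-depth rule, and a block gets windowed attention (and the window RoPE) only when its index is listed in `window_block_indexes`. A sketch with illustrative values; the index pattern below is an assumption, not read from any config in this repo.

```python
# Sketch of the per-block schedule the EVA02_ViT constructor above builds:
# drop-path rates rise linearly from 0 to drop_path_rate across the depth,
# and a block attends within windows only when its index is listed.
import torch

depth, drop_path_rate, window_size = 12, 0.1, 16
window_block_indexes = (0, 1, 3, 4, 6, 7, 9, 10)  # illustrative pattern

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
for i in range(depth):
    ws = window_size if i in window_block_indexes else 0
    print(f"block {i:2d}: drop_path={dpr[i]:.3f}  window={ws or 'global'}")
```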
""" super(SimpleFeaturePyramid, self).__init__() assert isinstance(net, Backbone) self.scale_factors = scale_factors input_shapes = net.output_shape() strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors] _assert_strides_are_log2_contiguous(strides) dim = input_shapes[in_feature].channels self.stages = [] use_bias = norm == "" for idx, scale in enumerate(scale_factors): out_dim = dim if scale == 4.0: layers = [ nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2), get_norm(norm, dim // 2), nn.GELU(), nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2), ] out_dim = dim // 4 elif scale == 2.0: layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)] out_dim = dim // 2 elif scale == 1.0: layers = [] elif scale == 0.5: layers = [nn.MaxPool2d(kernel_size=2, stride=2)] else: raise NotImplementedError(f"scale_factor={scale} is not supported yet.") layers.extend( [ Conv2d( out_dim, out_channels, kernel_size=1, bias=use_bias, norm=get_norm(norm, out_channels), ), Conv2d( out_channels, out_channels, kernel_size=3, padding=1, bias=use_bias, norm=get_norm(norm, out_channels), ), ] ) layers = nn.Sequential(*layers) stage = int(math.log2(strides[idx])) self.add_module(f"simfp_{stage}", layers) self.stages.append(layers) self.net = net self.in_feature = in_feature self.top_block = top_block # Return feature names are "p", like ["p2", "p3", ..., "p6"] self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} # top block output feature maps. if self.top_block is not None: for s in range(stage, stage + self.top_block.num_levels): self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) self._out_features = list(self._out_feature_strides.keys()) self._out_feature_channels = {k: out_channels for k in self._out_features} self._size_divisibility = strides[-1] self._square_pad = square_pad @property def padding_constraints(self): return { "size_divisiblity": self._size_divisibility, "square_size": self._square_pad, } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: mapping from feature map name to pyramid feature map tensor in high to low resolution order. Returned feature names follow the FPN convention: "p", where stage has stride = 2 ** stage e.g., ["p2", "p3", ..., "p6"]. """ bottom_up_features = self.net(x) features = bottom_up_features[self.in_feature] results = [] for stage in self.stages: results.append(stage(features)) if self.top_block is not None: if self.top_block.in_feature in bottom_up_features: top_block_in_feature = bottom_up_features[self.top_block.in_feature] else: top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] results.extend(self.top_block(top_block_in_feature)) assert len(self._out_features) == len(results) return {f: res for f, res in zip(self._out_features, results)} def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12): """ Calculate lr decay rate for different ViT blocks. Args: name (string): parameter name. lr_decay_rate (float): base lr decay rate. num_layers (int): number of ViT blocks. Returns: lr decay rate for the given parameter. """ layer_id = num_layers + 1 if name.startswith("backbone"): if ".pos_embed" in name or ".patch_embed" in name: layer_id = 0 elif ".blocks." in name and ".residual." 
not in name: layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 return lr_decay_rate ** (num_layers + 1 - layer_id) ================================================ FILE: thirdparty/GLEE/glee/backbone/eva02.py ================================================ # -------------------------------------------------------- # EVA02 # -------------------------------------------------------- import logging import math from functools import partial import fvcore.nn.weight_init as weight_init import torch import torch.nn as nn import torch.nn.functional as F from detectron2.layers import CNNBlockBase, Conv2d, get_norm from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous from detectron2.modeling.backbone import Backbone from timm.models.layers import DropPath, Mlp, trunc_normal_ from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec from .eva_02_utils import ( PatchEmbed, add_decomposed_rel_pos, get_abs_pos, window_partition, window_unpartition, VisionRotaryEmbeddingFast, ) from detectron2.modeling.backbone.fpn import LastLevelMaxPool try: import xformers.ops as xops HAS_XFORMER=True except: HAS_XFORMER=False pass try: from apex.normalization import FusedLayerNorm except: pass logger = logging.getLogger(__name__) __all__ = ["EVA02_ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"] class SwiGLU(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0., norm_layer=nn.LayerNorm, subln=False ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.w1 = nn.Linear(in_features, hidden_features) self.w2 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity() self.w3 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x1 = self.w1(x) x2 = self.w2(x) hidden = self.act(x1) * x2 x = self.ffn_ln(hidden) x = self.w3(x) x = self.drop(x) return x class Attention(nn.Module): def __init__( self, dim, num_heads=8, qkv_bias=True, qk_scale=None, attn_head_dim=None, rope=None, xattn=True, ): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads if attn_head_dim is not None: head_dim = attn_head_dim all_head_dim = head_dim * self.num_heads self.scale = qk_scale or head_dim ** -0.5 self.q_proj = nn.Linear(dim, all_head_dim, bias=False) self.k_proj = nn.Linear(dim, all_head_dim, bias=False) self.v_proj = nn.Linear(dim, all_head_dim, bias=False) if qkv_bias: self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) else: self.q_bias = None self.v_bias = None self.rope = rope self.xattn = xattn self.proj = nn.Linear(all_head_dim, dim) if not HAS_XFORMER: self.xattn = False def forward(self, x): B, H, W, C = x.shape x = x.view(B, -1, C) N = H * W q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias) k = F.linear(input=x, weight=self.k_proj.weight, bias=None) v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias) q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) ## rope q = self.rope(q).type_as(v) k = self.rope(k).type_as(v) if self.xattn: q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C k = k.permute(0, 2, 1, 3) v = v.permute(0, 2, 1, 3) x = xops.memory_efficient_attention(q, 
k, v) x = x.reshape(B, N, -1) else: q = q * self.scale attn = (q @ k.transpose(-2, -1)) attn = attn.softmax(dim=-1).type_as(x) x = (attn @ v).transpose(1, 2).reshape(B, N, -1) x = self.proj(x) x = x.view(B, H, W, C) return x class ResBottleneckBlock(CNNBlockBase): """ The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels 1x1, 3x3, 1x1. """ def __init__( self, in_channels, out_channels, bottleneck_channels, norm="LN", act_layer=nn.GELU, ): """ Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. bottleneck_channels (int): number of output channels for the 3x3 "bottleneck" conv layers. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. act_layer (callable): activation for all conv layers. """ super().__init__(in_channels, out_channels, 1) self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False) self.norm1 = get_norm(norm, bottleneck_channels) self.act1 = act_layer() self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False, ) self.norm2 = get_norm(norm, bottleneck_channels) self.act2 = act_layer() self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False) self.norm3 = get_norm(norm, out_channels) for layer in [self.conv1, self.conv2, self.conv3]: weight_init.c2_msra_fill(layer) for layer in [self.norm1, self.norm2]: layer.weight.data.fill_(1.0) layer.bias.data.zero_() # zero init last norm layer. self.norm3.weight.data.zero_() self.norm3.bias.data.zero_() def forward(self, x): out = x for layer in self.children(): out = layer(out) out = x + out return out class Block(nn.Module): """Transformer blocks with support of window attention and residual propagation blocks""" def __init__( self, dim, num_heads, mlp_ratio=4*2/3, qkv_bias=True, drop_path=0.0, norm_layer=partial(nn.LayerNorm, eps=1e-6), window_size=0, use_residual_block=False, rope=None, xattn=True, ): """ Args: dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. If it equals 0, then not use window attention. use_residual_block (bool): If True, use a residual block after the MLP block. input_size (int or None): Input resolution for calculating the relative positional parameter size. 
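A side note on `ResBottleneckBlock` above: zero-initializing the final norm makes the residual branch vanish at initialization, so the whole block starts out as the identity map. A standalone toy demonstrating the effect (BatchNorm stands in here for the repo's LN):

```python
# Illustrative sketch of the zero-init trick used by ResBottleneckBlock:
# with the last norm's weight and bias at zero, the residual branch
# contributes nothing, and the block is the identity at init.
import torch
import torch.nn as nn

class ToyResBlock(nn.Module):
    def __init__(self, c):
        super().__init__()
        self.conv = nn.Conv2d(c, c, 3, padding=1, bias=False)
        self.norm = nn.BatchNorm2d(c)
        nn.init.zeros_(self.norm.weight)   # same trick: zero the last norm
        nn.init.zeros_(self.norm.bias)

    def forward(self, x):
        return x + self.norm(self.conv(x))

blk = ToyResBlock(4).eval()
x = torch.randn(1, 4, 8, 8)
assert torch.allclose(blk(x), x)           # identity at initialization
```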
""" super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, rope=rope, xattn=xattn, ) from timm.models.layers import DropPath self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) self.mlp = SwiGLU( in_features=dim, hidden_features=int(dim * mlp_ratio), subln=True, norm_layer=norm_layer, ) self.window_size = window_size self.use_residual_block = use_residual_block if use_residual_block: # Use a residual block with bottleneck channel as dim // 2 self.residual = ResBottleneckBlock( in_channels=dim, out_channels=dim, bottleneck_channels=dim // 2, norm="LN", ) def forward(self, x): shortcut = x x = self.norm1(x) # Window partition if self.window_size > 0: H, W = x.shape[1], x.shape[2] x, pad_hw = window_partition(x, self.window_size) x = self.attn(x) # Reverse window partition if self.window_size > 0: x = window_unpartition(x, self.window_size, pad_hw, (H, W)) x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) if self.use_residual_block: x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) return x class EVA02_ViT(Backbone): """ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. "Exploring Plain Vision Transformer Backbones for Object Detection", https://arxiv.org/abs/2203.16527 """ def __init__( self, img_size=1024, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4*2/3, qkv_bias=True, drop_path_rate=0.0, norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, use_abs_pos=True, use_rel_pos=False, rope=True, pt_hw_seq_len=16, intp_freq=True, window_size=0, window_block_indexes=(), residual_block_indexes=(), use_act_checkpoint=False, pretrain_img_size=224, pretrain_use_cls_token=True, out_feature="last_feat", xattn=True, ): """ Args: img_size (int): Input image size. patch_size (int): Patch size. in_chans (int): Number of input image channels. embed_dim (int): Patch embedding dimension. depth (int): Depth of ViT. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path_rate (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. window_block_indexes (list): Indexes for blocks using window attention. residual_block_indexes (list): Indexes for blocks using conv propagation. use_act_checkpoint (bool): If True, use activation checkpointing. pretrain_img_size (int): input image size for pretraining models. pretrain_use_cls_token (bool): If True, pretrainig models use class token. out_feature (str): name of the feature from the last block. """ super().__init__() self.pretrain_use_cls_token = pretrain_use_cls_token self.patch_embed = PatchEmbed( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, embed_dim=embed_dim, ) if use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size) num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim)) else: self.pos_embed = None half_head_dim = embed_dim // num_heads // 2 hw_seq_len = img_size // patch_size self.rope_win = VisionRotaryEmbeddingFast( dim=half_head_dim, pt_seq_len=pt_hw_seq_len, ft_seq_len=window_size if intp_freq else None, ) self.rope_glb = VisionRotaryEmbeddingFast( dim=half_head_dim, pt_seq_len=pt_hw_seq_len, ft_seq_len=hw_seq_len if intp_freq else None, ) # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] self.blocks = nn.ModuleList() for i in range(depth): block = Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop_path=dpr[i], norm_layer=norm_layer, window_size=window_size if i in window_block_indexes else 0, use_residual_block=i in residual_block_indexes, rope=self.rope_win if i in window_block_indexes else self.rope_glb, xattn=xattn ) if use_act_checkpoint: # TODO: use torch.utils.checkpoint from fairscale.nn.checkpoint import checkpoint_wrapper block = checkpoint_wrapper(block) self.blocks.append(block) self._out_feature_channels = {out_feature: embed_dim} self._out_feature_strides = {out_feature: patch_size} self._out_features = [out_feature] if self.pos_embed is not None: nn.init.trunc_normal_(self.pos_embed, std=0.02) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): nn.init.trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, x): x = self.patch_embed(x) if self.pos_embed is not None: x = x + get_abs_pos( self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2]) ) for blk in self.blocks: x = blk(x) outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)} return outputs class SimpleFeaturePyramid(Backbone): """ This module implements SimpleFeaturePyramid in :paper:`vitdet`. It creates pyramid features built on top of the input feature map. """ def __init__( self, net, in_feature, out_channels, scale_factors, top_block=None, norm="LN", square_pad=0, ): """ Args: net (Backbone): module representing the subnetwork backbone. Must be a subclass of :class:`Backbone`. in_feature (str): names of the input feature maps coming from the net. out_channels (int): number of channels in the output feature maps. scale_factors (list[float]): list of scaling factors to upsample or downsample the input features for creating pyramid features. top_block (nn.Module or None): if provided, an extra operation will be performed on the output of the last (smallest resolution) pyramid output, and the result will extend the result list. The top_block further downsamples the feature map. It must have an attribute "num_levels", meaning the number of extra pyramid levels added by this block, and "in_feature", which is a string representing its input feature (e.g., p5). norm (str): the normalization to use. square_pad (int): If > 0, require input images to be padded to specific square size. 
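A hedged reading of the `square_pad` argument just described, together with the `padding_constraints` property below: inputs must be divisible by the coarsest stride, and a positive square size additionally forces a fixed square input. A small sketch of that arithmetic; the square-padding semantics here are my interpretation, not confirmed by this repo.

```python
# Sketch (standalone, interpretation is an assumption) of what the padding
# constraints imply for input sizes: round H and W up to a multiple of the
# coarsest stride, unless a fixed square size overrides both.
def pad_to_constraints(h, w, size_divisibility, square_size=0):
    if square_size:                      # pad to a fixed square (e.g. 1536)
        return square_size, square_size
    up = lambda v: (v + size_divisibility - 1) // size_divisibility * size_divisibility
    return up(h), up(w)

print(pad_to_constraints(700, 900, 32))        # -> (704, 928)
print(pad_to_constraints(700, 900, 32, 1536))  # -> (1536, 1536)
```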
""" super(SimpleFeaturePyramid, self).__init__() assert isinstance(net, Backbone) self.scale_factors = scale_factors input_shapes = net.output_shape() strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors] _assert_strides_are_log2_contiguous(strides) dim = input_shapes[in_feature].channels self.stages = [] use_bias = norm == "" for idx, scale in enumerate(scale_factors): out_dim = dim if scale == 4.0: layers = [ nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2), get_norm(norm, dim // 2), nn.GELU(), nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2), ] out_dim = dim // 4 elif scale == 2.0: layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)] out_dim = dim // 2 elif scale == 1.0: layers = [] elif scale == 0.5: layers = [nn.MaxPool2d(kernel_size=2, stride=2)] else: raise NotImplementedError(f"scale_factor={scale} is not supported yet.") layers.extend( [ Conv2d( out_dim, out_channels, kernel_size=1, bias=use_bias, norm=get_norm(norm, out_channels), ), Conv2d( out_channels, out_channels, kernel_size=3, padding=1, bias=use_bias, norm=get_norm(norm, out_channels), ), ] ) layers = nn.Sequential(*layers) stage = int(math.log2(strides[idx])) self.add_module(f"simfp_{stage}", layers) self.stages.append(layers) self.net = net self.in_feature = in_feature self.top_block = top_block # Return feature names are "p", like ["p2", "p3", ..., "p6"] self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} # top block output feature maps. if self.top_block is not None: for s in range(stage, stage + self.top_block.num_levels): self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) self._out_features = list(self._out_feature_strides.keys()) self._out_feature_channels = {k: out_channels for k in self._out_features} self._size_divisibility = strides[-1] self._square_pad = square_pad @property def padding_constraints(self): return { "size_divisiblity": self._size_divisibility, "square_size": self._square_pad, } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: mapping from feature map name to pyramid feature map tensor in high to low resolution order. Returned feature names follow the FPN convention: "p", where stage has stride = 2 ** stage e.g., ["p2", "p3", ..., "p6"]. 
""" bottom_up_features = self.net(x) features = bottom_up_features[self.in_feature] results = [] for stage in self.stages: results.append(stage(features)) if self.top_block is not None: if self.top_block.in_feature in bottom_up_features: top_block_in_feature = bottom_up_features[self.top_block.in_feature] else: top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] results.extend(self.top_block(top_block_in_feature)) assert len(self._out_features) == len(results) return {f: res for f, res in zip(self._out_features, results)} @BACKBONE_REGISTRY.register() class D2_EVA02(SimpleFeaturePyramid): def __init__(self, cfg, input_shape): super().__init__( net = EVA02_ViT( img_size= cfg.MODEL.EVA02.IMAGE_SIZE, patch_size=cfg.MODEL.EVA02.PATCH_SIZE, window_size= cfg.MODEL.EVA02.WINDOW_SIZE, embed_dim= cfg.MODEL.EVA02.DMBED_DIM, depth= cfg.MODEL.EVA02.DEPTH, num_heads= cfg.MODEL.EVA02.NUM_HEADS , drop_path_rate= cfg.MODEL.EVA02.DROP_PATH_RATE, mlp_ratio= cfg.MODEL.EVA02.MLP_RATIO, # qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), window_block_indexes= cfg.MODEL.EVA02.WINDOW_BLOCK_INDEXES, # residual_block_indexes=[], # use_rel_pos=False, use_act_checkpoint = cfg.MODEL.EVA02.CHECKPOINT, out_feature="last_feat", # intp_freq=True, ), in_feature = "last_feat", out_channels=256, scale_factors=(2.0, 1.0, 0.5), # (4.0, 2.0, 1.0, 0.5) in ViTDet top_block=LastLevelMaxPool(), norm="LN", square_pad=cfg.MODEL.EVA02.IMAGE_SIZE, ) pretrained_weight = cfg.MODEL.EVA02.PRETRAINED_WEIGHT if pretrained_weight: checkpoint = torch.load(pretrained_weight, map_location='cpu') print(f'\nload pretrain weight from {pretrained_weight} \n') self.load_state_dict(checkpoint['model'], strict=False) def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } @property def size_divisibility(self): return 32 ================================================ FILE: thirdparty/GLEE/glee/backbone/eva_01_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved import math import numpy as np from scipy import interpolate import torch import torch.nn as nn import torch.nn.functional as F __all__ = [ "window_partition", "window_unpartition", "add_decomposed_rel_pos", "get_abs_pos", "PatchEmbed", ] def window_partition(x, window_size): """ Partition into non-overlapping windows with padding if needed. Args: x (tensor): input tokens with [B, H, W, C]. window_size (int): window size. Returns: windows: windows after partition with [B * num_windows, window_size, window_size, C]. (Hp, Wp): padded height and width before partition """ B, H, W, C = x.shape pad_h = (window_size - H % window_size) % window_size pad_w = (window_size - W % window_size) % window_size if pad_h > 0 or pad_w > 0: x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) Hp, Wp = H + pad_h, W + pad_w x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows, (Hp, Wp) def window_unpartition(windows, window_size, pad_hw, hw): """ Window unpartition into original sequences and removing padding. Args: x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. window_size (int): window size. pad_hw (Tuple): padded height and width (Hp, Wp). hw (Tuple): original height and width (H, W) before padding. 
Returns: x: unpartitioned sequences with [B, H, W, C]. """ Hp, Wp = pad_hw H, W = hw B = windows.shape[0] // (Hp * Wp // window_size // window_size) x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) if Hp > H or Wp > W: x = x[:, :H, :W, :].contiguous() return x def get_rel_pos(q_size, k_size, rel_pos, interp_type): """ Get relative positional embeddings according to the relative positions of query and key sizes. Args: q_size (int): size of query q. k_size (int): size of key k. rel_pos (Tensor): relative position embeddings (L, C). Returns: Extracted positional embeddings according to relative positions. """ max_rel_dist = int(2 * max(q_size, k_size) - 1) # Interpolate rel pos if needed. if rel_pos.shape[0] != max_rel_dist: if interp_type == "vitdet": # the vitdet impl: # https://github.com/facebookresearch/detectron2/blob/96c752ce821a3340e27edd51c28a00665dd32a30/detectron2/modeling/backbone/utils.py#L77. rel_pos_resized = F.interpolate( rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), size=max_rel_dist, mode="linear", ) rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) elif interp_type == "beit": # steal from beit https://github.com/microsoft/unilm/tree/master/beit # modified by Yuxin Fang src_size = rel_pos.shape[0] dst_size = max_rel_dist q = 1.0903078 dis = [] cur = 1 for i in range(src_size // 2): dis.append(cur) cur += q ** (i + 1) r_ids = [-_ for _ in reversed(dis)] x = r_ids + [0] + dis t = dst_size // 2.0 dx = np.arange(-t, t + 0.1, 1.0) all_rel_pos_bias = [] for i in range(rel_pos.shape[1]): # a hack from https://github.com/baaivision/EVA/issues/8, # could also be used in fine-tuning but the performance haven't been tested. z = rel_pos[:, i].view(src_size).cpu().float().detach().numpy() f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate") all_rel_pos_bias.append( torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device)) rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1) else: raise NotImplementedError() else: rel_pos_resized = rel_pos # Scale the coords with short length if shapes for q and k are different. q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) return rel_pos_resized[relative_coords.long()] def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size, interp_type): """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 Args: attn (Tensor): attention map. q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. q_size (Tuple): spatial sequence size of query q with (q_h, q_w). k_size (Tuple): spatial sequence size of key k with (k_h, k_w). Returns: attn (Tensor): attention map with added relative positional embeddings. 
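Before the function body below, a minimal shape sketch of the decomposition: a height term and a width term are broadcast into the (q_h*q_w, k_h*k_w) logits, instead of materializing a full 2-D relative table. Sizes here are illustrative.

```python
# Minimal sketch (illustrative shapes) of decomposed relative positions:
# one per-row term and one per-column term are added to attention logits.
import torch

B, q_h, q_w, k_h, k_w, dim = 1, 4, 4, 4, 4, 8
attn = torch.zeros(B, q_h * q_w, k_h * k_w)
r_q = torch.randn(B, q_h, q_w, dim)
Rh = torch.randn(q_h, k_h, dim)            # (query-row, key-row) embeddings
Rw = torch.randn(q_w, k_w, dim)            # (query-col, key-col) embeddings

rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)   # (B, q_h, q_w, k_h)
rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)   # (B, q_h, q_w, k_w)
attn = (attn.view(B, q_h, q_w, k_h, k_w)
        + rel_h[:, :, :, :, None]                 # broadcast over key cols
        + rel_w[:, :, :, None, :]                 # broadcast over key rows
        ).view(B, q_h * q_w, k_h * k_w)
print(attn.shape)  # torch.Size([1, 16, 16])
```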
""" q_h, q_w = q_size k_h, k_w = k_size Rh = get_rel_pos(q_h, k_h, rel_pos_h, interp_type) Rw = get_rel_pos(q_w, k_w, rel_pos_w, interp_type) B, _, dim = q.shape r_q = q.reshape(B, q_h, q_w, dim) rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) attn = ( attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] ).view(B, q_h * q_w, k_h * k_w) return attn def get_abs_pos(abs_pos, has_cls_token, hw): """ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the original embeddings. Args: abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. hw (Tuple): size of input image tokens. Returns: Absolute positional embeddings after processing with shape (1, H, W, C) """ h, w = hw if has_cls_token: abs_pos = abs_pos[:, 1:] xy_num = abs_pos.shape[1] size = int(math.sqrt(xy_num)) assert size * size == xy_num if size != h or size != w: new_abs_pos = F.interpolate( abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2), size=(h, w), mode="bicubic", align_corners=False, ) return new_abs_pos.permute(0, 2, 3, 1) else: return abs_pos.reshape(1, h, w, -1) class PatchEmbed(nn.Module): """ Image to Patch Embedding. """ def __init__( self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768 ): """ Args: kernel_size (Tuple): kernel size of the projection layer. stride (Tuple): stride of the projection layer. padding (Tuple): padding size of the projection layer. in_chans (int): Number of input image channels. embed_dim (int): embed_dim (int): Patch embedding dimension. """ super().__init__() self.proj = nn.Conv2d( in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding ) def forward(self, x): x = self.proj(x) # B C H W -> B H W C x = x.permute(0, 2, 3, 1) return x ================================================ FILE: thirdparty/GLEE/glee/backbone/eva_02_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved import math import numpy as np from scipy import interpolate import torch import torch.nn as nn import torch.nn.functional as F __all__ = [ "window_partition", "window_unpartition", "add_decomposed_rel_pos", "get_abs_pos", "PatchEmbed", "VisionRotaryEmbeddingFast", ] def window_partition(x, window_size): """ Partition into non-overlapping windows with padding if needed. Args: x (tensor): input tokens with [B, H, W, C]. window_size (int): window size. Returns: windows: windows after partition with [B * num_windows, window_size, window_size, C]. (Hp, Wp): padded height and width before partition """ B, H, W, C = x.shape pad_h = (window_size - H % window_size) % window_size pad_w = (window_size - W % window_size) % window_size if pad_h > 0 or pad_w > 0: x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) Hp, Wp = H + pad_h, W + pad_w x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows, (Hp, Wp) def window_unpartition(windows, window_size, pad_hw, hw): """ Window unpartition into original sequences and removing padding. Args: x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. window_size (int): window size. pad_hw (Tuple): padded height and width (Hp, Wp). hw (Tuple): original height and width (H, W) before padding. 
Returns: x: unpartitioned sequences with [B, H, W, C]. """ Hp, Wp = pad_hw H, W = hw B = windows.shape[0] // (Hp * Wp // window_size // window_size) x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) if Hp > H or Wp > W: x = x[:, :H, :W, :].contiguous() return x def get_rel_pos(q_size, k_size, rel_pos): """ Get relative positional embeddings according to the relative positions of query and key sizes. Args: q_size (int): size of query q. k_size (int): size of key k. rel_pos (Tensor): relative position embeddings (L, C). Returns: Extracted positional embeddings according to relative positions. """ max_rel_dist = int(2 * max(q_size, k_size) - 1) use_log_interpolation = True # Interpolate rel pos if needed. if rel_pos.shape[0] != max_rel_dist: if not use_log_interpolation: # Interpolate rel pos. rel_pos_resized = F.interpolate( rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), size=max_rel_dist, mode="linear", ) rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) else: src_size = rel_pos.shape[0] dst_size = max_rel_dist # q = 1.13492 q = 1.0903078 dis = [] cur = 1 for i in range(src_size // 2): dis.append(cur) cur += q ** (i + 1) r_ids = [-_ for _ in reversed(dis)] x = r_ids + [0] + dis t = dst_size // 2.0 dx = np.arange(-t, t + 0.1, 1.0) # print("x = %s" % str(x)) # print("dx = %s" % str(dx)) all_rel_pos_bias = [] for i in range(rel_pos.shape[1]): z = rel_pos[:, i].view(src_size).cpu().float().numpy() f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate") all_rel_pos_bias.append( torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device)) rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1) else: rel_pos_resized = rel_pos # Scale the coords with short length if shapes for q and k are different. q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) return rel_pos_resized[relative_coords.long()] def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size): """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 Args: attn (Tensor): attention map. q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. q_size (Tuple): spatial sequence size of query q with (q_h, q_w). k_size (Tuple): spatial sequence size of key k with (k_h, k_w). Returns: attn (Tensor): attention map with added relative positional embeddings. """ q_h, q_w = q_size k_h, k_w = k_size Rh = get_rel_pos(q_h, k_h, rel_pos_h) Rw = get_rel_pos(q_w, k_w, rel_pos_w) B, _, dim = q.shape r_q = q.reshape(B, q_h, q_w, dim) rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) attn = ( attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] ).view(B, q_h * q_w, k_h * k_w) return attn def get_abs_pos(abs_pos, has_cls_token, hw): """ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the original embeddings. Args: abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). 
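The BEiT-style branch of `get_rel_pos` above resamples relative-position tables on a geometrically spaced source grid: source coordinates fan out from the center with ratio q, so the outermost ones reach past the uniform target range and the cubic interpolation rarely has to extrapolate. A numeric sketch with illustrative sizes (a 27-entry table, i.e. 2*14-1, stretched to 43, i.e. 2*22-1):

```python
# Sketch of the geometric source grid used by the BEiT-style interpolation.
import numpy as np

src_size, dst_size, q = 27, 43, 1.0903078
dis, cur = [], 1.0
for i in range(src_size // 2):
    dis.append(cur)
    cur += q ** (i + 1)
x = [-d for d in reversed(dis)] + [0] + dis        # 27 source coordinates
dx = np.arange(-(dst_size // 2.0), dst_size // 2.0 + 0.1, 1.0)

print(len(x), round(x[-1], 1))   # 27 coords, geometric span reaching ~±23
print(len(dx), dx[-1])           # 43 uniform targets spanning ±21
```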
has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. hw (Tuple): size of input image tokens. Returns: Absolute positional embeddings after processing with shape (1, H, W, C) """ h, w = hw if has_cls_token: abs_pos = abs_pos[:, 1:] xy_num = abs_pos.shape[1] size = int(math.sqrt(xy_num)) assert size * size == xy_num if size != h or size != w: new_abs_pos = F.interpolate( abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2), size=(h, w), mode="bicubic", align_corners=False, ) return new_abs_pos.permute(0, 2, 3, 1) else: return abs_pos.reshape(1, h, w, -1) class PatchEmbed(nn.Module): """ Image to Patch Embedding. """ def __init__( self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768 ): """ Args: kernel_size (Tuple): kernel size of the projection layer. stride (Tuple): stride of the projection layer. padding (Tuple): padding size of the projection layer. in_chans (int): Number of input image channels. embed_dim (int): embed_dim (int): Patch embedding dimension. """ super().__init__() self.proj = nn.Conv2d( in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding ) def forward(self, x): x = self.proj(x) # B C H W -> B H W C x = x.permute(0, 2, 3, 1) return x from math import pi import torch from torch import nn from einops import rearrange, repeat def broadcat(tensors, dim = -1): num_tensors = len(tensors) shape_lens = set(list(map(lambda t: len(t.shape), tensors))) assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions' shape_len = list(shape_lens)[0] dim = (dim + shape_len) if dim < 0 else dim dims = list(zip(*map(lambda t: list(t.shape), tensors))) expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatentation' max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) expanded_dims.insert(dim, (dim, dims[dim])) expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes))) return torch.cat(tensors, dim = dim) def rotate_half(x): x = rearrange(x, '... (d r) -> ... d r', r = 2) x1, x2 = x.unbind(dim = -1) x = torch.stack((-x2, x1), dim = -1) return rearrange(x, '... d r -> ... (d r)') class VisionRotaryEmbedding(nn.Module): def __init__( self, dim, pt_seq_len, ft_seq_len=None, custom_freqs = None, freqs_for = 'lang', theta = 10000, max_freq = 10, num_freqs = 1, ): super().__init__() if custom_freqs: freqs = custom_freqs elif freqs_for == 'lang': freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) elif freqs_for == 'pixel': freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi elif freqs_for == 'constant': freqs = torch.ones(num_freqs).float() else: raise ValueError(f'unknown modality {freqs_for}') if ft_seq_len is None: ft_seq_len = pt_seq_len t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len freqs_h = torch.einsum('..., f -> ... f', t, freqs) freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2) freqs_w = torch.einsum('..., f -> ... f', t, freqs) freqs_w = repeat(freqs_w, '... n -> ... 
(n r)', r = 2) freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim = -1) self.register_buffer("freqs_cos", freqs.cos()) self.register_buffer("freqs_sin", freqs.sin()) print('======== shape of rope freq', self.freqs_cos.shape, '========') def forward(self, t, start_index = 0): rot_dim = self.freqs_cos.shape[-1] end_index = start_index + rot_dim assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}' t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:] t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin) return torch.cat((t_left, t, t_right), dim = -1) class VisionRotaryEmbeddingFast(nn.Module): def __init__( self, dim, pt_seq_len=16, ft_seq_len=None, custom_freqs = None, freqs_for = 'lang', theta = 10000, max_freq = 10, num_freqs = 1, ): super().__init__() if custom_freqs: freqs = custom_freqs elif freqs_for == 'lang': freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim)) elif freqs_for == 'pixel': freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi elif freqs_for == 'constant': freqs = torch.ones(num_freqs).float() else: raise ValueError(f'unknown modality {freqs_for}') if ft_seq_len is None: ft_seq_len = pt_seq_len t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len freqs = torch.einsum('..., f -> ... f', t, freqs) freqs = repeat(freqs, '... n -> ... (n r)', r = 2) freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1) freqs_cos = freqs.cos().view(-1, freqs.shape[-1]) freqs_sin = freqs.sin().view(-1, freqs.shape[-1]) self.register_buffer("freqs_cos", freqs_cos) self.register_buffer("freqs_sin", freqs_sin) print('======== shape of rope freq', self.freqs_cos.shape, '========') # def forward(self, t): return t * self.freqs_cos + rotate_half(t) * self.freqs_sin def forward(self, t): if t.shape[2] != self.freqs_cos.shape[0]: t_len = t.shape[2] output = t * self.freqs_cos[:t_len] + rotate_half(t) * self.freqs_sin[:t_len] else: output = t * self.freqs_cos + rotate_half(t) * self.freqs_sin return output ================================================ FILE: thirdparty/GLEE/glee/backbone/internimage.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint from timm.models.layers import trunc_normal_, DropPath from detectron2.utils.logger import setup_logger from detectron2.modeling.backbone import Backbone from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec from .ops_dcnv3 import modules as opsm class to_channels_first(nn.Module): def __init__(self): super().__init__() def forward(self, x): return x.permute(0, 3, 1, 2) class to_channels_last(nn.Module): def __init__(self): super().__init__() def forward(self, x): return x.permute(0, 2, 3, 1) def build_norm_layer(dim, norm_layer, in_format='channels_last', out_format='channels_last', eps=1e-6): layers = [] if norm_layer == 'BN': if in_format == 'channels_last': layers.append(to_channels_first()) layers.append(nn.BatchNorm2d(dim)) if out_format == 'channels_last': layers.append(to_channels_last()) elif norm_layer == 'LN': if in_format == 'channels_first': layers.append(to_channels_last()) layers.append(nn.LayerNorm(dim, eps=eps)) if 
out_format == 'channels_first': layers.append(to_channels_first()) else: raise NotImplementedError( f'build_norm_layer does not support {norm_layer}') return nn.Sequential(*layers) def build_act_layer(act_layer): if act_layer == 'ReLU': return nn.ReLU(inplace=True) elif act_layer == 'SiLU': return nn.SiLU(inplace=True) elif act_layer == 'GELU': return nn.GELU() raise NotImplementedError(f'build_act_layer does not support {act_layer}') class CrossAttention(nn.Module): r""" Cross Attention Module Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. Default: 8 qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. Default: False. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop (float, optional): Dropout ratio of output. Default: 0.0 attn_head_dim (int, optional): Dimension of attention head. out_dim (int, optional): Dimension of output. """ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., attn_head_dim=None, out_dim=None): super().__init__() if out_dim is None: out_dim = dim self.num_heads = num_heads head_dim = dim // num_heads if attn_head_dim is not None: head_dim = attn_head_dim all_head_dim = head_dim * self.num_heads self.scale = qk_scale or head_dim ** -0.5 assert all_head_dim == dim self.q = nn.Linear(dim, all_head_dim, bias=False) self.k = nn.Linear(dim, all_head_dim, bias=False) self.v = nn.Linear(dim, all_head_dim, bias=False) if qkv_bias: self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) self.k_bias = nn.Parameter(torch.zeros(all_head_dim)) self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) else: self.q_bias = None self.k_bias = None self.v_bias = None self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(all_head_dim, out_dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x, k=None, v=None): B, N, C = x.shape N_k = k.shape[1] N_v = v.shape[1] q_bias, k_bias, v_bias = None, None, None if self.q_bias is not None: q_bias = self.q_bias k_bias = self.k_bias v_bias = self.v_bias q = F.linear(input=x, weight=self.q.weight, bias=q_bias) q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) # (B, N_head, N_q, dim) k = F.linear(input=k, weight=self.k.weight, bias=k_bias) k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) v = F.linear(input=v, weight=self.v.weight, bias=v_bias) v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) q = q * self.scale attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k) attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, -1) x = self.proj(x) x = self.proj_drop(x) return x class AttentiveBlock(nn.Module): r"""Attentive Block Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. Default: 8 qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. Default: False. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. drop (float, optional): Dropout rate. Default: 0.0. attn_drop (float, optional): Attention dropout rate. Default: 0.0. drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0. norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm. attn_head_dim (int, optional): Dimension of attention head. Default: None. 
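To make the intent of `CrossAttention` above concrete: `AttentionPoolingBlock` later in this file feeds the mean token as the sole query, so attention collapses N tokens into one summary vector. A standalone sketch with identity projections standing in for the learned q/k/v layers (an illustrative simplification):

```python
# Sketch of mean-query attention pooling: one pooled query attends over
# all N tokens, reducing (B, N, C) to (B, 1, C).
import torch

B, N, C, heads = 2, 196, 64, 8
x = torch.randn(B, N, C)
q = x.mean(1, keepdim=True)                       # (B, 1, C) pooled query

def split(t):                                     # (B, n, C) -> (B, h, n, d)
    return t.view(B, -1, heads, C // heads).transpose(1, 2)

attn = (split(q) * (C // heads) ** -0.5) @ split(x).transpose(-2, -1)
out = (attn.softmax(-1) @ split(x)).transpose(1, 2).reshape(B, 1, C)
print(out.shape)                                  # torch.Size([2, 1, 64])
```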
out_dim (int, optional): Dimension of output. Default: None. """ def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., norm_layer="LN", attn_head_dim=None, out_dim=None): super().__init__() self.norm1_q = build_norm_layer(dim, norm_layer, eps=1e-6) self.norm1_k = build_norm_layer(dim, norm_layer, eps=1e-6) self.norm1_v = build_norm_layer(dim, norm_layer, eps=1e-6) self.cross_dcn = CrossAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim, out_dim=out_dim) self.drop_path = DropPath( drop_path) if drop_path > 0. else nn.Identity() def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None): x_q = self.norm1_q(x_q + pos_q) x_k = self.norm1_k(x_kv + pos_k) x_v = self.norm1_v(x_kv) x = self.cross_dcn(x_q, k=x_k, v=x_v) return x class AttentionPoolingBlock(AttentiveBlock): def forward(self, x): x_q = x.mean(1, keepdim=True) x_kv = x pos_q, pos_k = 0, 0 x = super().forward(x_q, x_kv, pos_q, pos_k, bool_masked_pos=None, rel_pos_bias=None) x = x.squeeze(1) return x class StemLayer(nn.Module): r""" Stem layer of InternImage Args: in_chans (int): number of input channels out_chans (int): number of output channels act_layer (str): activation layer norm_layer (str): normalization layer """ def __init__(self, in_chans=3, out_chans=96, act_layer='GELU', norm_layer='BN'): super().__init__() self.conv1 = nn.Conv2d(in_chans, out_chans // 2, kernel_size=3, stride=2, padding=1) self.norm1 = build_norm_layer(out_chans // 2, norm_layer, 'channels_first', 'channels_first') self.act = build_act_layer(act_layer) self.conv2 = nn.Conv2d(out_chans // 2, out_chans, kernel_size=3, stride=2, padding=1) self.norm2 = build_norm_layer(out_chans, norm_layer, 'channels_first', 'channels_last') def forward(self, x): x = self.conv1(x) x = self.norm1(x) x = self.act(x) x = self.conv2(x) x = self.norm2(x) return x class DownsampleLayer(nn.Module): r""" Downsample layer of InternImage Args: channels (int): number of input channels norm_layer (str): normalization layer """ def __init__(self, channels, norm_layer='LN'): super().__init__() self.conv = nn.Conv2d(channels, 2 * channels, kernel_size=3, stride=2, padding=1, bias=False) self.norm = build_norm_layer(2 * channels, norm_layer, 'channels_first', 'channels_last') def forward(self, x): x = self.conv(x.permute(0, 3, 1, 2)) x = self.norm(x) return x class MLPLayer(nn.Module): r""" MLP layer of InternImage Args: in_features (int): number of input features hidden_features (int): number of hidden features out_features (int): number of output features act_layer (str): activation layer drop (float): dropout rate """ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer='GELU', drop=0.): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = build_act_layer(act_layer) self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x class InternImageLayer(nn.Module): r""" Basic layer of InternImage Args: core_op (nn.Module): core operation of InternImage channels (int): number of input channels groups (list): Groups of each block. 
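For the `StemLayer` defined above, the spatial bookkeeping is simple: two stride-2 3x3 convolutions quarter the resolution while widening channels in two steps (3 -> out/2 -> out). A sketch with the norm layers omitted for brevity:

```python
# Sketch of StemLayer's downsampling: each stride-2, padding-1, 3x3 conv
# halves H and W, so the stem maps (H, W) to (H/4, W/4).
import torch
import torch.nn as nn

stem = nn.Sequential(
    nn.Conv2d(3, 48, kernel_size=3, stride=2, padding=1),   # H/2, W/2
    nn.GELU(),
    nn.Conv2d(48, 96, kernel_size=3, stride=2, padding=1),  # H/4, W/4
)
print(stem(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 96, 56, 56])
```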
mlp_ratio (float): ratio of mlp hidden features to input channels drop (float): dropout rate drop_path (float): drop path rate act_layer (str): activation layer norm_layer (str): normalization layer post_norm (bool): whether to use post normalization layer_scale (float): layer scale offset_scale (float): offset scale with_cp (bool): whether to use checkpoint """ def __init__(self, core_op, channels, groups, mlp_ratio=4., drop=0., drop_path=0., act_layer='GELU', norm_layer='LN', post_norm=False, layer_scale=None, offset_scale=1.0, with_cp=False, dw_kernel_size=None, # for InternImage-H/G res_post_norm=False, # for InternImage-H/G center_feature_scale=False): # for InternImage-H/G super().__init__() self.channels = channels self.groups = groups self.mlp_ratio = mlp_ratio self.with_cp = with_cp self.norm1 = build_norm_layer(channels, 'LN') self.post_norm = post_norm self.dcn = core_op( channels=channels, kernel_size=3, stride=1, pad=1, dilation=1, group=groups, offset_scale=offset_scale, act_layer=act_layer, norm_layer=norm_layer, dw_kernel_size=dw_kernel_size, # for InternImage-H/G center_feature_scale=center_feature_scale) # for InternImage-H/G self.drop_path = DropPath(drop_path) if drop_path > 0. \ else nn.Identity() self.norm2 = build_norm_layer(channels, 'LN') self.mlp = MLPLayer(in_features=channels, hidden_features=int(channels * mlp_ratio), act_layer=act_layer, drop=drop) self.layer_scale = layer_scale is not None if self.layer_scale: self.gamma1 = nn.Parameter(layer_scale * torch.ones(channels), requires_grad=True) self.gamma2 = nn.Parameter(layer_scale * torch.ones(channels), requires_grad=True) self.res_post_norm = res_post_norm if res_post_norm: self.res_post_norm1 = build_norm_layer(channels, 'LN') self.res_post_norm2 = build_norm_layer(channels, 'LN') def forward(self, x): def _inner_forward(x): if not self.layer_scale: if self.post_norm: x = x + self.drop_path(self.norm1(self.dcn(x))) x = x + self.drop_path(self.norm2(self.mlp(x))) elif self.res_post_norm: # for InternImage-H/G x = x + self.drop_path(self.res_post_norm1(self.dcn(self.norm1(x)))) x = x + self.drop_path(self.res_post_norm2(self.mlp(self.norm2(x)))) else: x = x + self.drop_path(self.dcn(self.norm1(x))) x = x + self.drop_path(self.mlp(self.norm2(x))) return x if self.post_norm: x = x + self.drop_path(self.gamma1 * self.norm1(self.dcn(x))) x = x + self.drop_path(self.gamma2 * self.norm2(self.mlp(x))) else: x = x + self.drop_path(self.gamma1 * self.dcn(self.norm1(x))) x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x))) return x if self.with_cp and x.requires_grad: x = checkpoint.checkpoint(_inner_forward, x) else: x = _inner_forward(x) return x class InternImageBlock(nn.Module): r""" Block of InternImage Args: core_op (nn.Module): core operation of InternImage channels (int): number of input channels depths (list): Depth of each block. groups (list): Groups of each block. 
        mlp_ratio (float): ratio of mlp hidden features to input channels
        drop (float): dropout rate
        drop_path (float): drop path rate
        act_layer (str): activation layer
        norm_layer (str): normalization layer
        post_norm (bool): whether to use post normalization
        layer_scale (float): layer scale
        offset_scale (float): offset scale
        with_cp (bool): whether to use checkpoint
    """

    def __init__(self,
                 core_op,
                 channels,
                 depth,
                 groups,
                 downsample=True,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer='GELU',
                 norm_layer='LN',
                 post_norm=False,
                 offset_scale=1.0,
                 layer_scale=None,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 post_norm_block_ids=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False):  # for InternImage-H/G
        super().__init__()
        self.channels = channels
        self.depth = depth
        self.post_norm = post_norm
        self.center_feature_scale = center_feature_scale

        self.blocks = nn.ModuleList([
            InternImageLayer(
                core_op=core_op,
                channels=channels,
                groups=groups,
                mlp_ratio=mlp_ratio,
                drop=drop,
                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                act_layer=act_layer,
                norm_layer=norm_layer,
                post_norm=post_norm,
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
                res_post_norm=res_post_norm,  # for InternImage-H/G
                center_feature_scale=center_feature_scale  # for InternImage-H/G
            ) for i in range(depth)
        ])
        if not self.post_norm or center_feature_scale:
            self.norm = build_norm_layer(channels, 'LN')
        self.post_norm_block_ids = post_norm_block_ids
        if post_norm_block_ids is not None:  # for InternImage-H/G
            self.post_norms = nn.ModuleList(
                [build_norm_layer(channels, 'LN', eps=1e-6) for _ in post_norm_block_ids]
            )
        self.downsample = DownsampleLayer(
            channels=channels, norm_layer=norm_layer) if downsample else None

    def forward(self, x, return_wo_downsample=False):
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if (self.post_norm_block_ids is not None) and (i in self.post_norm_block_ids):
                index = self.post_norm_block_ids.index(i)
                x = self.post_norms[index](x)  # for InternImage-H/G
        if not self.post_norm or self.center_feature_scale:
            x = self.norm(x)
        if return_wo_downsample:
            x_ = x
        if self.downsample is not None:
            x = self.downsample(x)

        if return_wo_downsample:
            return x, x_
        return x


class InternImage(Backbone):
    r""" InternImage
        A PyTorch impl of : `InternImage: Exploring Large-Scale Vision Foundation
        Models with Deformable Convolutions` - https://arxiv.org/abs/2211.05778

    Args:
        core_op (str): Core operator. Default: 'DCNv3'
        channels (int): Number of channels in the first stage. Default: 64
        depths (list): Depth of each block. Default: [3, 4, 18, 5]
        groups (list): Groups of each block. Default: [3, 6, 12, 24]
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop_rate (float): Probability of an element to be zeroed. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        act_layer (str): Activation layer. Default: 'GELU'
        norm_layer (str): Normalization layer. Default: 'LN'
        layer_scale (bool): Whether to use layer scale. Default: False
        cls_scale (bool): Whether to use class scale. Default: False
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed. Default: False
        dw_kernel_size (int): Size of the dwconv. Default: None
        level2_post_norm (bool): Whether to use level2 post norm. Default: False
        level2_post_norm_block_ids (list): Indexes of post norm blocks. Default: None
        res_post_norm (bool): Whether to use res post norm. Default: False
        center_feature_scale (bool): Whether to use center feature scale.
            Default: False
    """

    def __init__(self,
                 core_op='DCNv3',
                 channels=64,
                 depths=[3, 4, 18, 5],
                 groups=[3, 6, 12, 24],
                 mlp_ratio=4.,
                 drop_rate=0.,
                 drop_path_rate=0.2,
                 drop_path_type='linear',
                 act_layer='GELU',
                 norm_layer='LN',
                 layer_scale=None,
                 offset_scale=1.0,
                 post_norm=False,
                 with_cp=False,
                 dw_kernel_size=None,  # for InternImage-H/G
                 level2_post_norm=False,  # for InternImage-H/G
                 level2_post_norm_block_ids=None,  # for InternImage-H/G
                 res_post_norm=False,  # for InternImage-H/G
                 center_feature_scale=False,  # for InternImage-H/G
                 out_indices=(0, 1, 2, 3),
                 init_cfg=None,
                 **kwargs):
        super().__init__()
        self.core_op = core_op
        self.num_levels = len(depths)
        self.depths = depths
        self.channels = channels
        self.num_features = int(channels * 2**(self.num_levels - 1))
        self.post_norm = post_norm
        self.mlp_ratio = mlp_ratio
        self.init_cfg = init_cfg
        self.out_indices = out_indices
        self.level2_post_norm_block_ids = level2_post_norm_block_ids
        logger = setup_logger(name="InternImage")
        logger.info(f'using core type: {core_op}')
        logger.info(f'using activation layer: {act_layer}')
        logger.info(f'using main norm layer: {norm_layer}')
        logger.info(f'using dpr: {drop_path_type}, {drop_path_rate}')
        logger.info(f"level2_post_norm: {level2_post_norm}")
        logger.info(f"level2_post_norm_block_ids: {level2_post_norm_block_ids}")
        logger.info(f"res_post_norm: {res_post_norm}")

        in_chans = 3
        self.patch_embed = StemLayer(in_chans=in_chans,
                                     out_chans=channels,
                                     act_layer=act_layer,
                                     norm_layer=norm_layer)
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]
        if drop_path_type == 'uniform':
            for i in range(len(dpr)):
                dpr[i] = drop_path_rate

        self.levels = nn.ModuleList()
        for i in range(self.num_levels):
            post_norm_block_ids = level2_post_norm_block_ids if level2_post_norm and (
                i == 2) else None  # for InternImage-H/G
            level = InternImageBlock(
                core_op=getattr(opsm, core_op),
                channels=int(channels * 2**i),
                depth=depths[i],
                groups=groups[i],
                mlp_ratio=self.mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])],
                act_layer=act_layer,
                norm_layer=norm_layer,
                post_norm=post_norm,
                downsample=(i < self.num_levels - 1),
                layer_scale=layer_scale,
                offset_scale=offset_scale,
                with_cp=with_cp,
                dw_kernel_size=dw_kernel_size,  # for InternImage-H/G
                post_norm_block_ids=post_norm_block_ids,  # for InternImage-H/G
                res_post_norm=res_post_norm,  # for InternImage-H/G
                center_feature_scale=center_feature_scale  # for InternImage-H/G
            )
            self.levels.append(level)
        self.num_layers = len(depths)
        self.apply(self._init_weights)
        self.apply(self._init_deform_weights)

        # add basic info for d2 backbone
        self._out_features = ["res{}".format(i + 2) for i in self.out_indices]
        self._out_feature_channels = {
            "res{}".format(i + 2): self.channels * 2**i for i in self.out_indices
        }
        self._out_feature_strides = {"res{}".format(i + 2): 2 ** (i + 2) for i in self.out_indices}
        self._size_divisibility = 32

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def _init_deform_weights(self, m):
        if isinstance(m, getattr(opsm, self.core_op)):
            m._reset_parameters()

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.pos_drop(x)

        # d2 need dict output
        # seq_out = []
        seq_out = {}
        for level_idx, level in enumerate(self.levels):
            x, x_ = level(x, return_wo_downsample=True)
            if level_idx in self.out_indices:
                # seq_out.append(x_.permute(0, 3, 1, 2).contiguous())
                seq_out["res{}".format(level_idx + 2)] = x_.permute(0, 3, 1, 2).contiguous()
        return seq_out


@BACKBONE_REGISTRY.register()
class D2InternImage(InternImage):
    def __init__(self, cfg, input_shape):
        super().__init__(
            core_op=cfg.MODEL.INTERNIMAGE.CORE_OP,
            channels=cfg.MODEL.INTERNIMAGE.CHANNELS,
            depths=cfg.MODEL.INTERNIMAGE.DEPTHS,
            groups=cfg.MODEL.INTERNIMAGE.GROUPS,
            mlp_ratio=cfg.MODEL.INTERNIMAGE.MLP_RATIO,
            drop_path_rate=cfg.MODEL.INTERNIMAGE.DROP_PATH_RATE,
            norm_layer=cfg.MODEL.INTERNIMAGE.NORM_LAYER,
            layer_scale=cfg.MODEL.INTERNIMAGE.LAYER_SCALE,
            offset_scale=cfg.MODEL.INTERNIMAGE.OFFSET_SCALE,
            post_norm=cfg.MODEL.INTERNIMAGE.POST_NORM,
            with_cp=cfg.MODEL.INTERNIMAGE.WITH_CP,
            out_indices=cfg.MODEL.INTERNIMAGE.OUT_IINDICES,
            dw_kernel_size=cfg.MODEL.INTERNIMAGE.DW_KERNEL_SIZE,  # for InternImage-H/G
            res_post_norm=cfg.MODEL.INTERNIMAGE.RES_POST_NORM,  # for InternImage-H/G
            level2_post_norm=cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM,  # for InternImage-H/G
            level2_post_norm_block_ids=cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM_BLOCK_IDS,  # for InternImage-H/G
            center_feature_scale=cfg.MODEL.INTERNIMAGE.CENTER_FEATURE_SCALE,  # for InternImage-H/G
        )
        pretrained_weight = cfg.MODEL.INTERNIMAGE.PRETRAINED_WEIGHT
        if pretrained_weight:
            checkpoint = torch.load(pretrained_weight, map_location='cpu')
            print(f'\nload pretrain weight from {pretrained_weight} \n')
            self.load_state_dict(checkpoint['model'], strict=False)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of
                ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert (
            x.dim() == 4
        ), f"InternImage takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        outputs = {}
        y = super().forward(x)
        for k in y.keys():
            if k in self._out_features:
                outputs[k] = y[k]
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        return 32

================================================ FILE: thirdparty/GLEE/glee/backbone/registry.py ================================================
_model_entrypoints = {}

def register_backbone(fn):
    module_name_split = fn.__module__.split('.')
    model_name = module_name_split[-1]
    _model_entrypoints[model_name] = fn
    return fn

def model_entrypoints(model_name):
    return _model_entrypoints[model_name]

def is_model(model_name):
    return model_name in _model_entrypoints

================================================ FILE: thirdparty/GLEE/glee/backbone/resnet.py ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
import pickle
import numpy as np
from typing import Any, Dict
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn.functional as F
from torch import nn

from .backbone import Backbone
from .registry import register_backbone

from detectron2.layers import (
    CNNBlockBase,
    Conv2d,
    DeformConv,
    ModulatedDeformConv,
    ShapeSpec,
    get_norm,
)
from detectron2.utils.file_io import PathManager

__all__ = [
    "ResNetBlockBase",
    "BasicBlock",
    "BottleneckBlock",
    "DeformBottleneckBlock",
    "BasicStem",
    "ResNet",
    "make_stage",
    "get_resnet_backbone",
]


class BasicBlock(CNNBlockBase):
    """
    The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
    with two 3x3 conv layers and a projection shortcut if needed.
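
    Example (a minimal sketch; the input shape is illustrative only)::

        block = BasicBlock(64, 64, stride=1, norm="BN")
        out = block(torch.randn(2, 64, 56, 56))  # shape preserved: (2, 64, 56, 56)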
""" def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): """ Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. stride (int): Stride for the first conv. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. """ super().__init__(in_channels, out_channels, stride) if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None self.conv1 = Conv2d( in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False, norm=get_norm(norm, out_channels), ) self.conv2 = Conv2d( out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) def forward(self, x): out = self.conv1(x) out = F.relu_(out) out = self.conv2(out) if self.shortcut is not None: shortcut = self.shortcut(x) else: shortcut = x out += shortcut out = F.relu_(out) return out class BottleneckBlock(CNNBlockBase): """ The standard bottleneck residual block used by ResNet-50, 101 and 152 defined in :paper:`ResNet`. It contains 3 conv layers with kernels 1x1, 3x3, 1x1, and a projection shortcut if needed. """ def __init__( self, in_channels, out_channels, *, bottleneck_channels, stride=1, num_groups=1, norm="BN", stride_in_1x1=False, dilation=1, ): """ Args: bottleneck_channels (int): number of output channels for the 3x3 "bottleneck" conv layers. num_groups (int): number of groups for the 3x3 conv layer. norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. stride_in_1x1 (bool): when stride>1, whether to put stride in the first 1x1 convolution or the bottleneck 3x3 convolution. dilation (int): the dilation rate of the 3x3 conv layer. """ super().__init__(in_channels, out_channels, stride) if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None # The original MSRA ResNet models have stride in the first 1x1 conv # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have # stride in the 3x3 conv stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, norm=get_norm(norm, bottleneck_channels), ) self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, norm=get_norm(norm, bottleneck_channels), ) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) # Zero-initialize the last normalization in each residual branch, # so that at the beginning, the residual branch starts with zeros, # and each residual block behaves like an identity. # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": # "For BN layers, the learnable scaling coefficient γ is initialized # to be 1, except for each residual block's last BN # where γ is initialized to be 0." 
# nn.init.constant_(self.conv3.norm.weight, 0) # TODO this somehow hurts performance when training GN models from scratch. # Add it as an option when we need to use this code to train a backbone. def forward(self, x): out = self.conv1(x) out = F.relu_(out) out = self.conv2(out) out = F.relu_(out) out = self.conv3(out) if self.shortcut is not None: shortcut = self.shortcut(x) else: shortcut = x out += shortcut out = F.relu_(out) return out class DeformBottleneckBlock(CNNBlockBase): """ Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv ` in the 3x3 convolution. """ def __init__( self, in_channels, out_channels, *, bottleneck_channels, stride=1, num_groups=1, norm="BN", stride_in_1x1=False, dilation=1, deform_modulated=False, deform_num_groups=1, ): super().__init__(in_channels, out_channels, stride) self.deform_modulated = deform_modulated if in_channels != out_channels: self.shortcut = Conv2d( in_channels, out_channels, kernel_size=1, stride=stride, bias=False, norm=get_norm(norm, out_channels), ) else: self.shortcut = None stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) self.conv1 = Conv2d( in_channels, bottleneck_channels, kernel_size=1, stride=stride_1x1, bias=False, norm=get_norm(norm, bottleneck_channels), ) if deform_modulated: deform_conv_op = ModulatedDeformConv # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size offset_channels = 27 else: deform_conv_op = DeformConv offset_channels = 18 self.conv2_offset = Conv2d( bottleneck_channels, offset_channels * deform_num_groups, kernel_size=3, stride=stride_3x3, padding=1 * dilation, dilation=dilation, ) self.conv2 = deform_conv_op( bottleneck_channels, bottleneck_channels, kernel_size=3, stride=stride_3x3, padding=1 * dilation, bias=False, groups=num_groups, dilation=dilation, deformable_groups=deform_num_groups, norm=get_norm(norm, bottleneck_channels), ) self.conv3 = Conv2d( bottleneck_channels, out_channels, kernel_size=1, bias=False, norm=get_norm(norm, out_channels), ) for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: if layer is not None: # shortcut can be None weight_init.c2_msra_fill(layer) nn.init.constant_(self.conv2_offset.weight, 0) nn.init.constant_(self.conv2_offset.bias, 0) def forward(self, x): out = self.conv1(x) out = F.relu_(out) if self.deform_modulated: offset_mask = self.conv2_offset(out) offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1) offset = torch.cat((offset_x, offset_y), dim=1) mask = mask.sigmoid() out = self.conv2(out, offset, mask) else: offset = self.conv2_offset(out) out = self.conv2(out, offset) out = F.relu_(out) out = self.conv3(out) if self.shortcut is not None: shortcut = self.shortcut(x) else: shortcut = x out += shortcut out = F.relu_(out) return out class BasicStem(CNNBlockBase): """ The standard ResNet stem (layers before the first residual block), with a conv, relu and max_pool. """ def __init__(self, in_channels=3, out_channels=64, norm="BN"): """ Args: norm (str or callable): norm after the first conv layer. See :func:`layers.get_norm` for supported format. """ super().__init__(in_channels, out_channels, 4) self.in_channels = in_channels self.conv1 = Conv2d( in_channels, out_channels, kernel_size=7, stride=2, padding=3, bias=False, norm=get_norm(norm, out_channels), ) weight_init.c2_msra_fill(self.conv1) def forward(self, x): x = self.conv1(x) x = F.relu_(x) x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) return x class ResNet(Backbone): """ Implement :paper:`ResNet`. 
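
    Example (a minimal sketch built from the stem and stage helpers defined in
    this file; the input size is illustrative)::

        stem = BasicStem(in_channels=3, out_channels=64, norm="BN")
        stages = ResNet.make_default_stages(depth=50, norm="BN")
        model = ResNet(stem, stages, out_features=["res2", "res5"])
        feats = model(torch.randn(1, 3, 224, 224))
        # feats["res2"]: (1, 256, 56, 56), feats["res5"]: (1, 2048, 7, 7)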
""" def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0): """ Args: stem (nn.Module): a stem module stages (list[list[CNNBlockBase]]): several (typically 4) stages, each contains multiple :class:`CNNBlockBase`. num_classes (None or int): if None, will not perform classification. Otherwise, will create a linear layer. out_features (list[str]): name of the layers whose outputs should be returned in forward. Can be anything in "stem", "linear", or "res2" ... If None, will return the output of the last layer. freeze_at (int): The number of stages at the beginning to freeze. see :meth:`freeze` for detailed explanation. """ super().__init__() self.stem = stem self.num_classes = num_classes current_stride = self.stem.stride self._out_feature_strides = {"stem": current_stride} self._out_feature_channels = {"stem": self.stem.out_channels} self.stage_names, self.stages = [], [] if out_features is not None: # Avoid keeping unused layers in this module. They consume extra memory # and may cause allreduce to fail num_stages = max( [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features] ) stages = stages[:num_stages] for i, blocks in enumerate(stages): assert len(blocks) > 0, len(blocks) for block in blocks: assert isinstance(block, CNNBlockBase), block name = "res" + str(i + 2) stage = nn.Sequential(*blocks) self.add_module(name, stage) self.stage_names.append(name) self.stages.append(stage) self._out_feature_strides[name] = current_stride = int( current_stride * np.prod([k.stride for k in blocks]) ) self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels self.stage_names = tuple(self.stage_names) # Make it static for scripting if num_classes is not None: self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.linear = nn.Linear(curr_channels, num_classes) # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": # "The 1000-way fully-connected layer is initialized by # drawing weights from a zero-mean Gaussian with standard deviation of 0.01." nn.init.normal_(self.linear.weight, std=0.01) name = "linear" if out_features is None: out_features = [name] self._out_features = out_features assert len(self._out_features) children = [x[0] for x in self.named_children()] for out_feature in self._out_features: assert out_feature in children, "Available children: {}".format(", ".join(children)) self.freeze(freeze_at) def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!" outputs = {} x = self.stem(x) if "stem" in self._out_features: outputs["stem"] = x for name, stage in zip(self.stage_names, self.stages): x = stage(x) if name in self._out_features: outputs[name] = x if self.num_classes is not None: x = self.avgpool(x) x = torch.flatten(x, 1) x = self.linear(x) if "linear" in self._out_features: outputs["linear"] = x return outputs def output_shape(self): return { name: ShapeSpec( channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] ) for name in self._out_features } def freeze(self, freeze_at=0): """ Freeze the first several stages of the ResNet. Commonly used in fine-tuning. Layers that produce the same feature map spatial size are defined as one "stage" by :paper:`FPN`. Args: freeze_at (int): number of stages to freeze. `1` means freezing the stem. 
            `2` means freezing the stem and one residual stage, etc.

        Returns:
            nn.Module: this ResNet itself
        """
        if freeze_at >= 1:
            self.stem.freeze()
        for idx, stage in enumerate(self.stages, start=2):
            if freeze_at >= idx:
                for block in stage.children():
                    block.freeze()
        return self

    @staticmethod
    def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
        """
        Create a list of blocks of the same type that forms one ResNet stage.

        Args:
            block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
                stage. A module of this type must not change spatial resolution of inputs unless its
                stride != 1.
            num_blocks (int): number of blocks in this stage
            in_channels (int): input channels of the entire stage.
            out_channels (int): output channels of **every block** in the stage.
            kwargs: other arguments passed to the constructor of
                `block_class`. If the argument name is "xx_per_block", the
                argument is a list of values to be passed to each block in the
                stage. Otherwise, the same argument is passed to every block
                in the stage.

        Returns:
            list[CNNBlockBase]: a list of block module.

        Examples:
        ::
            stage = ResNet.make_stage(
                BottleneckBlock, 3, in_channels=16, out_channels=64,
                bottleneck_channels=16, num_groups=1,
                stride_per_block=[2, 1, 1],
                dilations_per_block=[1, 1, 2]
            )

        Usually, layers that produce the same feature map spatial size are defined as one
        "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
        all be 1.
        """
        blocks = []
        for i in range(num_blocks):
            curr_kwargs = {}
            for k, v in kwargs.items():
                if k.endswith("_per_block"):
                    assert len(v) == num_blocks, (
                        f"Argument '{k}' of make_stage should have the "
                        f"same length as num_blocks={num_blocks}."
                    )
                    newk = k[: -len("_per_block")]
                    assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
                    curr_kwargs[newk] = v[i]
                else:
                    curr_kwargs[k] = v

            blocks.append(
                block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
            )
            in_channels = out_channels
        return blocks

    @staticmethod
    def make_default_stages(depth, block_class=None, **kwargs):
        """
        Create a list of ResNet stages from a pre-defined depth (one of 18, 34, 50, 101, 152).
        If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
        instead for fine-grained customization.

        Args:
            depth (int): depth of ResNet
            block_class (type): the CNN block class. Has to accept
                `bottleneck_channels` argument for depth > 50.
                By default it is BasicBlock or BottleneckBlock, based on the
                depth.
            kwargs:
                other arguments to pass to `make_stage`. Should not contain
                stride and channels, as they are predefined for each depth.

        Returns:
            list[list[CNNBlockBase]]: modules in all stages; see arguments of
                :class:`ResNet.__init__`.
        """
        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]
        if block_class is None:
            block_class = BasicBlock if depth < 50 else BottleneckBlock
        if depth < 50:
            in_channels = [64, 64, 128, 256]
            out_channels = [64, 128, 256, 512]
        else:
            in_channels = [64, 256, 512, 1024]
            out_channels = [256, 512, 1024, 2048]
        ret = []
        for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
            if depth >= 50:
                kwargs["bottleneck_channels"] = o // 4
            ret.append(
                ResNet.make_stage(
                    block_class=block_class,
                    num_blocks=n,
                    stride_per_block=[s] + [1] * (n - 1),
                    in_channels=i,
                    out_channels=o,
                    **kwargs,
                )
            )
        return ret


ResNetBlockBase = CNNBlockBase
"""
Alias for backward compatibility.
"""


def make_stage(*args, **kwargs):
    """
    Deprecated alias for backward compatibility.
    """
    return ResNet.make_stage(*args, **kwargs)


def _convert_ndarray_to_tensor(state_dict: Dict[str, Any]) -> None:
    """
    In-place convert all numpy arrays in the state_dict to torch tensor.

    Args:
        state_dict (dict): a state-dict to be loaded to the model.
            Will be modified.
    """
    # model could be an OrderedDict with _metadata attribute
    # (as returned by Pytorch's state_dict()). We should preserve these
    # properties.
    for k in list(state_dict.keys()):
        v = state_dict[k]
        if not isinstance(v, np.ndarray) and not isinstance(v, torch.Tensor):
            raise ValueError(
                "Unsupported type found in checkpoint! {}: {}".format(k, type(v))
            )
        if not isinstance(v, torch.Tensor):
            state_dict[k] = torch.from_numpy(v)


@register_backbone
def get_resnet_backbone(cfg):
    """
    Create a ResNet instance from config.

    Returns:
        ResNet: a :class:`ResNet` instance.
    """
    res_cfg = cfg['MODEL']['BACKBONE']['RESNETS']

    # need registration of new blocks/stems?
    norm = res_cfg['NORM']
    stem = BasicStem(
        in_channels=res_cfg['STEM_IN_CHANNELS'],
        out_channels=res_cfg['STEM_OUT_CHANNELS'],
        norm=norm,
    )

    # fmt: off
    freeze_at           = res_cfg['FREEZE_AT']
    out_features        = res_cfg['OUT_FEATURES']
    depth               = res_cfg['DEPTH']
    num_groups          = res_cfg['NUM_GROUPS']
    width_per_group     = res_cfg['WIDTH_PER_GROUP']
    bottleneck_channels = num_groups * width_per_group
    in_channels         = res_cfg['STEM_OUT_CHANNELS']
    out_channels        = res_cfg['RES2_OUT_CHANNELS']
    stride_in_1x1       = res_cfg['STRIDE_IN_1X1']
    res5_dilation       = res_cfg['RES5_DILATION']
    deform_on_per_stage = res_cfg['DEFORM_ON_PER_STAGE']
    deform_modulated    = res_cfg['DEFORM_MODULATED']
    deform_num_groups   = res_cfg['DEFORM_NUM_GROUPS']
    # fmt: on

    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)

    num_blocks_per_stage = {
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3],
        50: [3, 4, 6, 3],
        101: [3, 4, 23, 3],
        152: [3, 8, 36, 3],
    }[depth]

    if depth in [18, 34]:
        assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
        assert not any(
            deform_on_per_stage
        ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
        assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
        assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"

    stages = []
    for idx, stage_idx in enumerate(range(2, 6)):
        # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
        dilation = res5_dilation if stage_idx == 5 else 1
        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
        stage_kargs = {
            "num_blocks": num_blocks_per_stage[idx],
            "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
            "in_channels": in_channels,
            "out_channels": out_channels,
            "norm": norm,
        }
        # Use BasicBlock for R18 and R34.
if depth in [18, 34]: stage_kargs["block_class"] = BasicBlock else: stage_kargs["bottleneck_channels"] = bottleneck_channels stage_kargs["stride_in_1x1"] = stride_in_1x1 stage_kargs["dilation"] = dilation stage_kargs["num_groups"] = num_groups if deform_on_per_stage[idx]: stage_kargs["block_class"] = DeformBottleneckBlock stage_kargs["deform_modulated"] = deform_modulated stage_kargs["deform_num_groups"] = deform_num_groups else: stage_kargs["block_class"] = BottleneckBlock blocks = ResNet.make_stage(**stage_kargs) in_channels = out_channels out_channels *= 2 bottleneck_channels *= 2 stages.append(blocks) backbone = ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at) if cfg['MODEL']['BACKBONE']['LOAD_PRETRAINED'] is True: filename = cfg['MODEL']['BACKBONE']['PRETRAINED'] with PathManager.open(filename, "rb") as f: ckpt = pickle.load(f, encoding="latin1")['model'] _convert_ndarray_to_tensor(ckpt) ckpt.pop('stem.fc.weight') ckpt.pop('stem.fc.bias') backbone.load_state_dict(ckpt) return backbone ================================================ FILE: thirdparty/GLEE/glee/backbone/swin.py ================================================ # -------------------------------------------------------- # Swin Transformer # Copyright (c) 2021 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Ze Liu, Yutong Lin, Yixuan Wei # -------------------------------------------------------- import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint from timm.models.layers import DropPath, to_2tuple, trunc_normal_ from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec from .registry import register_backbone class Mlp(nn.Module): """Multilayer perceptron.""" def __init__( self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop(x) x = self.fc2(x) x = self.drop(x) return x def window_partition(x, window_size): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) return windows def window_reverse(windows, window_size, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x class WindowAttention(nn.Module): """Window based multi-head self attention (W-MSA) module with relative position bias. It supports both of shifted and non-shifted window. Args: dim (int): Number of input channels. window_size (tuple[int]): The height and width of the window. num_heads (int): Number of attention heads. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. 
Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop (float, optional): Dropout ratio of output. Default: 0.0 """ def __init__( self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0.0, proj_drop=0.0, ): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = torch.arange(self.window_size[0]) coords_w = torch.arange(self.window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww self.register_buffer("relative_position_index", relative_position_index) self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) trunc_normal_(self.relative_position_bias_table, std=0.02) self.softmax = nn.Softmax(dim=-1) def forward(self, x, mask=None): """Forward function. Args: x: input features with shape of (num_windows*B, N, C) mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape qkv = ( self.qkv(x) .reshape(B_, N, 3, self.num_heads, C // self.num_heads) .permute(2, 0, 3, 1, 4) ) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = q @ k.transpose(-2, -1) relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1) ].view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 ) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute( 2, 0, 1 ).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) attn = attn.view(-1, self.num_heads, N, N) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B_, N, C) x = self.proj(x) x = self.proj_drop(x) return x class SwinTransformerBlock(nn.Module): """Swin Transformer Block. Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): Window size. shift_size (int): Shift size for SW-MSA. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. 
Default: 0.0 drop_path (float, optional): Stochastic depth rate. Default: 0.0 act_layer (nn.Module, optional): Activation layer. Default: nn.GELU norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__( self, dim, num_heads, window_size=7, shift_size=0, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, act_layer=nn.GELU, norm_layer=nn.LayerNorm, ): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, ) self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop ) self.H = None self.W = None def forward(self, x, mask_matrix): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. mask_matrix: Attention mask for cyclic shift. """ B, L, C = x.shape H, W = self.H, self.W assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.view(B, H, W, C) # pad feature maps to multiples of window size pad_l = pad_t = 0 pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) _, Hp, Wp, _ = x.shape # cyclic shift if self.shift_size > 0: shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows x_windows = window_partition( shifted_x, self.window_size ) # nW*B, window_size, window_size, C x_windows = x_windows.view( -1, self.window_size * self.window_size, C ) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b > 0: x = x[:, :H, :W, :].contiguous() x = x.view(B, H * W, C) # FFN x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) return x class PatchMerging(nn.Module): """Patch Merging Layer Args: dim (int): Number of input channels. norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm """ def __init__(self, dim, norm_layer=nn.LayerNorm): super().__init__() self.dim = dim self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) self.norm = norm_layer(4 * dim) def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
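
        Example (shapes are illustrative)::

            merge = PatchMerging(dim=96)
            y = merge(torch.randn(2, 56 * 56, 96), H=56, W=56)
            # y: (2, 28 * 28, 192) -- resolution halved, channels doubled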
""" B, L, C = x.shape assert L == H * W, "input feature has wrong size" x = x.view(B, H, W, C) # padding pad_input = (H % 2 == 1) or (W % 2 == 1) if pad_input: x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C x = self.norm(x) x = self.reduction(x) return x class BasicLayer(nn.Module): """A basic Swin Transformer layer for one stage. Args: dim (int): Number of feature channels depth (int): Depths of this stage. num_heads (int): Number of attention head. window_size (int): Local window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. drop (float, optional): Dropout rate. Default: 0.0 attn_drop (float, optional): Attention dropout rate. Default: 0.0 drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, dim, depth, num_heads, window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop=0.0, attn_drop=0.0, drop_path=0.0, norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False, ): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 self.depth = depth self.use_checkpoint = use_checkpoint # build blocks self.blocks = nn.ModuleList( [ SwinTransformerBlock( dim=dim, num_heads=num_heads, window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop, drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer, ) for i in range(depth) ] ) # patch merging layer if downsample is not None: self.downsample = downsample(dim=dim, norm_layer=norm_layer) else: self.downsample = None def forward(self, x, H, W): """Forward function. Args: x: Input feature, tensor size (B, H*W, C). H, W: Spatial resolution of the input feature. 
""" # calculate attention mask for SW-MSA Hp = int(np.ceil(H / self.window_size)) * self.window_size Wp = int(np.ceil(W / self.window_size)) * self.window_size img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 h_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) w_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None), ) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition( img_mask, self.window_size ) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( attn_mask == 0, float(0.0) ) for blk in self.blocks: blk.H, blk.W = H, W if self.use_checkpoint: x = checkpoint.checkpoint(blk, x, attn_mask) else: x = blk(x, attn_mask) if self.downsample is not None: x_down = self.downsample(x, H, W) Wh, Ww = (H + 1) // 2, (W + 1) // 2 return x, H, W, x_down, Wh, Ww else: return x, H, W, x, H, W class PatchEmbed(nn.Module): """Image to Patch Embedding Args: patch_size (int): Patch token size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. norm_layer (nn.Module, optional): Normalization layer. Default: None """ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): super().__init__() patch_size = to_2tuple(patch_size) self.patch_size = patch_size self.in_chans = in_chans self.embed_dim = embed_dim self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: self.norm = None def forward(self, x): """Forward function.""" # padding _, _, H, W = x.size() if W % self.patch_size[1] != 0: x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) if H % self.patch_size[0] != 0: x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) x = self.proj(x) # B C Wh Ww if self.norm is not None: Wh, Ww = x.size(2), x.size(3) x = x.flatten(2).transpose(1, 2) x = self.norm(x) x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) return x class SwinTransformer(nn.Module): """Swin Transformer backbone. A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - https://arxiv.org/pdf/2103.14030 Args: pretrain_img_size (int): Input image size for training the pretrained model, used in absolute postion embedding. Default 224. patch_size (int | tuple(int)): Patch size. Default: 4. in_chans (int): Number of input image channels. Default: 3. embed_dim (int): Number of linear projection output channels. Default: 96. depths (tuple[int]): Depths of each Swin Transformer stage. num_heads (tuple[int]): Number of attention head of each stage. window_size (int): Window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. drop_rate (float): Dropout rate. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Default: 0.2. norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. 
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. patch_norm (bool): If True, add normalization after patch embedding. Default: True. out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters. use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. """ def __init__( self, pretrain_img_size=224, patch_size=4, in_chans=3, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, out_indices=(0, 1, 2, 3), frozen_stages=-1, use_checkpoint=False, ): super().__init__() self.pretrain_img_size = pretrain_img_size self.num_layers = len(depths) self.embed_dim = embed_dim self.ape = ape self.patch_norm = patch_norm self.out_indices = out_indices self.frozen_stages = frozen_stages # split image into non-overlapping patches self.patch_embed = PatchEmbed( patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None, ) # absolute position embedding if self.ape: pretrain_img_size = to_2tuple(pretrain_img_size) patch_size = to_2tuple(patch_size) patches_resolution = [ pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1], ] self.absolute_pos_embed = nn.Parameter( torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) ) trunc_normal_(self.absolute_pos_embed, std=0.02) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) ] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): layer = BasicLayer( dim=int(embed_dim * 2 ** i_layer), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint, ) self.layers.append(layer) num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] self.num_features = num_features # add a norm layer for each output for i_layer in out_indices: layer = norm_layer(num_features[i_layer]) layer_name = f"norm{i_layer}" self.add_module(layer_name, layer) self._freeze_stages() def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.frozen_stages >= 1 and self.ape: self.absolute_pos_embed.requires_grad = False if self.frozen_stages >= 2: self.pos_drop.eval() for i in range(0, self.frozen_stages - 1): m = self.layers[i] m.eval() for param in m.parameters(): param.requires_grad = False def init_weights(self, pretrained=None): """Initialize the weights in backbone. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. 
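
        Example (a sketch; the checkpoint path is hypothetical and, as loaded
        below, the file must store its weights under a ``model`` key)::

            backbone.init_weights(pretrained="path/to/swin_pretrain.pth")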
""" def _init_weights(m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) if isinstance(pretrained, str): self.apply(_init_weights) checkpoint = torch.load(pretrained, map_location='cpu') print(f'\nload pretrain weight from {pretrained} \n') self.load_state_dict(checkpoint['model'], strict=False) elif pretrained is None: self.apply(_init_weights) else: raise TypeError('pretrained must be a str or None') def forward(self, x): """Forward function.""" x = self.patch_embed(x) Wh, Ww = x.size(2), x.size(3) if self.ape: # interpolate the position embedding to the corresponding size absolute_pos_embed = F.interpolate( self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" ) x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C else: x = x.flatten(2).transpose(1, 2) x = self.pos_drop(x) outs = {} for i in range(self.num_layers): layer = self.layers[i] x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) if i in self.out_indices: norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs["res{}".format(i + 2)] = out return outs def train(self, mode=True): """Convert the model into training mode while keep layers freezed.""" super(SwinTransformer, self).train(mode) self._freeze_stages() @BACKBONE_REGISTRY.register() class D2SwinTransformer(SwinTransformer, Backbone): def __init__(self, cfg, input_shape): pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE patch_size = cfg.MODEL.SWIN.PATCH_SIZE in_chans = 3 embed_dim = cfg.MODEL.SWIN.EMBED_DIM depths = cfg.MODEL.SWIN.DEPTHS num_heads = cfg.MODEL.SWIN.NUM_HEADS window_size = cfg.MODEL.SWIN.WINDOW_SIZE mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO qkv_bias = cfg.MODEL.SWIN.QKV_BIAS qk_scale = cfg.MODEL.SWIN.QK_SCALE drop_rate = cfg.MODEL.SWIN.DROP_RATE attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE norm_layer = nn.LayerNorm ape = cfg.MODEL.SWIN.APE patch_norm = cfg.MODEL.SWIN.PATCH_NORM use_checkpoint = cfg.MODEL.SWIN.USE_CHECKPOINT pretrained_weight = cfg.MODEL.SWIN.PRETRAINED_WEIGHT super().__init__( pretrain_img_size, patch_size, in_chans, embed_dim, depths, num_heads, window_size, mlp_ratio, qkv_bias, qk_scale, drop_rate, attn_drop_rate, drop_path_rate, norm_layer, ape, patch_norm, use_checkpoint=use_checkpoint, ) self.init_weights(pretrained_weight) self._out_features = cfg.MODEL.SWIN.OUT_FEATURES self._out_feature_strides = { "res2": 4, "res3": 8, "res4": 16, "res5": 32, } self._out_feature_channels = { "res2": self.num_features[0], "res3": self.num_features[1], "res4": self.num_features[2], "res5": self.num_features[3], } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. Returns: dict[str->Tensor]: names and the corresponding features """ assert ( x.dim() == 4 ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
        outputs = {}
        y = super().forward(x)
        for k in y.keys():
            if k in self._out_features:
                outputs[k] = y[k]
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        return 32

================================================ FILE: thirdparty/GLEE/glee/backbone/vit.py ================================================
import logging
import math
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn as nn
from detectron2.layers import CNNBlockBase, Conv2d, get_norm
from detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous
import torch.nn.functional as F
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec

from .utils import (
    PatchEmbed,
    add_decomposed_rel_pos,
    get_abs_pos,
    window_partition,
    window_unpartition,
)
from functools import partial
import torch.utils.checkpoint as checkpoint

logger = logging.getLogger(__name__)

__all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]


class Attention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=True,
        use_rel_pos=False,
        rel_pos_zero_init=True,
        input_size=None,
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (int or None): Input resolution for calculating the relative positional
                parameter size.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

            if not rel_pos_zero_init:
                nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
                nn.init.trunc_normal_(self.rel_pos_w, std=0.02)

    def forward(self, x):
        B, H, W, _ = x.shape
        # qkv with shape (3, B, nHead, H * W, C)
        qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # q, k, v with shape (B * nHead, H * W, C)
        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)

        if self.use_rel_pos:
            # The decomposed relative position bias must be added to the raw
            # attention logits, so attention is computed explicitly here.
            attn = (q * self.scale) @ k.transpose(-2, -1)
            attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
            attn = attn.softmax(dim=-1)
            x = attn @ v
        else:
            # Without the relative position bias, the fused attention kernel
            # computes softmax(q @ k^T / sqrt(d)) @ v directly.
            with torch.backends.cuda.sdp_kernel(
                enable_flash=True, enable_math=False, enable_mem_efficient=True
            ):
                x = F.scaled_dot_product_attention(q, k, v)

        x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
        x = self.proj(x)

        return x


class ResBottleneckBlock(CNNBlockBase):
    """
    The standard bottleneck residual block without the last activation layer.
    It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        bottleneck_channels,
        norm="LN",
        act_layer=nn.GELU,
    ):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
norm (str or callable): normalization for all conv layers. See :func:`layers.get_norm` for supported format. act_layer (callable): activation for all conv layers. """ super().__init__(in_channels, out_channels, 1) self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False) self.norm1 = get_norm(norm, bottleneck_channels) self.act1 = act_layer() self.conv2 = Conv2d( bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False, ) self.norm2 = get_norm(norm, bottleneck_channels) self.act2 = act_layer() self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False) self.norm3 = get_norm(norm, out_channels) for layer in [self.conv1, self.conv2, self.conv3]: weight_init.c2_msra_fill(layer) for layer in [self.norm1, self.norm2]: layer.weight.data.fill_(1.0) layer.bias.data.zero_() # zero init last norm layer. self.norm3.weight.data.zero_() self.norm3.bias.data.zero_() def forward(self, x): out = x for layer in self.children(): out = layer(out) out = x + out return out class Block(nn.Module): """Transformer blocks with support of window attention and residual propagation blocks""" def __init__( self, dim, num_heads, mlp_ratio=4.0, qkv_bias=True, drop_path=0.0, norm_layer=nn.LayerNorm, act_layer=nn.GELU, use_rel_pos=False, rel_pos_zero_init=True, window_size=0, use_residual_block=False, input_size=None, ): """ Args: dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. If it equals 0, then not use window attention. use_residual_block (bool): If True, use a residual block after the MLP block. input_size (int or None): Input resolution for calculating the relative positional parameter size. """ super().__init__() self.norm1 = norm_layer(dim) self.attn = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, input_size=input_size if window_size == 0 else (window_size, window_size), ) from timm.models.layers import DropPath, Mlp self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer) self.window_size = window_size self.use_residual_block = use_residual_block if use_residual_block: # Use a residual block with bottleneck channel as dim // 2 self.residual = ResBottleneckBlock( in_channels=dim, out_channels=dim, bottleneck_channels=dim // 2, norm="LN", act_layer=act_layer, ) def forward(self, x): shortcut = x x = self.norm1(x) # Window partition if self.window_size > 0: H, W = x.shape[1], x.shape[2] x, pad_hw = window_partition(x, self.window_size) x = self.attn(x) # Reverse window partition if self.window_size > 0: x = window_unpartition(x, self.window_size, pad_hw, (H, W)) x = shortcut + self.drop_path(x) x = x + self.drop_path(self.mlp(self.norm2(x))) if self.use_residual_block: x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) return x class ViT(Backbone): """ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. 
"Exploring Plain Vision Transformer Backbones for Object Detection", https://arxiv.org/abs/2203.16527 """ def __init__( self, img_size=1024, patch_size=16, in_chans=3, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, qkv_bias=True, drop_path_rate=0.0, norm_layer=nn.LayerNorm, act_layer=nn.GELU, use_abs_pos=True, use_rel_pos=False, rel_pos_zero_init=True, window_size=0, window_block_indexes=(), residual_block_indexes=(), use_act_checkpoint=False, pretrain_img_size=224, pretrain_use_cls_token=True, out_feature="last_feat", ): """ Args: img_size (int): Input image size. patch_size (int): Patch size. in_chans (int): Number of input image channels. embed_dim (int): Patch embedding dimension. depth (int): Depth of ViT. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. qkv_bias (bool): If True, add a learnable bias to query, key, value. drop_path_rate (float): Stochastic depth rate. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. use_rel_pos (bool): If True, add relative positional embeddings to the attention map. rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. window_size (int): Window size for window attention blocks. window_block_indexes (list): Indexes for blocks using window attention. residual_block_indexes (list): Indexes for blocks using conv propagation. use_act_checkpoint (bool): If True, use activation checkpointing. pretrain_img_size (int): input image size for pretraining models. pretrain_use_cls_token (bool): If True, pretrainig models use class token. out_feature (str): name of the feature from the last block. """ super().__init__() self.pretrain_use_cls_token = pretrain_use_cls_token self.patch_embed = PatchEmbed( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, embed_dim=embed_dim, ) if use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size) num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim)) else: self.pos_embed = None # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] self.blocks = nn.ModuleList() for i in range(depth): block = Block( dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop_path=dpr[i], norm_layer=norm_layer, act_layer=act_layer, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=window_size if i in window_block_indexes else 0, use_residual_block=i in residual_block_indexes, input_size=(img_size // patch_size, img_size // patch_size), ) if use_act_checkpoint: # TODO: use torch.utils.checkpoint from fairscale.nn.checkpoint import checkpoint_wrapper block = checkpoint_wrapper(block) self.blocks.append(block) self._out_feature_channels = {out_feature: embed_dim} self._out_feature_strides = {out_feature: patch_size} self._out_features = [out_feature] if self.pos_embed is not None: nn.init.trunc_normal_(self.pos_embed, std=0.02) # In our method, we don't use backbone feature with stride 4 self.fpn1 = nn.Sequential( nn.ConvTranspose2d(embed_dim, embed_dim // 2, kernel_size=2, stride=2), ) self.fpn2 = nn.Identity() self.fpn3 = nn.MaxPool2d(kernel_size=2, stride=2) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): nn.init.trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) def forward(self, x): x = self.patch_embed(x) if self.pos_embed is not None: x = x + get_abs_pos( self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2]) ) for blk in self.blocks: x = blk(x) xp = x.permute(0, 3, 1, 2) # (b, h, w, c) --> (b, c, h, w) features = [] ops = [self.fpn1, self.fpn2, self.fpn3] for i in range(len(ops)): features.append(ops[i](xp)) rets = {"res{}".format(u + 3): v for (u,v) in enumerate(features)} return rets @BACKBONE_REGISTRY.register() class D2ViT(ViT, Backbone): def __init__(self, cfg, input_shape): use_checkpoint = cfg.MODEL.VIT.USE_CHECKPOINT if cfg.MODEL.VIT.NAME == "ViT-Base": embed_dim=768 depth=12 drop_path_rate=0.1 num_heads=12 elif cfg.MODEL.VIT.NAME == "ViT-Large": embed_dim=1024 depth=24 drop_path_rate=0.4 num_heads=16 elif cfg.MODEL.VIT.NAME == "ViT-huge": embed_dim=1280 depth=32 drop_path_rate=0.5 num_heads=16 else: raise ValueError("Unsupported ViT name") super().__init__( img_size=1024, patch_size=16, in_chans=input_shape.channels, embed_dim=embed_dim, depth=depth, num_heads=num_heads, drop_path_rate=drop_path_rate, window_size=14, mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), window_block_indexes=[ # 2, 5, 8 11 for global attention 0, 1, 3, 4, 6, 7, 9, 10, ], residual_block_indexes=[], use_rel_pos=True, out_feature="last_feat", use_act_checkpoint=use_checkpoint) self._out_features = cfg.MODEL.VIT.OUT_FEATURES self._out_feature_strides = { "res3": 8, "res4": 16, "res5": 32, } self._out_feature_channels = { "res3": embed_dim // 2, "res4": embed_dim, "res5": embed_dim, } def forward(self, x): """ Args: x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. 
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        assert (
            x.dim() == 4
        ), f"D2ViT takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        outputs = {}
        y = super().forward(x)
        for k in y.keys():
            if k in self._out_features:
                outputs[k] = y[k]
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        return 32


================================================
FILE: thirdparty/GLEE/glee/backbone/vit_utils.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
import numpy as np
from scipy import interpolate
import torch
import torch.nn as nn
import torch.nn.functional as F

__all__ = [
    "window_partition",
    "window_unpartition",
    "add_decomposed_rel_pos",
    "get_abs_pos",
    "PatchEmbed",
]


def window_partition(x, window_size):
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h > 0 or pad_w > 0:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)


def window_unpartition(windows, window_size, pad_hw, hw):
    """
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    if Hp > H or Wp > W:
        x = x[:, :H, :W, :].contiguous()
    return x


def get_rel_pos(q_size, k_size, rel_pos, interp_type):
    """
    Get relative positional embeddings according to the relative positions of
        query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).
        interp_type (str): "vitdet" (linear) or "beit" (geometric-spacing cubic)
            interpolation, used to resize the embedding table when needed.
    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)

    # Interpolate rel pos if needed.
    if rel_pos.shape[0] != max_rel_dist:
        if interp_type == "vitdet":
            # the vitdet impl:
            # https://github.com/facebookresearch/detectron2/blob/96c752ce821a3340e27edd51c28a00665dd32a30/detectron2/modeling/backbone/utils.py#L77.
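            # Linearly resize the relative-position table from its pretrained
            # length to max_rel_dist = 2 * max(q_size, k_size) - 1 entries.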
rel_pos_resized = F.interpolate( rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), size=max_rel_dist, mode="linear", ) rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) elif interp_type == "beit": # steal from beit https://github.com/microsoft/unilm/tree/master/beit # modified by Yuxin Fang src_size = rel_pos.shape[0] dst_size = max_rel_dist q = 1.0903078 dis = [] cur = 1 for i in range(src_size // 2): dis.append(cur) cur += q ** (i + 1) r_ids = [-_ for _ in reversed(dis)] x = r_ids + [0] + dis t = dst_size // 2.0 dx = np.arange(-t, t + 0.1, 1.0) all_rel_pos_bias = [] for i in range(rel_pos.shape[1]): # a hack from https://github.com/baaivision/EVA/issues/8, # could also be used in fine-tuning but the performance haven't been tested. z = rel_pos[:, i].view(src_size).cpu().float().detach().numpy() f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate") all_rel_pos_bias.append( torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device)) rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1) else: raise NotImplementedError() else: rel_pos_resized = rel_pos # Scale the coords with short length if shapes for q and k are different. q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) return rel_pos_resized[relative_coords.long()] def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size, interp_type): """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 Args: attn (Tensor): attention map. q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. q_size (Tuple): spatial sequence size of query q with (q_h, q_w). k_size (Tuple): spatial sequence size of key k with (k_h, k_w). Returns: attn (Tensor): attention map with added relative positional embeddings. """ q_h, q_w = q_size k_h, k_w = k_size Rh = get_rel_pos(q_h, k_h, rel_pos_h, interp_type) Rw = get_rel_pos(q_w, k_w, rel_pos_w, interp_type) B, _, dim = q.shape r_q = q.reshape(B, q_h, q_w, dim) rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) attn = ( attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] ).view(B, q_h * q_w, k_h * k_w) return attn def get_abs_pos(abs_pos, has_cls_token, hw): """ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the original embeddings. Args: abs_pos (Tensor): absolute positional embeddings with (1, num_position, C). has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token. hw (Tuple): size of input image tokens. Returns: Absolute positional embeddings after processing with shape (1, H, W, C) """ h, w = hw if has_cls_token: abs_pos = abs_pos[:, 1:] xy_num = abs_pos.shape[1] size = int(math.sqrt(xy_num)) assert size * size == xy_num if size != h or size != w: new_abs_pos = F.interpolate( abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2), size=(h, w), mode="bicubic", align_corners=False, ) return new_abs_pos.permute(0, 2, 3, 1) else: return abs_pos.reshape(1, h, w, -1) class PatchEmbed(nn.Module): """ Image to Patch Embedding. 
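    Projects the input image to a token grid with a single strided convolution
    and returns channels-last (B, H, W, C) features (see `forward`).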
""" def __init__( self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768 ): """ Args: kernel_size (Tuple): kernel size of the projection layer. stride (Tuple): stride of the projection layer. padding (Tuple): padding size of the projection layer. in_chans (int): Number of input image channels. embed_dim (int): embed_dim (int): Patch embedding dimension. """ super().__init__() self.proj = nn.Conv2d( in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding ) def forward(self, x): x = self.proj(x) # B C H W -> B H W C x = x.permute(0, 2, 3, 1) return x ================================================ FILE: thirdparty/GLEE/glee/config.py ================================================ # -*- coding: utf-8 -*- from detectron2.config import CfgNode as CN def add_glee_config(cfg): """ Add config for DETR. """ cfg.FIND_UNUSED_PARAMETERS = True cfg.MODEL.MAX_CATEGORY_LEN = 100 cfg.MODEL.PSEUDO_VIDEO = False cfg.MODEL.FREEZE_WHOLE = False cfg.MODEL.CONTRAS_MEAN = False cfg.MODEL.CROSS_TRACK = False cfg.MODEL.TRACK_VERSION = 'v3' cfg.INPUT.SAMPLING_FRAME_NUM = 1 cfg.INPUT.SAMPLING_FRAME_RANGE = 10 cfg.INPUT.SAMPLING_INTERVAL = 1 cfg.INPUT.SAMPLING_FRAME_SHUFFLE = False cfg.INPUT.AUGMENTATIONS = [] # "brightness", "contrast", "saturation", "rotation" cfg.INPUT.DATASET_MAPPER_NAME = None cfg.DATALOADER.DATASET_RATIO = [1, 1] cfg.DATALOADER.USE_DIFF_BS_SIZE = True cfg.DATALOADER.DATASET_BS = [2, 2] cfg.DATALOADER.DATASET_FILTERS = [True, True] cfg.DATALOADER.USE_RFS = [False, False] cfg.DATALOADER.MULTI_DATASET_GROUPING = True cfg.DATALOADER.DATASET_ANN = ['image'] cfg.INPUT.SIZE_DIVISIBILITY = -1 cfg.DATALOADER.DATASET_RATIO = [1, 1] cfg.DATALOADER.USE_DIFF_BS_SIZE = True cfg.DATALOADER.DATASET_BS = [2, 2] cfg.DATALOADER.USE_RFS = [False, False] cfg.DATALOADER.MULTI_DATASET_GROUPING = True cfg.DATALOADER.DATASET_ANN = ['box', 'box'] # Allow different datasets to use different input resolutions cfg.INPUT.MIN_SIZE_TRAIN_MULTI = [(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), (320, 352, 392, 416, 448, 480, 512, 544, 576, 608, 640)] cfg.INPUT.MAX_SIZE_TRAIN_MULTI = [1333, 768] # MaskDINO model config cfg.MODEL.MaskDINO = CN() cfg.MODEL.MaskDINO.LEARN_TGT = False # loss cfg.MODEL.MaskDINO.PANO_BOX_LOSS = False cfg.MODEL.MaskDINO.SEMANTIC_CE_LOSS = False cfg.MODEL.MaskDINO.DEEP_SUPERVISION = True cfg.MODEL.MaskDINO.NO_OBJECT_WEIGHT = 0.1 cfg.MODEL.MaskDINO.CLASS_WEIGHT = 4.0 cfg.MODEL.MaskDINO.DICE_WEIGHT = 5.0 cfg.MODEL.MaskDINO.MASK_WEIGHT = 5.0 cfg.MODEL.MaskDINO.BOX_WEIGHT = 5. cfg.MODEL.MaskDINO.GIOU_WEIGHT = 2. # cost weight cfg.MODEL.MaskDINO.COST_CLASS_WEIGHT = 4.0 cfg.MODEL.MaskDINO.COST_DICE_WEIGHT = 5.0 cfg.MODEL.MaskDINO.COST_MASK_WEIGHT = 5.0 cfg.MODEL.MaskDINO.COST_BOX_WEIGHT = 5. cfg.MODEL.MaskDINO.COST_GIOU_WEIGHT = 2. 
# transformer config cfg.MODEL.MaskDINO.NHEADS = 8 cfg.MODEL.MaskDINO.DROPOUT = 0.1 cfg.MODEL.MaskDINO.DIM_FEEDFORWARD = 2048 cfg.MODEL.MaskDINO.ENC_LAYERS = 0 cfg.MODEL.MaskDINO.DEC_LAYERS = 6 cfg.MODEL.MaskDINO.INITIAL_PRED = True cfg.MODEL.MaskDINO.PRE_NORM = False cfg.MODEL.MaskDINO.BOX_LOSS = True cfg.MODEL.MaskDINO.HIDDEN_DIM = 256 cfg.MODEL.MaskDINO.NUM_OBJECT_QUERIES = 100 cfg.MODEL.MaskDINO.ENFORCE_INPUT_PROJ = False cfg.MODEL.MaskDINO.TWO_STAGE = True cfg.MODEL.MaskDINO.INITIALIZE_BOX_TYPE = 'no' # ['no', 'bitmask', 'mask2box'] cfg.MODEL.MaskDINO.DN="seg" cfg.MODEL.MaskDINO.DN_NOISE_SCALE=0.4 cfg.MODEL.MaskDINO.DN_NUM=100 cfg.MODEL.MaskDINO.PRED_CONV=False cfg.MODEL.MaskDINO.EVAL_FLAG = 1 # MSDeformAttn encoder configs cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 cfg.MODEL.SEM_SEG_HEAD.DIM_FEEDFORWARD = 2048 cfg.MODEL.SEM_SEG_HEAD.NUM_FEATURE_LEVELS = 3 cfg.MODEL.SEM_SEG_HEAD.TOTAL_NUM_FEATURE_LEVELS = 4 cfg.MODEL.SEM_SEG_HEAD.FEATURE_ORDER = 'high2low' # ['low2high', 'high2low'] high2low: from high level to low level ##################### # MaskDINO inference config cfg.MODEL.MaskDINO.TEST = CN() cfg.MODEL.MaskDINO.TEST.TEST_FOUCUS_ON_BOX = False cfg.MODEL.MaskDINO.TEST.SEMANTIC_ON = True cfg.MODEL.MaskDINO.TEST.INSTANCE_ON = False cfg.MODEL.MaskDINO.TEST.PANOPTIC_ON = False cfg.MODEL.MaskDINO.TEST.OBJECT_MASK_THRESHOLD = 0.0 cfg.MODEL.MaskDINO.TEST.OVERLAP_THRESHOLD = 0.0 cfg.MODEL.MaskDINO.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False cfg.MODEL.MaskDINO.TEST.PANO_TRANSFORM_EVAL = True cfg.MODEL.MaskDINO.TEST.PANO_TEMPERATURE = 0.06 # cfg.MODEL.MaskDINO.TEST.EVAL_FLAG = 1 # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet) # you can use this config to override cfg.MODEL.MaskDINO.SIZE_DIVISIBILITY = 32 # pixel decoder config cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 # adding transformer in pixel decoder cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 # pixel decoder cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "MaskDINOEncoder" # transformer module cfg.MODEL.MaskDINO.TRANSFORMER_DECODER_NAME = "MaskDINODecoder" # LSJ aug cfg.INPUT.IMAGE_SIZE = 1024 cfg.INPUT.MIN_SCALE = 0.1 cfg.INPUT.MAX_SCALE = 2.0 # point loss configs # Number of points sampled during training for a mask point head. cfg.MODEL.MaskDINO.TRAIN_NUM_POINTS = 112 * 112 # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the # original paper. cfg.MODEL.MaskDINO.OVERSAMPLE_RATIO = 3.0 # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in # the original paper. cfg.MODEL.MaskDINO.IMPORTANCE_SAMPLE_RATIO = 0.75 cfg.MODEL.DIM_PROJ = 256 cfg.MODEL.VISUAL_PROMPT = False cfg.MODEL.TEXT = CN() cfg.MODEL.TEXT.ARCH = 'vlpencoder' cfg.MODEL.TEXT.NAME= 'transformer' cfg.MODEL.TEXT.TOKENIZER= 'clip' cfg.MODEL.TEXT.CONTEXT_LENGTH= 77 # 77 cfg.MODEL.TEXT.WIDTH= 512 cfg.MODEL.TEXT.HEADS= 8 cfg.MODEL.TEXT.LAYERS= 12 # 6 cfg.MODEL.TEXT.AUTOGRESSIVE= True cfg.MODEL.LANGUAGE_BACKBONE = CN() cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT = False cfg.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE = "bert-base-uncased" cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "bert-base-uncased" cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 768 cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN = 77 # max length of the tokenized captions. 
cfg.MODEL.LANGUAGE_BACKBONE.N_LAYERS = 1 # cfg.MODEL.LANGUAGE_BACKBONE.UNUSED_TOKEN = 106 # cfg.MODEL.LANGUAGE_BACKBONE.MASK_SPECIAL = False cfg.MODEL.LANGUAGE_BACKBONE.PAD_MAX = True cfg.MODEL.ENCODER = CN() cfg.MODEL.ENCODER.NAME= 'transformer_encoder_fpn' cfg.MODEL.ENCODER.IGNORE_VALUE= 255 cfg.MODEL.ENCODER.NUM_CLASSES= 133 cfg.MODEL.ENCODER.LOSS_WEIGHT= 1.0 cfg.MODEL.ENCODER.CONVS_DIM= 512 cfg.MODEL.ENCODER.MASK_DIM= 512 cfg.MODEL.ENCODER.NORM= "GN" cfg.MODEL.ENCODER.IN_FEATURES= ["res2", "res3", "res4", "res5"] cfg.MODEL.ENCODER.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES= ["res3", "res4", "res5"] cfg.MODEL.ENCODER.COMMON_STRIDE= 4 cfg.MODEL.ENCODER.TRANSFORMER_ENC_LAYERS= 6 cfg.MODEL.DECODER = CN() cfg.MODEL.DECODER.TRANSFORMER_IN_FEATURE= "multi_scale_pixel_decoder" cfg.MODEL.DECODER.MASK = True # DETECTION= False # SPATIAL= # ENABLED= True # GROUNDING= # ENABLED= False # MAX_LEN= 5 # TEXT_WEIGHT= 2.0 # CLASS_WEIGHT= 0.5 # VISUAL= # ENABLED= False # AUDIO= # ENABLED= False # OPENIMAGE= # ENABLED= False # NEGATIVE_SAMPLES= 5 # GROUNDING= # ENABLED= False # MAX_LEN= 5 # CAPTION= # ENABLED= False # PHRASE_PROB= 0.5 # SIM_THRES= 0.95 cfg.MODEL.DECODER.HIDDEN_DIM= 512 cfg.MODEL.DECODER.NUM_OBJECT_QUERIES= 101 cfg.MODEL.DECODER.NHEADS= 8 cfg.MODEL.DECODER.DROPOUT= 0.0 cfg.MODEL.DECODER.DIM_FEEDFORWARD= 2048 cfg.MODEL.DECODER.MAX_SPATIAL_LEN= [512, 512, 512, 512] cfg.MODEL.DECODER.PRE_NORM= False cfg.MODEL.DECODER.ENFORCE_INPUT_PROJ= False cfg.MODEL.DECODER.SIZE_DIVISIBILITY= 32 cfg.MODEL.DECODER.TRAIN_NUM_POINTS= 12544 cfg.MODEL.DECODER.OVERSAMPLE_RATIO= 3.0 cfg.MODEL.DECODER.IMPORTANCE_SAMPLE_RATIO= 0.75 cfg.MODEL.DECODER.DEC_LAYERS= 10 # 9 decoder layers, add one for the loss on learnable query cfg.MODEL.DECODER.TOP_GROUNDING_LAYERS= 10 cfg.MODEL.DECODER.TOP_CAPTION_LAYERS= 10 cfg.MODEL.DECODER.TOP_SPATIAL_LAYERS= 10 cfg.MODEL.DECODER.TOP_OPENIMAGE_LAYERS= 10 # TEST= # SEMANTIC_ON= True # INSTANCE_ON= True # PANOPTIC_ON= True # OVERLAP_THRESHOLD= 0.8 # OBJECT_MASK_THRESHOLD= 0.4 # SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE= false # DETECTIONS_PER_IMAGE= 100 cfg.ATTENTION_ARCH = CN() # cfg.ATTENTION_ARCH.VARIABLE={ # 'queries': ['object'], # 'tokens': ['grounding', 'spatial', 'visual', 'audio']} # SELF_ATTENTION: # queries: # object: ['queries_object', 'tokens_grounding', 'tokens_spatial', 'tokens_visual', 'tokens_audio'] # tokens: # grounding: ['queries_object', 'tokens_grounding'] # spatial: ['tokens_spatial'] # visual: ['tokens_visual'] # audio: ['queries_object', 'tokens_audio'] # CROSS_ATTENTION: # queries: # object: True # tokens: # grounding: False # spatial: False # visual: False # audio: False # MASKING: ['tokens_spatial', 'tokens_grounding', 'tokens_visual', 'tokens_audio'] # DUPLICATION: # queries: # grounding: 'queries_object' # spatial: 'queries_object' # SPATIAL_MEMORIES: 32 cfg.SOLVER.OPTIMIZER = "ADAMW" cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 cfg.SOLVER.TEXTENCODER_MULTIPLIER = 1.0 cfg.SOLVER.LR_DECAY_RATE = None cfg.SOLVER.LR_DECAY_RATE_NUM_LAYERS = None ## support Swin backbone cfg.MODEL.SWIN = CN() cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 cfg.MODEL.SWIN.PATCH_SIZE = 4 cfg.MODEL.SWIN.EMBED_DIM = 96 cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] cfg.MODEL.SWIN.WINDOW_SIZE = 7 cfg.MODEL.SWIN.MLP_RATIO = 4.0 cfg.MODEL.SWIN.QKV_BIAS = True cfg.MODEL.SWIN.QK_SCALE = None cfg.MODEL.SWIN.DROP_RATE = 0.0 cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 cfg.MODEL.SWIN.APE = False cfg.MODEL.SWIN.PATCH_NORM = True 
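    # NOTE: the Swin defaults above correspond to Swin-Tiny; larger variants
    # (e.g. the SwinL.yaml config in thirdparty/GLEE/configs/) are expected to
    # override EMBED_DIM, DEPTHS, NUM_HEADS and WINDOW_SIZE.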
cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] cfg.MODEL.SWIN.USE_CHECKPOINT = False cfg.MODEL.SWIN.PRETRAINED_WEIGHT = None # support InterImage backbone cfg.MODEL.INTERNIMAGE = CN() # large as base #### large cfg.MODEL.INTERNIMAGE.PRETRAINED_WEIGHT = None cfg.MODEL.INTERNIMAGE.CORE_OP = "DCNv3" cfg.MODEL.INTERNIMAGE.CHANNELS = 160 cfg.MODEL.INTERNIMAGE.DEPTHS = [5, 5, 22, 5] cfg.MODEL.INTERNIMAGE.GROUPS =[10, 20, 40, 80] cfg.MODEL.INTERNIMAGE.MLP_RATIO =4. cfg.MODEL.INTERNIMAGE.DROP_PATH_RATE =0.0 cfg.MODEL.INTERNIMAGE.NORM_LAYER = "LN" cfg.MODEL.INTERNIMAGE.LAYER_SCALE = 1.0 cfg.MODEL.INTERNIMAGE.OFFSET_SCALE = 2.0 cfg.MODEL.INTERNIMAGE.POST_NORM = True cfg.MODEL.INTERNIMAGE.WITH_CP = False cfg.MODEL.INTERNIMAGE.OUT_IINDICES = (0, 1, 2, 3) cfg.MODEL.INTERNIMAGE.DW_KERNEL_SIZE = None cfg.MODEL.INTERNIMAGE.RES_POST_NORM = False cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM = False cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM_BLOCK_IDS = None cfg.MODEL.INTERNIMAGE.CENTER_FEATURE_SCALE = False ### huge # cfg.MODEL.INTERNIMAGE.PRETRAINED_WEIGHT = None # cfg.MODEL.INTERNIMAGE.CORE_OP = "DCNv3" # cfg.MODEL.INTERNIMAGE.CHANNELS = 320 # cfg.MODEL.INTERNIMAGE.DEPTHS = [6, 6, 32, 6] # cfg.MODEL.INTERNIMAGE.GROUPS = [10, 20, 40, 80] # cfg.MODEL.INTERNIMAGE.MLP_RATIO =4. # cfg.MODEL.INTERNIMAGE.DROP_PATH_RATE = 0.5 # cfg.MODEL.INTERNIMAGE.NORM_LAYER = "LN" # cfg.MODEL.INTERNIMAGE.LAYER_SCALE = None # cfg.MODEL.INTERNIMAGE.OFFSET_SCALE = 1.0 # cfg.MODEL.INTERNIMAGE.POST_NORM = False # cfg.MODEL.INTERNIMAGE.WITH_CP = False # cfg.MODEL.INTERNIMAGE.OUT_IINDICES = (0, 1, 2, 3) # cfg.MODEL.INTERNIMAGE.DW_KERNEL_SIZE = 5 # cfg.MODEL.INTERNIMAGE.RES_POST_NORM = True # cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM = True # cfg.MODEL.INTERNIMAGE.LEVEL2_POST_NORM_BLOCK_IDS = [5, 11, 17, 23, 29] # cfg.MODEL.INTERNIMAGE.CENTER_FEATURE_SCALE = True # support EVA02 backbone cfg.MODEL.EVA02 = CN() # large as base #### large cfg.MODEL.EVA02.PRETRAINED_WEIGHT = None cfg.MODEL.EVA02.IMAGE_SIZE = 1536 cfg.MODEL.EVA02.PATCH_SIZE = 16 cfg.MODEL.EVA02.WINDOW_SIZE = 16 cfg.MODEL.EVA02.DMBED_DIM =1024 cfg.MODEL.EVA02.DEPTH = 24 cfg.MODEL.EVA02.NUM_HEADS = 16 cfg.MODEL.EVA02.MLP_RATIO = 4*2/3 cfg.MODEL.EVA02.DROP_PATH_RATE = 0.3 cfg.MODEL.EVA02.CHECKPOINT = True cfg.MODEL.EVA02.WINDOW_BLOCK_INDEXES = [0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 16, 18, 19, 21, 22] # support EVA01 backbone cfg.MODEL.EVA01 = CN() # large as base #### large cfg.MODEL.EVA01.PRETRAINED_WEIGHT = None cfg.MODEL.EVA01.BEIT_LIKE_QKV_BIAS = True cfg.MODEL.EVA01.BEIT_LIKE_GAMMA = False cfg.MODEL.EVA01.FREEZE_PATH_EMBED = True cfg.MODEL.EVA01.IMAGE_SIZE = 1280 # only for correct dim in pos embed cfg.MODEL.EVA01.PATCH_SIZE = 16 cfg.MODEL.EVA01.WINDOW_SIZE = 16 cfg.MODEL.EVA01.DMBED_DIM = 1408 cfg.MODEL.EVA01.DEPTH = 40 cfg.MODEL.EVA01.NUM_HEADS = 16 cfg.MODEL.EVA01.MLP_RATIO = 6144 / 1408 cfg.MODEL.EVA01.DROP_PATH_RATE = 0.6 cfg.MODEL.EVA01.WINDOW_BLOCK_INDEXES = [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38] ================================================ FILE: thirdparty/GLEE/glee/config_deeplab.py ================================================ # -*- coding: utf-8 -*- # Copyright (c) Facebook, Inc. and its affiliates. def add_deeplab_config(cfg): """ Add config for DeepLab. """ # We retry random cropping until no single category in semantic segmentation GT occupies more # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. 
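    # Usage sketch (illustrative, not from the original source): these config
    # helpers are typically composed before merging a YAML file, e.g.
    #
    #     from detectron2.config import get_cfg
    #     cfg = get_cfg()
    #     add_deeplab_config(cfg)
    #     add_glee_config(cfg)  # from glee/config.py
    #     cfg.merge_from_file("thirdparty/GLEE/configs/R50.yaml")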
cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 # Used for `poly` learning rate schedule. cfg.SOLVER.POLY_LR_POWER = 0.9 cfg.SOLVER.POLY_LR_CONSTANT_ENDING = 0.0 # Loss type, choose from `cross_entropy`, `hard_pixel_mining`. cfg.MODEL.SEM_SEG_HEAD.LOSS_TYPE = "hard_pixel_mining" # DeepLab settings cfg.MODEL.SEM_SEG_HEAD.PROJECT_FEATURES = ["res2"] cfg.MODEL.SEM_SEG_HEAD.PROJECT_CHANNELS = [48] cfg.MODEL.SEM_SEG_HEAD.ASPP_CHANNELS = 256 cfg.MODEL.SEM_SEG_HEAD.ASPP_DILATIONS = [6, 12, 18] cfg.MODEL.SEM_SEG_HEAD.ASPP_DROPOUT = 0.1 cfg.MODEL.SEM_SEG_HEAD.USE_DEPTHWISE_SEPARABLE_CONV = False # Backbone new configs cfg.MODEL.RESNETS.RES4_DILATION = 1 cfg.MODEL.RESNETS.RES5_MULTI_GRID = [1, 2, 4] # ResNet stem type from: `basic`, `deeplab` cfg.MODEL.RESNETS.STEM_TYPE = "deeplab" ================================================ FILE: thirdparty/GLEE/glee/models/glee_model.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved """ """ import torch import torch.nn.functional as F from torch import nn # from ..backbone import build_backbone, Backbone # from ..body.encoder import build_encoder # from ..body.decoder import build_decoder from detectron2.modeling import build_backbone from .pixel_decoder.maskdino_encoder import build_pixel_decoder from .transformer_decoder.maskdino_decoder import build_transformer_decoder import random from transformers import AutoTokenizer from collections import OrderedDict from ..modules.point_features import point_sample from timm.models.layers import trunc_normal_ from transformers import CLIPTokenizer,CLIPTextModel from .vos_utils import masks_to_boxes, FeatureFuser import numpy as np import math def rand_sample(x, max_len): if x.shape[1] <= max_len: return x else: rand_idx = torch.randperm(x.shape[1])[:max_len] return x[:,rand_idx] def agg_lang_feat(features, mask, pool_type="average"): """average pooling of language features""" # feat: (bs, seq_len, C) # mask: (bs, seq_len) if pool_type == "average": embedded = features * mask.unsqueeze(-1).float() # use mask to zero out invalid token features aggregate = embedded.sum(1) / (mask.sum(-1).unsqueeze(-1).float()) elif pool_type == "max": out = [] for i in range(len(features)): pool_feat, _ = torch.max(features[i][mask[i]], 0) # (L, C) -> (C, ) out.append(pool_feat) aggregate = torch.stack(out, dim=0) # (bs, C) else: raise ValueError("pool_type should be average or max") return aggregate class GLEE_Model(nn.Module): """ Main class for mask classification semantic segmentation architectures. 
""" def __init__(self, cfg, matcher, device, video_info, contras_mean): super().__init__() self.cfg = cfg self.matcher = matcher self.backbone = build_backbone(cfg) output_channels = [v for k,v in self.backbone._out_feature_channels.items()] self.sot_fuser = FeatureFuser(output_channels[-3:], 256) self.tokenizer = CLIPTokenizer.from_pretrained('/home/PJLAB/caiwenzhe/Desktop/checkpoints/clip-vit-base-patch32') self.tokenizer.add_special_tokens({'cls_token': self.tokenizer.eos_token}) self.text_encoder = CLIPTextModel.from_pretrained('/home/PJLAB/caiwenzhe/Desktop/checkpoints/clip-vit-base-patch32') # self.text_encoder_teacher = CLIPTextModel.from_pretrained('GLEE/clip_vit_base_patch32') self.lang_encoder = None # for p in self.text_encoder_teacher.parameters(): # p.requires_grad = False self.lang_projection = nn.Parameter(torch.rand(cfg.MODEL.LANGUAGE_BACKBONE.LANG_DIM, cfg.MODEL.DIM_PROJ)) self.text_encode_type = 'clip_teacher' # self.lang_encoder = None self.pixel_decoder = build_pixel_decoder(cfg, self.backbone.output_shape()) transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM self.predictor = build_transformer_decoder(cfg, transformer_predictor_in_channels, lang_encoder = self.lang_encoder, mask_classification=True,) self.to(device) self.video_info = video_info self.contras_mean = contras_mean self.track_loss_version = cfg.MODEL.TRACK_VERSION self.no_mask_tasks = ['obj365', 'obj365_clip','openimage', 'openimage_clip', 'vg', 'grit', 'bdd_det', 'bdd_track_box'] # for visual prompt hidden_dim = 256 self.max_spatial_len = [512,512,512,512] self.mask_sptial_embed = nn.ParameterList([nn.Parameter(torch.empty(hidden_dim, hidden_dim)) for x in range(4)]) trunc_normal_(self.mask_sptial_embed[0], std=.02) trunc_normal_(self.mask_sptial_embed[1], std=.02) trunc_normal_(self.mask_sptial_embed[2], std=.02) trunc_normal_(self.mask_sptial_embed[3], std=.02) # learnable positive negative indicator self.pn_indicator = nn.Embedding(2, hidden_dim) @property def device(self): return self.pixel_mean.device def forward(self, images, prompts, task, targets=None, batch_name_list=None, is_train = True, visual_prompt_type='scribble'): extra = {} # dist_loss = None early_semantic = None if self.text_encode_type == "clip_teacher": if task not in ['grounding','rvos']: assert batch_name_list calsses_name_list = batch_name_list tokenized = self.tokenizer.batch_encode_plus(calsses_name_list, max_length=self.cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN, # 256 padding='max_length' if self.cfg.MODEL.LANGUAGE_BACKBONE.PAD_MAX else "longest", # max_length return_special_tokens_mask=True, return_tensors='pt', truncation=True).to(images.device) texts = (tokenized['input_ids'], tokenized['attention_mask']) token_x = self.text_encoder(*texts)['last_hidden_state'] valid_mask = tokenized['attention_mask'].bool() # token_x_teacher = self.text_encoder_teacher(*texts)['last_hidden_state'] # if is_train: # dist_loss = F.mse_loss(token_x[valid_mask], token_x_teacher[valid_mask] ) # F.l2_loss(token_x[valid_mask], token_x_teacher[valid_mask] ) token_x = token_x @ self.lang_projection lang_feat_pool = agg_lang_feat(token_x, tokenized['attention_mask'], pool_type="average") # (bs, 768) extra['class_embeddings'] = lang_feat_pool if True: # early_fusion gather_all_classtoken = token_x.flatten(0,1)[tokenized['attention_mask'].flatten(0,1)>0] gather_all_classtoken = gather_all_classtoken.unsqueeze(0).repeat(len(images),1,1) #[bs,L,C] gather_all_classtoken_mask = torch.ones_like(gather_all_classtoken[:,:,0])>0 #[bs,L] 
early_semantic = {"hidden":gather_all_classtoken.float(),"masks":gather_all_classtoken_mask} if 'grounding' in prompts: if self.text_encode_type == 'clip_frozen' or self.text_encode_type == 'clip_teacher': tokens = self.tokenizer( prompts['grounding'], padding='max_length', truncation=True, max_length=self.cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN, return_tensors='pt' ) tokens = {key: value.to(images.device) for key, value in tokens.items()} texts = (tokens['input_ids'], tokens['attention_mask']) x = self.text_encoder(*texts) token_x = x['last_hidden_state'] token_x = token_x @ self.lang_projection extra['grounding_tokens'] = token_x.permute(1,0,2) #[len,bz,C] non_zero_query_mask = tokens['attention_mask'] lang_feat_pool = agg_lang_feat(token_x, non_zero_query_mask, pool_type="average").unsqueeze(1) # (bs, 1, 768) dist_loss = (lang_feat_pool*0).sum() extra['grounding_nonzero_mask'] = ~non_zero_query_mask.bool() # [bz,len] extra['grounding_class'] = lang_feat_pool.squeeze(1) #[bz,C # gather_all_classtoken = token_x.flatten(0,1)[tokenized['attention_mask'].flatten(0,1)>0] # gather_all_classtoken = gather_all_classtoken.unsqueeze(0).repeat(len(images),1,1) #[bs,L,C] # gather_all_classtoken_mask = torch.ones_like(gather_all_classtoken[:,:,0])>0 #[bs,L] # early_semantic = {"hidden":gather_all_classtoken.float(),"masks":gather_all_classtoken_mask} early_semantic = {"hidden":token_x.float(),"masks":tokens['attention_mask']>0} if isinstance(images,torch.Tensor): features = self.backbone(images) else: features = self.backbone(images.tensor) if 'spatial' in prompts: ## setp 1,2,3 key_images = [ images ] #bz*[1,3,H,W] key_promptmasks = [m.unsqueeze(0) for m in prompts['spatial']] #bz*[1,1,H,W] prompt_mode = visual_prompt_type ref_feats, ref_masks = self.get_template(key_images, key_promptmasks, prompt_mode) early_fusion = {"hidden":ref_feats,"masks":ref_masks} if early_semantic is None: early_semantic = early_fusion else: early_semantic["hidden"] = torch.cat([early_semantic["hidden"],early_fusion["hidden"]],dim=1) early_semantic["masks"] = torch.cat([early_semantic["masks"],early_fusion["masks"]],dim=1) # bz = len(images)//2 mask_features, _, multi_scale_features, zero_loss = self.pixel_decoder.forward_features(features, masks=None, early_fusion = early_semantic) if 'spatial' in prompts: pos_masks = prompts['spatial'] # neg_masks = [~p for p in prompts['spatial']] neg_masks = [p&False for p in prompts['spatial']] extra.update({'spatial_query_pos_mask': pos_masks, 'spatial_query_neg_mask': neg_masks}) _,h,w = extra['spatial_query_pos_mask'][0].shape divisor = torch.tensor([h,w], device=mask_features.device)[None,] # Get mean pos spatial query non_zero_pos_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_pos_mask']] non_zero_pos_point = nn.utils.rnn.pad_sequence(non_zero_pos_point, padding_value=-1).permute(1,0,2) non_zero_pos_mask = (non_zero_pos_point.sum(dim=-1) < 0) spatial_query_pos = point_sample(mask_features, non_zero_pos_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True) #[(N, C, P) spatial_query_pos = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_pos.transpose(1,2), ~non_zero_pos_mask)]).transpose(0,1).nan_to_num() # [1,bz,C] # Get mean neg spatial query non_zero_neg_point = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[-1]).t() for m in extra['spatial_query_neg_mask']] non_zero_neg_point = nn.utils.rnn.pad_sequence(non_zero_neg_point, 
padding_value=-1).permute(1,0,2) non_zero_neg_mask = (non_zero_neg_point.sum(dim=-1) < 0) spatial_query_neg = point_sample(mask_features, non_zero_neg_point.flip(dims=(2,)).type(mask_features.dtype), align_corners=True) spatial_query_neg = torch.stack([x[m].mean(dim=0, keepdim=True) for x, m in zip(spatial_query_neg.transpose(1,2), ~non_zero_neg_mask)]).transpose(0,1).nan_to_num() # Get layerwise spatial query src_spatial_queries = [] src_spatial_maskings = [] for i in range(len(multi_scale_features)): bs,dc,h,w = multi_scale_features[i].shape # src_mask_features = multi_scale_features[i].view(h,w,bs,dc) src_mask_features = multi_scale_features[i].permute(2,3,0,1) src_mask_features = src_mask_features @ self.mask_sptial_embed[i] non_zero_query_point_pos = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_pos_mask']] non_zero_query_point_neg = [rand_sample((m.nonzero()[:,1:]/divisor).t(), self.max_spatial_len[i]).t() for m in extra['spatial_query_neg_mask']] non_zero_query_point = [torch.cat([x,y], dim=0) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)] pos_neg_indicator = [torch.cat([torch.ones(x.shape[0], device=x.device), -torch.ones(y.shape[0], device=y.device)]) for x,y in zip(non_zero_query_point_pos, non_zero_query_point_neg)] pos_neg_indicator = nn.utils.rnn.pad_sequence(pos_neg_indicator, padding_value=0) non_zero_query_point = nn.utils.rnn.pad_sequence(non_zero_query_point, padding_value=-1).permute(1,0,2) non_zero_query_mask = (non_zero_query_point.sum(dim=-1) < 0) non_zero_query_point[non_zero_query_mask] = 0 spatial_tokens = point_sample(src_mask_features.permute(2,3,0,1), non_zero_query_point.flip(dims=(2,)).type(src_mask_features.dtype), align_corners=True).permute(2,0,1) spatial_tokens[pos_neg_indicator==1] += self.pn_indicator.weight[0:1] spatial_tokens[pos_neg_indicator==-1] += self.pn_indicator.weight[1:2] src_spatial_queries += [spatial_tokens] src_spatial_maskings += [non_zero_query_mask] extra['visual_prompt_tokens'] = src_spatial_queries #[len,bz,C] extra['visual_prompt_nonzero_mask'] = src_spatial_maskings # [bz,len] outputs = self.predictor(multi_scale_features, mask_features, extra=extra, task=task, masks=None, targets=targets) return outputs def get_template(self, imgs, pad_masks, prompt_mode='scribble'): """img: (N, 3, H, W), mask: (N, 1, H, W), bbox: (1, 4)""" """get 4-channel template""" croped_img_with_mask = [] for image_i, mask_i in zip( imgs, pad_masks): if prompt_mode in ['scribble','point']: image_with_mask = image_i + mask_i.to(image_i) else: image_with_mask = image_i # image_with_mask = torch.cat([image_i,mask_i.to(image_i)],dim=1) #[1,3,H,W] box_i = masks_to_boxes(mask_i[0]) #[xyxy] box_i[:, 2:] = box_i[:, 2:] - box_i[:, :2] #xywh x, y, w, h = box_i[0].long().tolist() self.search_area_factor=2 crop_sz = math.ceil(math.sqrt(w * h) * self.search_area_factor) x1 = max(0,round(x + 0.5 * w - crop_sz * 0.5)) x2 = x1 + crop_sz y1 = max(0,round(y + 0.5 * h - crop_sz * 0.5)) y2 = y1 + crop_sz im_crop = image_with_mask[:, :, y1:y2, x1:x2] # resize if im_crop.shape[-1] ==0 or im_crop.shape[-2] ==0 : im_crop = image_with_mask im_crop = F.interpolate(im_crop, (256,256), mode='bilinear', align_corners=False) croped_img_with_mask.append(im_crop) croped_img_with_mask = torch.cat(croped_img_with_mask,dim=0) #[bz,3,256,256] with torch.no_grad(): ref_srcs = self.backbone(croped_img_with_mask.contiguous()) ref_srcs = [v for k,v in ref_srcs.items()] ref_feats = self.sot_fuser(ref_srcs[1:]).float() 
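        # The fused template features are flattened into a (bs, L, C) token
        # sequence below so they can be consumed like language tokens by the
        # early-fusion layers.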
#[bz,256,32,32] ref_feats = ref_feats.flatten(-2).permute(0, 2, 1) # (bs, L, C) ref_masks = torch.ones_like(ref_feats[:,:,0])>0 #[bs,L] return ref_feats, ref_masks ================================================ FILE: thirdparty/GLEE/glee/models/pixel_decoder/__init__.py ================================================ # Copyright (c) IDEA, Inc. and its affiliates. ================================================ FILE: thirdparty/GLEE/glee/models/pixel_decoder/early_fusion.py ================================================ import torch import torch.nn.functional as F from torch import nn from timm.models.layers import DropPath class VLFuse(torch.nn.Module): """ Early Fusion Module """ def __init__(self, ): super(VLFuse, self).__init__() self.init_configs() # early fusion module # bi-direction (text->image, image->text) self.b_attn = BiAttentionBlockForCheckpoint(v_dim=self.img_dim, # 256 l_dim=self.lang_dim, # 768 embed_dim=self.embed_dim, # 2048 num_heads=self.n_head, # 8 dropout=0.1, drop_path=.0, init_values=1.0 / 6, ) def init_configs(self, ): # common params self.img_dim = 256 self.max_query_len = 256 self.n_layers =1 # mha params self.n_head = 8 self.embed_dim = 2048 # 2048 by default self.lang_dim = 256 def forward(self, x, task=None): visual_features = x["visual"] language_dict_features = x["lang"] fused_visual_features, language_features = self.b_attn( visual_features, language_dict_features['hidden'], language_dict_features['masks'], task) language_dict_features['hidden'] = language_features fused_language_dict_features = language_dict_features features_dict = {"visual": fused_visual_features, "lang": fused_language_dict_features} return features_dict class BiMultiHeadAttention(nn.Module): def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1): super(BiMultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads self.v_dim = v_dim self.l_dim = l_dim assert ( self.head_dim * self.num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
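        # Bidirectional cross-attention: vision queries attend to language keys
        # and vice versa, hence the separate value/output projections for each
        # modality below.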
self.scale = self.head_dim ** (-0.5) self.dropout = dropout self.v_proj = nn.Linear(self.v_dim, self.embed_dim) self.l_proj = nn.Linear(self.l_dim, self.embed_dim) self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) self.stable_softmax_2d = False self.clamp_min_for_underflow = True self.clamp_max_for_overflow = True self._reset_parameters() def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def _reset_parameters(self): nn.init.xavier_uniform_(self.v_proj.weight) self.v_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.l_proj.weight) self.l_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.values_v_proj.weight) self.values_v_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.values_l_proj.weight) self.values_l_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.out_v_proj.weight) self.out_v_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.out_l_proj.weight) self.out_l_proj.bias.data.fill_(0) def forward(self, v, l, attention_mask_l=None): bsz, tgt_len, embed_dim = v.size() query_states = self.v_proj(v) * self.scale key_states = self._shape(self.l_proj(l), -1, bsz) value_v_states = self._shape(self.values_v_proj(v), -1, bsz) value_l_states = self._shape(self.values_l_proj(l), -1, bsz) proj_shape = (bsz * self.num_heads, -1, self.head_dim) # (bs * 8, -1, embed_dim//8) query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) # (bs * 8, seq_len_img, embed_dim//8) key_states = key_states.view(*proj_shape) # (bs * 8, seq_len_text, embed_dim//8) value_v_states = value_v_states.view(*proj_shape) value_l_states = value_l_states.view(*proj_shape) src_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # (bs * 8, seq_len_img, seq_len_text) if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" ) # attn_weights_l = nn.functional.softmax(attn_weights.transpose(1, 2), dim=-1) if self.stable_softmax_2d: attn_weights = attn_weights - attn_weights.max() if self.clamp_min_for_underflow: attn_weights = torch.clamp(attn_weights, min=-50000) # Do not increase -50000, data type half has quite limited range if self.clamp_max_for_overflow: attn_weights = torch.clamp(attn_weights, max=50000) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose(1, 2) attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[ 0]) if self.clamp_min_for_underflow: attn_weights_l = torch.clamp(attn_weights_l, min=-50000) # Do not increase -50000, data type half has quite limited range if self.clamp_max_for_overflow: attn_weights_l = torch.clamp(attn_weights_l, max=50000) # Do not increase 50000, data type half has quite limited range attn_weights_l = attn_weights_l.softmax(dim=-1) # assert attention_mask_l.dtype == torch.int64 if attention_mask_l is not None: assert (attention_mask_l.dim() == 2) # (bs, seq_len) attention_mask = attention_mask_l.unsqueeze(1).unsqueeze(1) # (bs, 1, 1, seq_len) attention_mask = attention_mask.expand(bsz, 1, tgt_len, src_len) attention_mask = attention_mask.masked_fill(attention_mask == 0, -9e15) if attention_mask.size() != (bsz, 1, tgt_len, 
src_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}" ) attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights_v = nn.functional.softmax(attn_weights, dim=-1) attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) attn_output_v = torch.bmm(attn_probs_v, value_l_states) attn_output_l = torch.bmm(attn_probs_l, value_v_states) if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}" ) if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim): raise ValueError( f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}" ) attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output_v = attn_output_v.transpose(1, 2) attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim) attn_output_l = attn_output_l.transpose(1, 2) attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim) attn_output_v = self.out_v_proj(attn_output_v) attn_output_l = self.out_l_proj(attn_output_l) return attn_output_v, attn_output_l class BiAttentionBlockForCheckpoint(nn.Module): def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, drop_path=.0, init_values=1e-4, ): """ Inputs: embed_dim - Dimensionality of input and attention feature vectors num_heads - Number of heads to use in the Multi-Head Attention block dropout - Amount of dropout to apply in the feed-forward network """ super(BiAttentionBlockForCheckpoint, self).__init__() # pre layer norm self.layer_norm_v = nn.LayerNorm(v_dim) self.layer_norm_l = nn.LayerNorm(l_dim) self.attn = BiMultiHeadAttention(v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, ) # add layer scale for training stability self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True) self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True) def forward(self, v, l, attention_mask_l=None, task=None): # v: visual features, (bs, sigma(HW), 256) # l: language features, (bs, seq_len, 768) v = self.layer_norm_v(v) l = self.layer_norm_l(l) delta_v, delta_l = self.attn(v, l, attention_mask_l=attention_mask_l) # v, l = v + delta_v, l + delta_l v = v + self.drop_path(self.gamma_v * delta_v) l = l + self.drop_path(self.gamma_l * delta_l) return v, l ================================================ FILE: thirdparty/GLEE/glee/models/pixel_decoder/maskdino_encoder.py ================================================ # ------------------------------------------------------------------------ # DINO # Copyright (c) 2022 IDEA. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ # Modified by Feng Li and Hao Zhang. 
import logging import numpy as np from typing import Callable, Dict, List, Optional, Tuple, Union import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ from torch.cuda.amp import autocast from detectron2.config import configurable from detectron2.layers import Conv2d, ShapeSpec, get_norm from detectron2.modeling import SEM_SEG_HEADS_REGISTRY from .position_encoding import PositionEmbeddingSine from ...utils.utils import _get_clones, _get_clones_advanced, _get_activation_fn from .ops.modules import MSDeformAttn from .early_fusion import VLFuse def build_pixel_decoder(cfg, input_shape): """ Build a pixel decoder from `cfg.MODEL.MaskDINO.PIXEL_DECODER_NAME`. """ name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) forward_features = getattr(model, "forward_features", None) if not callable(forward_features): raise ValueError( "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. " f"Please implement forward_features for {name} to only return mask features." ) return model # MSDeformAttn Transformer encoder in deformable detr class MSDeformAttnTransformerEncoderOnly(nn.Module): def __init__(self, d_model=256, nhead=8, num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", num_feature_levels=4, enc_n_points=4,): super().__init__() self.d_model = d_model self.nhead = nhead vl_fusion_layer = VLFuse() encoder_layer = MSDeformAttnTransformerEncoderLayer(d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points) self.encoder = MSDeformAttnTransformerEncoder(vl_fusion_layer, encoder_layer, num_encoder_layers) self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MSDeformAttn): m._reset_parameters() normal_(self.level_embed) def get_valid_ratio(self, mask): _, H, W = mask.shape valid_H = torch.sum(~mask[:, :, 0], 1) valid_W = torch.sum(~mask[:, 0, :], 1) valid_ratio_h = valid_H.float() / H valid_ratio_w = valid_W.float() / W valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def forward(self, srcs, masks, pos_embeds, early_fusion=None): enable_mask=0 if masks is not None: for src in srcs: if src.size(2)%32 or src.size(3)%32: enable_mask = 1 if enable_mask==0: masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs] # prepare input for encoder src_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): bs, c, h, w = src.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) src = src.flatten(2).transpose(1, 2) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose(1, 2) lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) lvl_pos_embed_flatten.append(lvl_pos_embed) src_flatten.append(src) mask_flatten.append(mask) src_flatten = torch.cat(src_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) 
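        # level_start_index[i] marks the offset of feature level i inside the
        # flattened token sequence: [0, H0*W0, H0*W0 + H1*W1, ...].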
valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) # encoder memory, zero_loss = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten, early_fusion) return memory, spatial_shapes, level_start_index, zero_loss class MSDeformAttnTransformerEncoderLayer(nn.Module): def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4): super().__init__() # self attention self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) # ffn self.linear1 = nn.Linear(d_model, d_ffn) self.activation = _get_activation_fn(activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(d_ffn, d_model) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm(d_model) @staticmethod def with_pos_embed(tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): # self attention src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class MSDeformAttnTransformerEncoder(nn.Module): def __init__(self, vl_fusion_layer, encoder_layer, num_layers): super().__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers self.vl_layers = _get_clones_advanced(vl_fusion_layer, num_layers, 1) @staticmethod def get_reference_points(spatial_shapes, valid_ratios, device): reference_points_list = [] for lvl, (H_, W_) in enumerate(spatial_shapes): ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) ref = torch.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = torch.cat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None, early_fusion=None): if early_fusion: output = {"visual": src, "lang": early_fusion} else: output = src reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) for _, (layer,vl_layer) in enumerate(zip(self.layers, self.vl_layers)): if early_fusion: output = vl_layer(output) output["visual"] = layer(output["visual"], pos, reference_points, spatial_shapes, level_start_index, padding_mask) else: output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) if early_fusion: return output["visual"] , (output['lang']['hidden']*0).sum() else: return output, None @SEM_SEG_HEADS_REGISTRY.register() class MaskDINOEncoder(nn.Module): """ This is the multi-scale encoder in detection models, also named as pixel decoder in segmentation models. 
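    It runs a multi-scale deformable-attention encoder (with optional early
    vision-language fusion) over the backbone features and produces the
    1/4-resolution mask features consumed by the decoder.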
""" @configurable def __init__( self, input_shape: Dict[str, ShapeSpec], *, transformer_dropout: float, transformer_nheads: int, transformer_dim_feedforward: int, transformer_enc_layers: int, conv_dim: int, mask_dim: int, norm: Optional[Union[str, Callable]] = None, # deformable transformer encoder args transformer_in_features: List[str], common_stride: int, num_feature_levels: int, total_num_feature_levels: int, feature_order: str, ViTBackbone: bool, ): """ NOTE: this interface is experimental. Args: input_shape: shapes (channels and stride) of the input features transformer_dropout: dropout probability in transformer transformer_nheads: number of heads in transformer transformer_dim_feedforward: dimension of feedforward network transformer_enc_layers: number of transformer encoder layers conv_dims: number of output channels for the intermediate conv layers. mask_dim: number of output channels for the final conv layer. norm (str or callable): normalization for all conv layers num_feature_levels: feature scales used total_num_feature_levels: total feautre scales used (include the downsampled features) feature_order: 'low2high' or 'high2low', i.e., 'low2high' means low-resolution features are put in the first. """ super().__init__() transformer_input_shape = { k: v for k, v in input_shape.items() if k in transformer_in_features } # this is the input shape of pixel decoder input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" self.feature_strides = [v.stride for k, v in input_shape] self.feature_channels = [v.channels for k, v in input_shape] self.feature_order = feature_order if feature_order == "low2high": transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: -x[1].stride) else: transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride) self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5" transformer_in_channels = [v.channels for k, v in transformer_input_shape] self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape] # to decide extra FPN layers self.maskdino_num_feature_levels = num_feature_levels # always use 3 scales self.total_num_feature_levels = total_num_feature_levels self.common_stride = common_stride self.transformer_num_feature_levels = len(self.transformer_in_features) self.low_resolution_index = transformer_in_channels.index(max(transformer_in_channels)) self.high_resolution_index = 0 if self.feature_order == 'low2high' else -1 self.isViTBackbone = ViTBackbone if not ViTBackbone: if self.transformer_num_feature_levels > 1: input_proj_list = [] for in_channels in transformer_in_channels[::-1]: input_proj_list.append(nn.Sequential( nn.Conv2d(in_channels, conv_dim, kernel_size=1), nn.GroupNorm(32, conv_dim), )) # input projectino for downsample in_channels = max(transformer_in_channels) for _ in range(self.total_num_feature_levels - self.transformer_num_feature_levels): # exclude the res2 input_proj_list.append(nn.Sequential( nn.Conv2d(in_channels, conv_dim, kernel_size=3, stride=2, padding=1), nn.GroupNorm(32, conv_dim), )) in_channels = conv_dim self.input_proj = nn.ModuleList(input_proj_list) else: self.input_proj = nn.ModuleList([ nn.Sequential( nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1), nn.GroupNorm(32, conv_dim), )]) for proj in self.input_proj: nn.init.xavier_uniform_(proj[0].weight, gain=1) nn.init.constant_(proj[0].bias, 
0) self.transformer = MSDeformAttnTransformerEncoderOnly( d_model=conv_dim, dropout=transformer_dropout, nhead=transformer_nheads, dim_feedforward=transformer_dim_feedforward, num_encoder_layers=transformer_enc_layers, num_feature_levels=self.total_num_feature_levels, ) N_steps = conv_dim // 2 self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) self.mask_dim = mask_dim # use 1x1 conv instead self.mask_features = Conv2d( conv_dim, mask_dim, kernel_size=1, stride=1, padding=0, ) weight_init.c2_xavier_fill(self.mask_features) # extra fpn levels stride = min(self.transformer_feature_strides) self.num_fpn_levels = max(int(np.log2(stride) - np.log2(self.common_stride)), 1) lateral_convs = [] output_convs = [] use_bias = norm == "" for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]): lateral_norm = get_norm(norm, conv_dim) output_norm = get_norm(norm, conv_dim) lateral_conv = Conv2d( in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm ) output_conv = Conv2d( conv_dim, conv_dim, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=output_norm, activation=F.relu, ) weight_init.c2_xavier_fill(lateral_conv) weight_init.c2_xavier_fill(output_conv) self.add_module("adapter_{}".format(idx + 1), lateral_conv) self.add_module("layer_{}".format(idx + 1), output_conv) lateral_convs.append(lateral_conv) output_convs.append(output_conv) # Place convs into top-down order (from low to high resolution) # to make the top-down computation in forward clearer. self.lateral_convs = lateral_convs[::-1] self.output_convs = output_convs[::-1] @classmethod def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): ret = {} ret["input_shape"] = { k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES } ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM ret["transformer_dropout"] = cfg.MODEL.MaskDINO.DROPOUT ret["transformer_nheads"] = cfg.MODEL.MaskDINO.NHEADS ret["transformer_dim_feedforward"] = cfg.MODEL.SEM_SEG_HEAD.DIM_FEEDFORWARD # deformable transformer encoder ret[ "transformer_enc_layers" ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config ret["transformer_in_features"] = cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES # ['res3', 'res4', 'res5'] ret["common_stride"] = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE ret["total_num_feature_levels"] = cfg.MODEL.SEM_SEG_HEAD.TOTAL_NUM_FEATURE_LEVELS ret["num_feature_levels"] = cfg.MODEL.SEM_SEG_HEAD.NUM_FEATURE_LEVELS ret["feature_order"] = cfg.MODEL.SEM_SEG_HEAD.FEATURE_ORDER ret["ViTBackbone"] = cfg.MODEL.BACKBONE.NAME in ['D2_EVA02', 'D2_EVA01' , 'D2_ViT'] return ret @autocast(enabled=False) def forward_features(self, features, masks, early_fusion=None): """ :param features: multi-scale features from the backbone :param masks: image mask :return: enhanced multi-scale features and mask feature (1/4 resolution) for the decoder to produce binary mask """ # backbone features srcs = [] pos = [] # additional downsampled features srcsl = [] posl = [] if self.isViTBackbone: for idx, f in enumerate(self.transformer_in_features[::-1]): x = features[f].float() # deformable detr does not support half precision srcs.append(x) pos.append(self.pe_layer(x)) if self.feature_order != 'low2high': srcs = srcs[::-1] pos = pos[::-1] else: if self.total_num_feature_levels > self.transformer_num_feature_levels: smallest_feat = 
features[self.transformer_in_features[self.low_resolution_index]].float() _len_srcs = self.transformer_num_feature_levels for l in range(_len_srcs, self.total_num_feature_levels): if l == _len_srcs: src = self.input_proj[l](smallest_feat) else: src = self.input_proj[l](srcsl[-1]) srcsl.append(src) posl.append(self.pe_layer(src)) srcsl = srcsl[::-1] # Reverse feature maps for idx, f in enumerate(self.transformer_in_features[::-1]): x = features[f].float() # deformable detr does not support half precision srcs.append(self.input_proj[idx](x)) pos.append(self.pe_layer(x)) srcs.extend(srcsl) if self.feature_order == 'low2high' else srcsl.extend(srcs) pos.extend(posl) if self.feature_order == 'low2high' else posl.extend(pos) if self.feature_order != 'low2high': srcs = srcsl pos = posl y, spatial_shapes, level_start_index, zero_loss = self.transformer(srcs, masks, pos, early_fusion) bs = y.shape[0] split_size_or_sections = [None] * self.total_num_feature_levels for i in range(self.total_num_feature_levels): if i < self.total_num_feature_levels - 1: split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i] else: split_size_or_sections[i] = y.shape[1] - level_start_index[i] y = torch.split(y, split_size_or_sections, dim=1) out = [] multi_scale_features = [] num_cur_levels = 0 for i, z in enumerate(y): out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1])) # append `out` with extra FPN levels # Reverse feature maps into top-down order (from low to high resolution) for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): x = features[f].float() lateral_conv = self.lateral_convs[idx] output_conv = self.output_convs[idx] cur_fpn = lateral_conv(x) # Following FPN implementation, we use nearest upsampling here y = cur_fpn + F.interpolate(out[self.high_resolution_index], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False) y = output_conv(y) out.append(y) for o in out: if num_cur_levels < self.total_num_feature_levels: multi_scale_features.append(o) num_cur_levels += 1 return self.mask_features(out[-1]), out[0], multi_scale_features, zero_loss ================================================ FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/functions/__init__.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from .ms_deform_attn_func import MSDeformAttnFunction ================================================ FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/functions/ms_deform_attn_func.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. 
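The tail of `forward_features` above relies on `level_start_index` to undo the token flattening done by the deformable encoder. A minimal sketch of that split-and-reshape step in isolation, with invented toy sizes (illustrative only, not code from the repo):

```
import torch

# Two toy levels: 8x8 and 4x4, batch 2, 256 channels.
spatial_shapes = torch.as_tensor([[8, 8], [4, 4]])
y = torch.rand(2, 64 + 16, 256)   # flattened encoder output: (bs, sum(H*W), C)

split_sizes = [64, 16]            # derived from level_start_index, as in forward_features
maps = [
    z.transpose(1, 2).view(2, 256, int(h), int(w))
    for z, (h, w) in zip(torch.split(y, split_sizes, dim=1), spatial_shapes)
]
print([m.shape for m in maps])    # [(2, 256, 8, 8), (2, 256, 4, 4)]
```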
================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/functions/__init__.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR

from .ms_deform_attn_func import MSDeformAttnFunction

================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/functions/ms_deform_attn_func.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable

try:
    import MultiScaleDeformableAttention as MSDA
except ModuleNotFoundError:
    info_string = (
        "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
        "\t`cd maskdino/modeling/pixel_decoder/ops`\n"
        "\t`sh make.sh`\n"
    )
    # The error is deliberately not re-raised, so the pure-PyTorch fallback
    # (ms_deform_attn_core_pytorch below) can still be used without the CUDA op.
    # raise ModuleNotFoundError(info_string)


class MSDeformAttnFunction(Function):
    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        ctx.im2col_step = im2col_step
        output = MSDA.ms_deform_attn_forward(
            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
        grad_value, grad_sampling_loc, grad_attn_weight = \
            MSDA.ms_deform_attn_backward(
                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)

        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None


def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    # for debug and test only,
    # need to use cuda version instead
    N_, S_, M_, D_ = value.shape
    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_locations - 1
    sampling_value_list = []
    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
                                          mode='bilinear', padding_mode='zeros', align_corners=False)
        sampling_value_list.append(sampling_value_l_)
    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
    return output.transpose(1, 2).contiguous()
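`ms_deform_attn_core_pytorch` is the pure-PyTorch reference used whenever the CUDA op is absent; its expected tensor layout follows the inline comments above. A small shape sanity check, with all sizes invented for illustration:

```
import torch
# `ms_deform_attn_core_pytorch` is the function defined in the file above.

N, M, D, Lq, P = 2, 8, 32, 10, 4                    # batch, heads, head dim, queries, points
shapes = torch.as_tensor([[8, 8], [4, 4]], dtype=torch.long)
L = shapes.size(0)                                  # number of feature levels
S = int((shapes[:, 0] * shapes[:, 1]).sum())        # 64 + 16 = 80 flattened positions

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)  # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P)
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)                                    # torch.Size([2, 10, 256]) == (N, Lq, M*D)
```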
================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/make.sh
================================================
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR

python setup.py build install --user

================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/modules/__init__.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR

from .ms_deform_attn import MSDeformAttn
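`make.sh` is a one-line wrapper around `setup.py`. Whether the resulting `MultiScaleDeformableAttention` module is importable decides which path the attention module takes at runtime. A quick probe, mirroring the try/except in `ms_deform_attn_func.py` (the print messages here are mine):

```
# Probe for the compiled CUDA op; its absence is not fatal because the modules
# fall back to the pure-PyTorch reference implementation.
try:
    import MultiScaleDeformableAttention as MSDA  # built by make.sh / setup.py
    print("compiled MS-deform-attn op found")
except ImportError:
    print("compiled op missing; MSDeformAttn will use ms_deform_attn_core_pytorch")
```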
================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/modules/ms_deform_attn.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import warnings
import math

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_

from ..functions import MSDeformAttnFunction
from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch


def _is_power_of_2(n):
    if (not isinstance(n, int)) or (n < 0):
        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
    return (n & (n - 1) == 0) and n != 0


class MSDeformAttn(nn.Module):
    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """
        Multi-Scale Deformable Attention Module
        :param d_model      hidden dimension
        :param n_levels     number of feature levels
        :param n_heads      number of attention heads
        :param n_points     number of sampling points per attention head per feature level
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # setting _d_per_head to a power of 2 is more efficient in the CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        self.im2col_step = 128

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        constant_(self.sampling_offsets.weight.data, 0.)
        # initialize the sampling-offset bias so that each head starts by looking in a
        # distinct direction around the reference point, at increasing radii per point
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)
    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
        """
        :param query                       (N, Length_{query}, C)
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements

        :return output                     (N, Length_{query}, C)
        """
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2
        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but got {} instead.'.format(reference_points.shape[-1]))
        try:
            output = MSDeformAttnFunction.apply(
                value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
        except:
            # fall back to the pure-PyTorch implementation when the compiled CUDA op
            # is unavailable (e.g. CPU-only environments)
            output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
        # # For FLOPs calculation only
        # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
        output = self.output_proj(output)
        return output
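The `forward` docstring above pins down the expected tensor layout. The following sketch wires up a tiny two-level input end to end; all sizes are invented, and on a machine without the compiled CUDA op it exercises the pure-PyTorch fallback path:

```
import torch
# `MSDeformAttn` is the module defined in the file above.

m = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)

shapes = torch.as_tensor([[16, 16], [8, 8]], dtype=torch.long)
level_start_index = torch.cat((shapes.new_zeros(1),
                               (shapes[:, 0] * shapes[:, 1]).cumsum(0)[:-1]))  # [0, 256]

N, Lq = 2, 100
src = torch.rand(N, int((shapes[:, 0] * shapes[:, 1]).sum()), 256)  # flattened levels
query = torch.rand(N, Lq, 256)
ref = torch.rand(N, Lq, 2)[:, :, None, :].expand(-1, -1, 2, -1)     # (N, Lq, n_levels, 2) in [0, 1]

out = m(query, ref, src, shapes, level_start_index)
print(out.shape)  # torch.Size([2, 100, 256])
```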
================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/setup.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR

import os
import glob

import torch

from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension

from setuptools import find_packages
from setuptools import setup

requirements = ["torch", "torchvision"]


def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

    sources = main_file + source_cpu
    extension = CppExtension
    extra_compile_args = {"cxx": []}
    define_macros = []

    # Build the CUDA extension when FORCE_CUDA is set or torch reports an available
    # CUDA device, as long as CUDA_HOME points at a CUDA toolkit.
    if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
        extension = CUDAExtension
        sources += source_cuda
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
    else:
        if CUDA_HOME is None:
            raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
        else:
            raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')

    sources = [os.path.join(extensions_dir, s) for s in sources]
    include_dirs = [extensions_dir]
    ext_modules = [
        extension(
            "MultiScaleDeformableAttention",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]
    return ext_modules


setup(
    name="MultiScaleDeformableAttention",
    version="1.0",
    author="Weijie Su",
    url="https://github.com/fundamentalvision/Deformable-DETR",
    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
    packages=find_packages(exclude=("configs", "tests",)),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
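Upstream Deformable-DETR validates the built op with `torch.autograd.gradcheck` (see `ops/test.py` in this directory). A minimal double-precision sketch of the same idea, run here against the pure-PyTorch reference so it works even before `setup.py` has been run; shapes are invented, and gradcheck can occasionally be sensitive near bilinear-interpolation kinks:

```
import torch
from torch.autograd import gradcheck
# `ms_deform_attn_core_pytorch` is defined in ops/functions/ms_deform_attn_func.py.

shapes = torch.as_tensor([[4, 4], [2, 2]], dtype=torch.long)
N, M, D, Lq, P, L = 1, 2, 4, 3, 2, 2
S = int((shapes[:, 0] * shapes[:, 1]).sum())  # 16 + 4 = 20

value = torch.rand(N, S, M, D, dtype=torch.double, requires_grad=True)
loc = torch.rand(N, Lq, M, L, P, 2, dtype=torch.double, requires_grad=True)
w = torch.rand(N, Lq, M, L, P, dtype=torch.double)
w = (w / w.sum(-1, keepdim=True).sum(-2, keepdim=True)).requires_grad_(True)

ok = gradcheck(lambda v, l, a: ms_deform_attn_core_pytorch(v, shapes, l, a), (value, loc, w))
print("gradcheck passed:", ok)
```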
================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/

#include <vector>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>


at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    AT_ERROR("Not implemented on the CPU");
}

std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    AT_ERROR("Not implemented on the CPU");
}

================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/

#pragma once
#include <torch/extension.h>

at::Tensor
ms_deform_attn_cpu_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step);

std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step);

================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/

#include <vector>
#include "cuda/ms_deform_im2col_cuda.cuh"

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>


at::Tensor ms_deform_attn_cuda_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");

    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);

    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());

    const int batch_n = im2col_step_;
    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
    auto per_value_size = spatial_size * num_heads * channels;
    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
    for (int n = 0; n < batch/im2col_step_; ++n)
    {
        auto columns = output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
                spatial_shapes.data<int64_t>(),
                level_start_index.data<int64_t>(),
                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                columns.data<scalar_t>());
        }));
    }

    output = output.view({batch, num_query, num_heads*channels});

    return output;
}


std::vector<at::Tensor> ms_deform_attn_cuda_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");

    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);

    auto grad_value = at::zeros_like(value);
    auto grad_sampling_loc = at::zeros_like(sampling_loc);
    auto grad_attn_weight = at::zeros_like(attn_weight);

    const int batch_n = im2col_step_;
    auto per_value_size = spatial_size * num_heads * channels;
    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});

    for (int n = 0; n < batch/im2col_step_; ++n)
    {
        auto grad_output_g = grad_output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
                grad_output_g.data<scalar_t>(),
                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
                spatial_shapes.data<int64_t>(),
                level_start_index.data<int64_t>(),
                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
                grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
                grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
        }));
    }

    return {
        grad_value, grad_sampling_loc, grad_attn_weight
    };
}

================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #pragma once #include at::Tensor ms_deform_attn_cuda_forward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const int im2col_step); std::vector ms_deform_attn_cuda_backward( const at::Tensor &value, const at::Tensor &spatial_shapes, const at::Tensor &level_start_index, const at::Tensor &sampling_loc, const at::Tensor &attn_weight, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh ================================================ /*! ************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************** * Modified from DCN (https://github.com/msracver/Deformable-ConvNets) * Copyright (c) 2018 Microsoft ************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include #include #include #include #include #include #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } template __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t* 
&grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value+ptr1, w1*top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value+ptr2, w2*top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value+ptr4, w4*top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_attn_weight = top_grad * val; *grad_sampling_loc = width * grad_w_weight * top_grad_value; *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; } template __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t* &grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value+ptr1, w1*top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; 
atomicAdd(grad_value+ptr2, w2*top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value+ptr3, w3*top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value+ptr4, w4*top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } template __global__ void ms_deformable_im2col_gpu_kernel(const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *data_col) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; scalar_t *data_col_ptr = data_col + index; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; scalar_t col = 0; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_col_ptr = col; } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int 
sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = 
data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockSize/2; s>0; s>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int 
grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; int sid=2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[tid]; sid += 2; } *grad_sampling_loc = _grad_w; *(grad_sampling_loc + 1) = _grad_h; *grad_attn_weight = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = 
data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc = cache_grad_sampling_loc[0]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { extern __shared__ int _s[]; scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int 
data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col=0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col=0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight+threadIdx.x)=0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); } __syncthreads(); for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight += grad_weight_stride; grad_sampling_loc += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; const int q_col = _temp % num_query; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_sampling_loc += grad_sampling_ptr << 1; grad_attn_weight += grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * 

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear_gm(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            grad_sampling_loc, grad_attn_weight);
        }
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_attn_weight += grad_weight_stride;
        grad_sampling_loc += grad_loc_stride;
      }
    }
  }
}

template <typename scalar_t>
void ms_deformable_im2col_cuda(cudaStream_t stream,
    const scalar_t* data_value, const int64_t* data_spatial_shapes,
    const int64_t* data_level_start_index, const scalar_t* data_sampling_loc,
    const scalar_t* data_attn_weight, const int batch_size, const int spatial_size,
    const int num_heads, const int channels, const int num_levels,
    const int num_query, const int num_point, scalar_t* data_col)
{
  const int num_kernels = batch_size * num_query * num_heads * channels;
  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
  const int num_threads = CUDA_NUM_THREADS;
  ms_deformable_im2col_gpu_kernel<scalar_t>
      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
      num_kernels, data_value, data_spatial_shapes, data_level_start_index,
      data_sampling_loc, data_attn_weight, batch_size, spatial_size,
      num_heads, channels, num_levels, num_query, num_point, data_col);

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
  }
}
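
// Backward dispatcher notes (added for readability): the kernel variant is
// chosen from `channels`:
//   - power-of-two channels in [1, 32]     -> shm_blocksize_aware_reduce_v1
//   - power-of-two channels in [64, 1024]  -> shm_blocksize_aware_reduce_v2
//   - other channels < 64                  -> shm_reduce_v1
//   - other channels in [64, 1024]         -> shm_reduce_v2
//   - channels > 1024, multiple of 1024    -> shm_reduce_v2_multi_blocks
//   - channels > 1024 otherwise            -> gm (global-memory atomics)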

template <typename scalar_t>
void ms_deformable_col2im_cuda(cudaStream_t stream,
    const scalar_t* grad_col, const scalar_t* data_value,
    const int64_t * data_spatial_shapes, const int64_t * data_level_start_index,
    const scalar_t * data_sampling_loc, const scalar_t * data_attn_weight,
    const int batch_size, const int spatial_size, const int num_heads,
    const int channels, const int num_levels, const int num_query, const int num_point,
    scalar_t* grad_value, scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight)
{
  const int num_threads = (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
  const int num_kernels = batch_size * num_query * num_heads * channels;
  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
  if (channels > 1024)
  {
    if ((channels & 1023) == 0)
    {
      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
             num_threads*3*sizeof(scalar_t), stream>>>(
          num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
          data_sampling_loc, data_attn_weight, batch_size, spatial_size,
          num_heads, channels, num_levels, num_query, num_point,
          grad_value, grad_sampling_loc, grad_attn_weight);
    }
    else
    {
      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
          num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
          data_sampling_loc, data_attn_weight, batch_size, spatial_size,
          num_heads, channels, num_levels, num_query, num_point,
          grad_value, grad_sampling_loc, grad_attn_weight);
    }
  }
  else
  {
    switch(channels)
    {
      case 1:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 2:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 4:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 8:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 16:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 32:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 64:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 128:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 256:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 512:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      case 1024:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
            num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
            data_sampling_loc, data_attn_weight, batch_size, spatial_size,
            num_heads, channels, num_levels, num_query, num_point,
            grad_value, grad_sampling_loc, grad_attn_weight);
        break;
      default:
        if (channels < 64)
        {
          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                 num_threads*3*sizeof(scalar_t), stream>>>(
              num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
              data_sampling_loc, data_attn_weight, batch_size, spatial_size,
              num_heads, channels, num_levels, num_query, num_point,
              grad_value, grad_sampling_loc, grad_attn_weight);
        }
        else
        {
          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
              <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                 num_threads*3*sizeof(scalar_t), stream>>>(
              num_kernels, grad_col, data_value, data_spatial_shapes, data_level_start_index,
              data_sampling_loc, data_attn_weight, batch_size, spatial_size,
              num_heads, channels, num_levels, num_query, num_point,
              grad_value, grad_sampling_loc, grad_attn_weight);
        }
    }
  }
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
  }
}
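
For readers who want the semantics these kernels implement without compiling the extension, the forward pass can be written in a few lines of pure PyTorch. The sketch below follows the reference implementation that ships with this ops package (`ms_deform_attn_core_pytorch` in `ops/functions/ms_deform_attn_func.py`, exercised by `test.py` further down); the shape conventions match `test.py` (`N` batch, `S` flattened pixels over all levels, `M` heads, `D` head dim, `Lq` queries, `L` levels, `P` points). Treat it as an illustration of what the CUDA path computes, not a drop-in replacement.

```
# Minimal PyTorch reference for multiscale deformable attention sampling.
# Sketch only -- the repo's own reference lives in
# ops/functions/ms_deform_attn_func.py as ms_deform_attn_core_pytorch.
import torch
import torch.nn.functional as F

def ms_deform_attn_reference(value, spatial_shapes, sampling_locations, attention_weights):
    # value:              (N, S, M, D), S = sum of H*W over levels
    # spatial_shapes:     (L, 2) as (H, W) per level
    # sampling_locations: (N, Lq, M, L, P, 2), normalized to [0, 1]
    # attention_weights:  (N, Lq, M, L, P), normalized over L*P
    N, S, M, D = value.shape
    _, Lq, M, L, P, _ = sampling_locations.shape
    value_list = value.split([int(H) * int(W) for H, W in spatial_shapes], dim=1)
    sampling_grids = 2 * sampling_locations - 1  # grid_sample expects [-1, 1]
    sampled_per_level = []
    for lid, (H, W) in enumerate(spatial_shapes):
        # (N, H*W, M, D) -> (N*M, D, H, W)
        value_l = value_list[lid].flatten(2).transpose(1, 2).reshape(N * M, D, int(H), int(W))
        # (N, Lq, M, P, 2) -> (N*M, Lq, P, 2)
        grid_l = sampling_grids[:, :, :, lid].transpose(1, 2).flatten(0, 1)
        # bilinear sampling at the predicted offsets -> (N*M, D, Lq, P)
        sampled_per_level.append(
            F.grid_sample(value_l, grid_l, mode='bilinear',
                          padding_mode='zeros', align_corners=False))
    # (N, Lq, M, L, P) -> (N*M, 1, Lq, L*P)
    attention_weights = attention_weights.transpose(1, 2).reshape(N * M, 1, Lq, L * P)
    # weight the sampled values and sum over levels and points
    output = (torch.stack(sampled_per_level, dim=-2).flatten(-2) * attention_weights).sum(-1)
    return output.view(N, M * D, Lq).transpose(1, 2).contiguous()  # (N, Lq, M*D)
```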

================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/src/ms_deform_attn.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/

#pragma once

#include "cpu/ms_deform_attn_cpu.h"

#ifdef WITH_CUDA
#include "cuda/ms_deform_attn_cuda.h"
#endif

at::Tensor
ms_deform_attn_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
  if (value.type().is_cuda())
  {
#ifdef WITH_CUDA
    return ms_deform_attn_cuda_forward(
        value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}

std::vector<at::Tensor>
ms_deform_attn_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
  if (value.type().is_cuda())
  {
#ifdef WITH_CUDA
    return ms_deform_attn_cuda_backward(
        value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}

================================================
FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/src/vision.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ /*! * Copyright (c) Facebook, Inc. and its affiliates. * Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR */ #include "ms_deform_attn.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); } ================================================ FILE: thirdparty/GLEE/glee/models/pixel_decoder/ops/test.py ================================================ # ------------------------------------------------------------------------------------------------ # Deformable DETR # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ # Copyright (c) Facebook, Inc. and its affiliates. # Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR from __future__ import absolute_import from __future__ import print_function from __future__ import division import time import torch import torch.nn as nn from torch.autograd import gradcheck from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch N, M, D = 1, 2, 2 Lq, L, P = 2, 2, 2 shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) S = sum([(H*W).item() for H, W in shapes]) torch.manual_seed(3) @torch.no_grad() def check_forward_equal_with_pytorch_double(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_forward_equal_with_pytorch_float(): value = torch.rand(N, S, M, D).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() output_cuda = 
MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): value = torch.rand(N, S, M, channels).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 func = MSDeformAttnFunction.apply value.requires_grad = grad_value sampling_locations.requires_grad = grad_sampling_loc attention_weights.requires_grad = grad_attn_weight gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) print(f'* {gradok} check_gradient_numerical(D={channels})') if __name__ == '__main__': check_forward_equal_with_pytorch_double() check_forward_equal_with_pytorch_float() for channels in [30, 32, 64, 71, 1025, 2048, 3096]: check_gradient_numerical(channels, True, True, True) ================================================ FILE: thirdparty/GLEE/glee/models/pixel_decoder/position_encoding.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) 2022 IDEA. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. """ Various positional encodings for the transformer. """ import math import torch from torch import nn class PositionEmbeddingSine(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. 
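
    Example (a minimal sketch added here for illustration; for a 256-channel
    feature map, num_pos_feats is typically set to half the model dim so the
    encoding matches the input's channel count):

        pe = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
        x = torch.zeros(2, 256, 32, 32)   # (B, C, H, W) feature map
        pos = pe(x)                       # -> (2, 256, 32, 32)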
""" def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, x, mask=None): if mask is None: mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) not_mask = ~mask y_embed = not_mask.cumsum(1, dtype=torch.float32) x_embed = not_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) pos_x = x_embed[:, :, :, None] / dim_t pos_y = y_embed[:, :, :, None] / dim_t pos_x = torch.stack( (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos_y = torch.stack( (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) return pos def __repr__(self, _repr_indent=4): head = "Positional encoding " + self.__class__.__name__ body = [ "num_pos_feats: {}".format(self.num_pos_feats), "temperature: {}".format(self.temperature), "normalize: {}".format(self.normalize), "scale: {}".format(self.scale), ] # _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: thirdparty/GLEE/glee/models/transformer_decoder/__init__.py ================================================ # Copyright (c) IDEA, Inc. and its affiliates. from .maskdino_decoder import MaskDINODecoder ================================================ FILE: thirdparty/GLEE/glee/models/transformer_decoder/dino_decoder.py ================================================ # ------------------------------------------------------------------------ # DINO # Copyright (c) 2022 IDEA. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ # Modified from DINO https://github.com/IDEA-Research/DINO by Feng Li and Hao Zhang. 
# ------------------------------------------------------------------------ from typing import Optional, List, Union import torch from torch import nn, Tensor from torch.cuda.amp import autocast from ...utils.utils import MLP, _get_clones, _get_activation_fn, gen_sineembed_for_position, inverse_sigmoid from ..pixel_decoder.ops.modules import MSDeformAttn class TransformerDecoder(nn.Module): def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False, d_model=256, query_dim=4, modulate_hw_attn=True, num_feature_levels=1, deformable_decoder=True, decoder_query_perturber=None, dec_layer_number=None, # number of queries each layer in decoder rm_dec_query_scale=True, dec_layer_share=False, dec_layer_dropout_prob=None, cross_track_layer = False, n_levels = None, n_heads = None, n_points = None, ): super().__init__() if num_layers > 0: self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share) else: self.layers = [] self.num_layers = num_layers self.norm = norm self.return_intermediate = return_intermediate assert return_intermediate, "support return_intermediate only" self.query_dim = query_dim assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) self.num_feature_levels = num_feature_levels self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) if not deformable_decoder: self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2) else: self.query_pos_sine_scale = None if rm_dec_query_scale: self.query_scale = None else: raise NotImplementedError self.query_scale = MLP(d_model, d_model, d_model, 2) self.bbox_embed = None self.class_embed = None self.d_model = d_model self.modulate_hw_attn = modulate_hw_attn self.deformable_decoder = deformable_decoder if not deformable_decoder and modulate_hw_attn: self.ref_anchor_head = MLP(d_model, d_model, 2, 2) else: self.ref_anchor_head = None self.decoder_query_perturber = decoder_query_perturber self.box_pred_damping = None self.dec_layer_number = dec_layer_number if dec_layer_number is not None: assert isinstance(dec_layer_number, list) assert len(dec_layer_number) == num_layers # assert dec_layer_number[0] == self.dec_layer_dropout_prob = dec_layer_dropout_prob if dec_layer_dropout_prob is not None: assert isinstance(dec_layer_dropout_prob, list) assert len(dec_layer_dropout_prob) == num_layers for i in dec_layer_dropout_prob: assert 0.0 <= i <= 1.0 if cross_track_layer: # add a cross-attention-layer before track ffn head self.cross_track_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) self.cross_track = True else: self.cross_track = False self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MSDeformAttn): m._reset_parameters() @staticmethod def with_pos_embed(tensor, pos): return tensor if pos is None else tensor + pos def forward(self, tgt, memory, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2 # for memory level_start_index: Optional[Tensor] = None, # num_levels spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 valid_ratios: Optional[Tensor] = None, task = None, extra = None, ): """ Input: - tgt: nq, bs, d_model - memory: hw, bs, d_model - pos: hw, bs, d_model - refpoints_unsigmoid: nq, bs, 2/4 - 
valid_ratios/spatial_shapes: bs, nlevel, 2 """ output = tgt device = tgt.device intermediate = [] reference_points = refpoints_unsigmoid.sigmoid().to(device) ref_points = [reference_points] for layer_id, layer in enumerate(self.layers): # preprocess ref points if self.training and self.decoder_query_perturber is not None and layer_id != 0: reference_points = self.decoder_query_perturber(reference_points) reference_points_input = reference_points[:, :, None] \ * torch.cat([valid_ratios, valid_ratios], -1)[None, :] # nq, bs, nlevel, 4 query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # nq, bs, 256*2 raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 pos_scale = self.query_scale(output) if self.query_scale is not None else 1 query_pos = pos_scale * raw_query_pos output = layer( tgt=output, tgt_query_pos=query_pos, tgt_query_sine_embed=query_sine_embed, tgt_key_padding_mask=tgt_key_padding_mask, tgt_reference_points=reference_points_input, memory=memory, memory_key_padding_mask=memory_key_padding_mask, memory_level_start_index=level_start_index, memory_spatial_shapes=spatial_shapes, memory_pos=pos, self_attn_mask=tgt_mask, cross_attn_mask=memory_mask, task = task, extra = extra, layer_id = layer_id, ) # iter update if self.bbox_embed is not None: reference_before_sigmoid = inverse_sigmoid(reference_points) delta_unsig = self.bbox_embed[layer_id](output).to(device) outputs_unsig = delta_unsig + reference_before_sigmoid new_reference_points = outputs_unsig.sigmoid() reference_points = new_reference_points.detach() # if layer_id != self.num_layers - 1: ref_points.append(new_reference_points) intermediate.append(self.norm(output)) if self.cross_track: tgt_track = self.cross_track_attn(self.with_pos_embed(output, query_pos).transpose(0, 1), reference_points_input.transpose(0, 1).contiguous(), memory.transpose(0, 1), spatial_shapes, level_start_index, memory_key_padding_mask).transpose(0, 1) tgt_track = tgt_track + output tgt_track = tgt_track.transpose(0, 1) else: tgt_track = None return [ [itm_out.transpose(0, 1) for itm_out in intermediate], [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points], tgt_track ] class DeformableTransformerDecoderLayer(nn.Module): def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4, use_deformable_box_attn=False, key_aware_type=None, ): super().__init__() self.n_heads = n_heads # cross attention if use_deformable_box_attn: raise NotImplementedError else: self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) # self attention self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm(d_model) # ffn self.linear1 = nn.Linear(d_model, d_ffn) self.activation = _get_activation_fn(activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(d_ffn, d_model) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm(d_model) self.key_aware_type = key_aware_type self.key_aware_proj = None def rm_self_attn_modules(self): self.self_attn = None self.dropout2 = None self.norm2 = None @staticmethod def with_pos_embed(tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt @autocast(enabled=False) def forward(self, # for tgt tgt: 
Optional[Tensor], # nq, bs, d_model tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos)) tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos) tgt_key_padding_mask: Optional[Tensor] = None, tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4 # for memory memory: Optional[Tensor] = None, # hw, bs, d_model memory_key_padding_mask: Optional[Tensor] = None, memory_level_start_index: Optional[Tensor] = None, # num_levels memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 memory_pos: Optional[Tensor] = None, # pos for memory # sa self_attn_mask: Optional[Tensor] = None, # mask used for self-attention cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention task = None, extra = None, layer_id = None, ): """ Input: - tgt/tgt_query_pos: nq, bs, d_model - """ # self attention if task in ['grounding', 'rvos'] or 'visual_prompt_tokens' in extra: if self_attn_mask is not None: # training with denoising query if 'visual_prompt_tokens' in extra: # has visual prompt level_index = layer_id % 3 # src level : self.num_feature_levels prompt_tokens = extra['visual_prompt_tokens'][level_index] promot_pos = prompt_tokens.detach().clone() prompt_mask = extra['visual_prompt_nonzero_mask'][level_index] else: #grounding prompt_tokens = extra['grounding_tokens'] promot_pos = prompt_tokens.detach().clone() prompt_mask = extra['grounding_nonzero_mask'] ori_size = tgt.shape[0] new_mask_size = tgt.shape[0]+prompt_tokens.shape[0] new_self_attn_mask = torch.zeros((tgt.shape[1], new_mask_size, new_mask_size), dtype=torch.bool, device=tgt.device) new_self_attn_mask[:,:ori_size,:ori_size] = self_attn_mask.unsqueeze(0).repeat(tgt.shape[1],1,1) #denoising matching keepmask # prompt to prompt mask set to True if they are not valid # new_self_attn_mask[:,ori_size:,ori_size:][prompt_mask] = True # new_self_attn_mask[:,ori_size:,ori_size:].transpose(1,2)[prompt_mask] = True # prompt2obj and obj2prompt mask set to True # new_self_attn_mask[:,ori_size-300:ori_size,ori_size:][] = True new_self_attn_mask[:,:ori_size,ori_size:].transpose(1,2)[prompt_mask] = True new_self_attn_mask[:,ori_size:,:ori_size][prompt_mask] = True # new_self_attn_mask[:,ori_size:,ori_size-300:ori_size].transpose(1,2)[] = True new_self_attn_mask = new_self_attn_mask.repeat_interleave(self.n_heads, dim=0) else: # with out denoising query if 'visual_prompt_tokens' in extra: # has visual prompt level_index = layer_id % 3 # src level : self.num_feature_levels prompt_tokens = extra['visual_prompt_tokens'][level_index] promot_pos = prompt_tokens.detach().clone() prompt_mask = extra['visual_prompt_nonzero_mask'][level_index] else: #grounding prompt_tokens = extra['grounding_tokens'] promot_pos = prompt_tokens.detach().clone() prompt_mask = extra['grounding_nonzero_mask'] ori_size = tgt.shape[0] new_mask_size = tgt.shape[0]+prompt_tokens.shape[0] new_self_attn_mask = torch.zeros((tgt.shape[1], new_mask_size, new_mask_size), dtype=torch.bool, device=tgt.device) new_self_attn_mask[:,:ori_size,ori_size:].transpose(1,2)[prompt_mask] = True new_self_attn_mask[:,ori_size:,:ori_size][prompt_mask] = True new_self_attn_mask = new_self_attn_mask.repeat_interleave(self.n_heads, dim=0) if self.self_attn is not None: tgt = torch.cat([tgt,prompt_tokens],dim=0) tgt_query_pos = torch.cat([tgt_query_pos,promot_pos],dim=0) q = k = self.with_pos_embed(tgt, tgt_query_pos) tgt2 = self.self_attn(q, k, tgt, attn_mask=new_self_attn_mask)[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) tgt = tgt[:ori_size] 
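        # Note: the prompt tokens (grounding text or visual prompts) were
        # concatenated onto the object queries above so that both attend jointly
        # in self-attention; the slices back to `ori_size` here and just below
        # drop the prompt part again before cross-attention.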
tgt_query_pos = tgt_query_pos[:ori_size] else: if self.self_attn is not None: q = k = self.with_pos_embed(tgt, tgt_query_pos) tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) # cross attention if self.key_aware_type is not None: if self.key_aware_type == 'mean': tgt = tgt + memory.mean(0, keepdim=True) elif self.key_aware_type == 'proj_mean': tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True) else: raise NotImplementedError("Unknown key_aware_type: {}".format(self.key_aware_type)) tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1), tgt_reference_points.transpose(0, 1).contiguous(), memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # ffn tgt = self.forward_ffn(tgt) return tgt ================================================ FILE: thirdparty/GLEE/glee/models/transformer_decoder/maskdino_decoder.py ================================================ # ------------------------------------------------------------------------ # DINO # Copyright (c) 2022 IDEA. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------ # Modified from Mask2Former https://github.com/facebookresearch/Mask2Former by Feng Li and Hao Zhang. import logging import fvcore.nn.weight_init as weight_init import torch from torch import nn from torch.nn import functional as F from detectron2.config import configurable from detectron2.layers import Conv2d from detectron2.utils.registry import Registry from detectron2.structures import BitMasks from timm.models.layers import trunc_normal_ from .dino_decoder import TransformerDecoder, DeformableTransformerDecoderLayer from ...utils.utils import MLP, gen_encoder_output_proposals, inverse_sigmoid from ...utils import box_ops TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE") TRANSFORMER_DECODER_REGISTRY.__doc__ = """ Registry for transformer module in MaskDINO. """ def build_transformer_decoder(cfg, in_channels, lang_encoder, mask_classification=True): """ Build a instance embedding branch from `cfg.MODEL.INS_EMBED_HEAD.NAME`. """ name = cfg.MODEL.MaskDINO.TRANSFORMER_DECODER_NAME return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, lang_encoder, mask_classification) @TRANSFORMER_DECODER_REGISTRY.register() class MaskDINODecoder(nn.Module): @configurable def __init__( self, in_channels, lang_encoder, mask_classification=True, *, num_classes: int, hidden_dim: int, num_queries: int, nheads: int, dim_feedforward: int, dec_layers: int, mask_dim: int, dim_projection: int, enforce_input_project: bool, two_stage: bool, dn: str, noise_scale:float, dn_num:int, initialize_box_type:bool, initial_pred:bool, learn_tgt: bool, total_num_feature_levels: int = 4, dropout: float = 0.0, activation: str = 'relu', nhead: int = 8, dec_n_points: int = 4, return_intermediate_dec: bool = True, query_dim: int = 4, dec_layer_share: bool = False, semantic_ce_loss: bool = False, cross_track_layer: bool = False, ): """ NOTE: this interface is experimental. 
Args: in_channels: channels of the input features mask_classification: whether to add mask classifier or not num_classes: number of classes hidden_dim: Transformer feature dimension num_queries: number of queries nheads: number of heads dim_feedforward: feature dimension in feedforward network enc_layers: number of Transformer encoder layers dec_layers: number of Transformer decoder layers pre_norm: whether to use pre-LayerNorm or not mask_dim: mask feature dimension enforce_input_project: add input project 1x1 conv even if input channels and hidden dim is identical d_model: transformer dimension dropout: dropout rate activation: activation function nhead: num heads in multi-head attention dec_n_points: number of sampling points in decoder return_intermediate_dec: return the intermediate results of decoder query_dim: 4 -> (x, y, w, h) dec_layer_share: whether to share each decoder layer semantic_ce_loss: use ce loss for semantic segmentation """ super().__init__() assert mask_classification, "Only support mask classification model" self.mask_classification = mask_classification self.num_feature_levels = total_num_feature_levels self.initial_pred = initial_pred self.lang_encoder = lang_encoder # define Transformer decoder here self.dn=dn self.learn_tgt = learn_tgt self.noise_scale=noise_scale self.dn_num=dn_num self.num_heads = nheads self.num_layers = dec_layers self.two_stage=two_stage self.initialize_box_type = initialize_box_type self.total_num_feature_levels = total_num_feature_levels self.num_queries = num_queries self.semantic_ce_loss = semantic_ce_loss # learnable query features if not two_stage or self.learn_tgt: self.query_feat = nn.Embedding(num_queries, hidden_dim) if not two_stage and initialize_box_type == 'no': self.query_embed = nn.Embedding(num_queries, 4) if two_stage: self.enc_output = nn.Linear(hidden_dim, hidden_dim) self.enc_output_norm = nn.LayerNorm(hidden_dim) self.input_proj = nn.ModuleList() for _ in range(self.num_feature_levels): if in_channels != hidden_dim or enforce_input_project: self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) weight_init.c2_xavier_fill(self.input_proj[-1]) else: self.input_proj.append(nn.Sequential()) self.num_classes = { 'obj365':100, 'obj365_clip':100, 'lvis':100, 'openimage':100, 'lvis_clip':100, 'openimage_clip':100, 'grit':100, 'vg':200, 'coco':80, 'coco_clip':80, 'grounding':1, 'rvos':1, 'sa1b':1, 'sa1b_clip':1, 'bdd_det':10, 'bdd_inst':8, 'ytvis19':40, 'image_yt19':40, 'image_yt21':40, 'bdd_track_seg':8, 'bdd_track_box':8, 'ovis':25, 'image_o':25, 'ytvis21':40, 'uvo_video': 81, 'ytbvos':1, } # output FFNs assert self.mask_classification, "why not class embedding?" 
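        # Output heads: `confidence_score` is a small MLP producing one
        # objectness logit per query, and `category_embed` projects decoder
        # embeddings into the same space as the class/text embeddings passed in
        # via `extra['class_embeddings']`, so classification is a dot product
        # against those embeddings rather than a fixed linear classifier.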
self.confidence_score = MLP(hidden_dim, hidden_dim, 1, 2) self.category_embed = nn.Parameter(torch.rand(hidden_dim, dim_projection)) # trunc_normal_(self.category_embed, std=.02) # self.track_embed = MLP(hidden_dim, hidden_dim, hidden_dim, 3) self.coco_label_enc = nn.Embedding(80,hidden_dim) self.obj365_label_enc = nn.Embedding(100, hidden_dim) self.vg_label_enc = nn.Embedding(200, hidden_dim) self.grounding_label_enc = nn.Embedding(1,hidden_dim) self.ytvis19_label_enc = nn.Embedding(40,hidden_dim) self.ytvis21_label_enc = nn.Embedding(40,hidden_dim) self.ovis_label_enc = nn.Embedding(25,hidden_dim) self.uvo_label_enc = nn.Embedding(81,hidden_dim) self.bdd_det = nn.Embedding(10,hidden_dim) self.bdd_inst = nn.Embedding(8,hidden_dim) self.label_enc = { 'coco': self.coco_label_enc, 'coco_clip': self.coco_label_enc, 'coconomask': self.coco_label_enc, 'obj365': self.obj365_label_enc, 'lvis': self.obj365_label_enc, 'openimage': self.obj365_label_enc, 'grit': self.obj365_label_enc, 'vg': self.vg_label_enc, 'obj365_clip': self.obj365_label_enc, 'lvis_clip': self.obj365_label_enc, 'openimage_clip': self.obj365_label_enc, 'bdd_det':self.bdd_det, 'bdd_inst':self.bdd_inst, 'bdd_track_seg':self.bdd_inst, 'bdd_track_box':self.bdd_inst, 'sa1b': self.grounding_label_enc, 'sa1b_clip': self.grounding_label_enc, 'grounding': self.grounding_label_enc, 'rvos': self.grounding_label_enc, 'uvo_video':self.uvo_label_enc, 'ytvis19':self.ytvis19_label_enc, 'image_yt19': self.ytvis19_label_enc, 'ytvis21':self.ytvis21_label_enc, 'image_yt21':self.ytvis21_label_enc, 'ovis':self.ovis_label_enc, 'image_o': self.ovis_label_enc, 'burst':self.grounding_label_enc, 'ytbvos':self.grounding_label_enc, } self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) # init decoder self.decoder_norm = decoder_norm = nn.LayerNorm(hidden_dim) decoder_layer = DeformableTransformerDecoderLayer(hidden_dim, dim_feedforward, dropout, activation, self.num_feature_levels, nhead, dec_n_points) self.decoder = TransformerDecoder(decoder_layer, self.num_layers, decoder_norm, return_intermediate=return_intermediate_dec, d_model=hidden_dim, query_dim=query_dim, num_feature_levels=self.num_feature_levels, dec_layer_share=dec_layer_share, cross_track_layer = cross_track_layer, n_levels=self.num_feature_levels, n_heads=nhead, n_points=dec_n_points ) self.cross_track_layer = cross_track_layer self.hidden_dim = hidden_dim self._bbox_embed = _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) box_embed_layerlist = [_bbox_embed for i in range(self.num_layers)] # share box prediction each layer self.bbox_embed = nn.ModuleList(box_embed_layerlist) self.decoder.bbox_embed = self.bbox_embed @classmethod def from_config(cls, cfg, in_channels, lang_encoder, mask_classification): ret = {} ret["in_channels"] = in_channels ret["lang_encoder"] = lang_encoder ret["mask_classification"] = mask_classification ret["dim_projection"] = cfg.MODEL.DIM_PROJ ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES ret["hidden_dim"] = cfg.MODEL.MaskDINO.HIDDEN_DIM ret["num_queries"] = cfg.MODEL.MaskDINO.NUM_OBJECT_QUERIES # Transformer parameters: ret["nheads"] = cfg.MODEL.MaskDINO.NHEADS ret["dim_feedforward"] = cfg.MODEL.MaskDINO.DIM_FEEDFORWARD ret["dec_layers"] = cfg.MODEL.MaskDINO.DEC_LAYERS ret["enforce_input_project"] = cfg.MODEL.MaskDINO.ENFORCE_INPUT_PROJ ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM ret["two_stage"] =cfg.MODEL.MaskDINO.TWO_STAGE 
ret["initialize_box_type"] = cfg.MODEL.MaskDINO.INITIALIZE_BOX_TYPE # ['no', 'bitmask', 'mask2box'] ret["dn"]=cfg.MODEL.MaskDINO.DN ret["noise_scale"] =cfg.MODEL.MaskDINO.DN_NOISE_SCALE ret["dn_num"] =cfg.MODEL.MaskDINO.DN_NUM ret["initial_pred"] =cfg.MODEL.MaskDINO.INITIAL_PRED ret["learn_tgt"] = cfg.MODEL.MaskDINO.LEARN_TGT ret["total_num_feature_levels"] = cfg.MODEL.SEM_SEG_HEAD.TOTAL_NUM_FEATURE_LEVELS ret["semantic_ce_loss"] = cfg.MODEL.MaskDINO.TEST.SEMANTIC_ON and cfg.MODEL.MaskDINO.SEMANTIC_CE_LOSS and ~cfg.MODEL.MaskDINO.TEST.PANOPTIC_ON ret["cross_track_layer"] = cfg.MODEL.CROSS_TRACK return ret def prepare_for_dn(self, targets, tgt, refpoint_emb, batch_size,task): """ modified from dn-detr. You can refer to dn-detr https://github.com/IDEA-Research/DN-DETR/blob/main/models/dn_dab_deformable_detr/dn_components.py for more details :param dn_args: scalar, noise_scale :param tgt: original tgt (content) in the matching part :param refpoint_emb: positional anchor queries in the matching part :param batch_size: bs """ if self.training: scalar, noise_scale = self.dn_num,self.noise_scale known = [(torch.ones_like(t['labels'])).cuda() for t in targets] know_idx = [torch.nonzero(t) for t in known] known_num = [sum(k) for k in known] # use fix number of dn queries if max(known_num)>0: scalar = scalar//(int(max(known_num))) else: scalar = 0 if scalar == 0: input_query_label = None input_query_bbox = None attn_mask = None mask_dict = None return input_query_label, input_query_bbox, attn_mask, mask_dict # can be modified to selectively denosie some label or boxes; also known label prediction unmask_bbox = unmask_label = torch.cat(known) labels = torch.cat([t['labels'] for t in targets]) boxes = torch.cat([t['boxes'] for t in targets]) batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)]) # known known_indice = torch.nonzero(unmask_label + unmask_bbox) known_indice = known_indice.view(-1) # noise known_indice = known_indice.repeat(scalar, 1).view(-1) known_labels = labels.repeat(scalar, 1).view(-1) known_bid = batch_idx.repeat(scalar, 1).view(-1) known_bboxs = boxes.repeat(scalar, 1) known_labels_expaned = known_labels.clone() known_bbox_expand = known_bboxs.clone() # noise on the label if noise_scale > 0: p = torch.rand_like(known_labels_expaned.float()) chosen_indice = torch.nonzero(p < (noise_scale * 0.5)).view(-1) # half of bbox prob new_label = torch.randint_like(chosen_indice, 0, self.num_classes[task]) # randomly put a new one here known_labels_expaned.scatter_(0, chosen_indice, new_label) if noise_scale > 0: diff = torch.zeros_like(known_bbox_expand) diff[:, :2] = known_bbox_expand[:, 2:] / 2 diff[:, 2:] = known_bbox_expand[:, 2:] known_bbox_expand += torch.mul((torch.rand_like(known_bbox_expand) * 2 - 1.0), diff).cuda() * noise_scale known_bbox_expand = known_bbox_expand.clamp(min=0.0, max=1.0) m = known_labels_expaned.long().to('cuda') input_label_embed = self.label_enc[task](m) input_bbox_embed = inverse_sigmoid(known_bbox_expand) single_pad = int(max(known_num)) pad_size = int(single_pad * scalar) padding_label = torch.zeros(pad_size, self.hidden_dim).cuda() padding_bbox = torch.zeros(pad_size, 4).cuda() if not refpoint_emb is None: input_query_label = torch.cat([padding_label, tgt], dim=0).repeat(batch_size, 1, 1) input_query_bbox = torch.cat([padding_bbox, refpoint_emb], dim=0).repeat(batch_size, 1, 1) else: input_query_label=padding_label.repeat(batch_size, 1, 1) input_query_bbox = padding_bbox.repeat(batch_size, 1, 1) # map 
map_known_indice = torch.tensor([]).to('cuda') if len(known_num): map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3] map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(scalar)]).long() if len(known_bid): input_query_label[(known_bid.long(), map_known_indice)] = input_label_embed input_query_bbox[(known_bid.long(), map_known_indice)] = input_bbox_embed tgt_size = pad_size + self.num_queries attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0 # match query cannot see the reconstruct attn_mask[pad_size:, :pad_size] = True # reconstruct cannot see each other for i in range(scalar): if i == 0: attn_mask[single_pad * i:single_pad * (i + 1), single_pad * (i + 1):pad_size] = True if i == scalar - 1: attn_mask[single_pad * i:single_pad * (i + 1), :single_pad * i] = True else: attn_mask[single_pad * i:single_pad * (i + 1), single_pad * (i + 1):pad_size] = True attn_mask[single_pad * i:single_pad * (i + 1), :single_pad * i] = True mask_dict = { 'known_indice': torch.as_tensor(known_indice).long(), 'batch_idx': torch.as_tensor(batch_idx).long(), 'map_known_indice': torch.as_tensor(map_known_indice).long(), 'known_lbs_bboxes': (known_labels, known_bboxs), 'know_idx': know_idx, 'pad_size': pad_size, 'scalar': scalar, } else: if not refpoint_emb is None: input_query_label = tgt.repeat(batch_size, 1, 1) input_query_bbox = refpoint_emb.repeat(batch_size, 1, 1) else: input_query_label=None input_query_bbox=None attn_mask = None mask_dict=None # 100*batch*256 if not input_query_bbox is None: input_query_label = input_query_label input_query_bbox = input_query_bbox return input_query_label,input_query_bbox,attn_mask,mask_dict def dn_post_process(self,outputs_class,outputs_score,outputs_coord,mask_dict,outputs_mask): """ post process of dn after output from the transformer put the dn part in the mask_dict """ assert mask_dict['pad_size'] > 0 output_known_class = outputs_class[:, :, :mask_dict['pad_size'], :] outputs_class = outputs_class[:, :, mask_dict['pad_size']:, :] output_known_score = outputs_score[:, :, :mask_dict['pad_size'], :] outputs_score = outputs_score[:, :, mask_dict['pad_size']:, :] output_known_coord = outputs_coord[:, :, :mask_dict['pad_size'], :] outputs_coord = outputs_coord[:, :, mask_dict['pad_size']:, :] if outputs_mask is not None: output_known_mask = outputs_mask[:, :, :mask_dict['pad_size'], :] outputs_mask = outputs_mask[:, :, mask_dict['pad_size']:, :] out = {'pred_logits': output_known_class[-1], 'pred_scores':output_known_score[-1],'pred_boxes': output_known_coord[-1],'pred_masks': output_known_mask[-1]} out['aux_outputs'] = self._set_aux_loss(output_known_class, output_known_score, output_known_mask, output_known_coord) mask_dict['output_known_lbs_bboxes']=out return outputs_class, outputs_score, outputs_coord, outputs_mask def get_valid_ratio(self, mask): _, H, W = mask.shape valid_H = torch.sum(~mask[:, :, 0], 1) valid_W = torch.sum(~mask[:, 0, :], 1) valid_ratio_h = valid_H.float() / H valid_ratio_w = valid_W.float() / W valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def pred_box(self, reference, hs, ref0=None): """ :param reference: reference box coordinates from each decoder layer :param hs: content :param ref0: whether there are prediction from the first layer """ device = reference[0].device if ref0 is None: outputs_coord_list = [] else: outputs_coord_list = [ref0.to(device)] for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate(zip(reference[:-1], 
self.bbox_embed, hs)): layer_delta_unsig = layer_bbox_embed(layer_hs).to(device) layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig).to(device) layer_outputs_unsig = layer_outputs_unsig.sigmoid() outputs_coord_list.append(layer_outputs_unsig) outputs_coord_list = torch.stack(outputs_coord_list) return outputs_coord_list def forward(self, x, mask_features, extra, task, masks, targets=None): """ :param x: input, a list of multi-scale feature :param mask_features: is the per-pixel embeddings with resolution 1/4 of the original image, obtained by fusing backbone encoder encoded features. This is used to produce binary masks. :param masks: mask in the original image :param targets: used for denoising training """ if 'spatial_query_pos_mask' in extra: visual_P = True else: visual_P = False assert len(x) == self.num_feature_levels device = x[0].device size_list = [] # disable mask, it does not affect performance enable_mask = 0 if masks is not None: for src in x: if src.size(2) % 32 or src.size(3) % 32: enable_mask = 1 if enable_mask == 0: masks = [torch.zeros((src.size(0), src.size(2), src.size(3)), device=src.device, dtype=torch.bool) for src in x] src_flatten = [] mask_flatten = [] spatial_shapes = [] for i in range(self.num_feature_levels): idx=self.num_feature_levels-1-i bs, c , h, w=x[idx].shape size_list.append(x[i].shape[-2:]) spatial_shapes.append(x[idx].shape[-2:]) src_flatten.append(self.input_proj[idx](x[idx]).flatten(2).transpose(1, 2)) mask_flatten.append(masks[i].flatten(1)) src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw} spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) predictions_federate = [] predictions_score = [] predictions_class = [] predictions_mask = [] if self.two_stage: output_memory, output_proposals = gen_encoder_output_proposals(src_flatten, mask_flatten, spatial_shapes) output_memory = self.enc_output_norm(self.enc_output(output_memory)) if task in ['grounding','rvos']: class_embed = output_memory @ self.category_embed enc_outputs_class_unselected = torch.einsum("bqc,bc->bq", class_embed, extra['grounding_class']).unsqueeze(-1) #[bz,numq,1] elif visual_P: enc_outputs_class_unselected = self.confidence_score(output_memory) else: class_embed = output_memory @ self.category_embed # [bz,num_q,projectdim] enc_outputs_class_unselected = torch.einsum("bqc,nc->bqn", class_embed, extra['class_embeddings']) #[bz,n,80] enc_outputs_coord_unselected = self._bbox_embed( output_memory) + output_proposals # (bs, \sum{hw}, 4) unsigmoid topk = self.num_queries topk_proposals = torch.topk(enc_outputs_class_unselected.max(-1)[0], topk, dim=1)[1] refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) # unsigmoid refpoint_embed = refpoint_embed_undetach.detach() #[bz,num_q,4] tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.hidden_dim)) # unsigmoid #[bz,num_q.256] conf_score, outputs_class, outputs_mask,_ = self.forward_prediction_heads(tgt_undetach.transpose(0, 1), mask_features, task, extra, mask_dict = None) tgt = tgt_undetach.detach() if self.learn_tgt: tgt = self.query_feat.weight[None].repeat(bs, 1, 1) interm_outputs=dict() interm_outputs['pred_logits'] = outputs_class 
interm_outputs['pred_scores'] = conf_score interm_outputs['pred_boxes'] = refpoint_embed_undetach.sigmoid() interm_outputs['pred_masks'] = outputs_mask elif not self.two_stage: tgt = self.query_feat.weight[None].repeat(bs, 1, 1) refpoint_embed = self.query_embed.weight[None].repeat(bs, 1, 1) tgt_mask = None mask_dict = None if self.dn != "no" and self.training: assert targets is not None input_query_label, input_query_bbox, tgt_mask, mask_dict = \ self.prepare_for_dn(targets, None, None, x[0].shape[0],task) if mask_dict is not None: tgt=torch.cat([input_query_label, tgt],dim=1) # direct prediction from the matching and denoising part in the begining if self.initial_pred: conf_score, outputs_class, outputs_mask, pred_federat = self.forward_prediction_heads(tgt.transpose(0, 1), mask_features, task, extra, mask_dict, self.training) predictions_score.append(conf_score) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) predictions_federate.append(pred_federat) if self.dn != "no" and self.training and mask_dict is not None: refpoint_embed=torch.cat([input_query_bbox,refpoint_embed],dim=1) hs, references, cross_track_embed = self.decoder( tgt=tgt.transpose(0, 1), memory=src_flatten.transpose(0, 1), memory_key_padding_mask=mask_flatten, pos=None, refpoints_unsigmoid=refpoint_embed.transpose(0, 1), level_start_index=level_start_index, spatial_shapes=spatial_shapes, valid_ratios=valid_ratios, tgt_mask=tgt_mask, task=task, extra=extra, ) for i, output in enumerate(hs): conf_score, outputs_class, outputs_mask,pred_federat = self.forward_prediction_heads(output.transpose(0, 1), mask_features, task, extra, mask_dict, self.training or (i == len(hs)-1)) predictions_score.append(conf_score) predictions_class.append(outputs_class) predictions_mask.append(outputs_mask) predictions_federate.append(pred_federat) # iteratively box prediction if self.initial_pred: out_boxes = self.pred_box(references, hs, refpoint_embed.sigmoid()) assert len(predictions_class) == self.num_layers + 1 else: out_boxes = self.pred_box(references, hs) if mask_dict is not None: predictions_mask=torch.stack(predictions_mask) predictions_class=torch.stack(predictions_class) predictions_score = torch.stack(predictions_score) predictions_class, predictions_score, out_boxes, predictions_mask=\ self.dn_post_process(predictions_class, predictions_score, out_boxes,mask_dict,predictions_mask) predictions_class, predictions_score, predictions_mask=list(predictions_class), list(predictions_score), list(predictions_mask) elif self.training: # this is to insure self.label_enc participate in the model predictions_class[-1] += 0.0*self.label_enc[task].weight.sum() if mask_dict is not None: track_embed = hs[-1][:, mask_dict['pad_size']:, :] else: track_embed = hs[-1] out = { 'pred_federat':predictions_federate[-1], 'pred_logits': predictions_class[-1], 'pred_scores': predictions_score[-1], 'pred_masks': predictions_mask[-1], 'pred_boxes':out_boxes[-1], 'pred_track_embed': track_embed, 'visual_P': visual_P, 'aux_outputs': self._set_aux_loss( predictions_class if self.mask_classification else None, predictions_score, predictions_mask, out_boxes, predictions_federate, visual_P ) } if self.two_stage: out['interm_outputs'] = interm_outputs return out, mask_dict def forward_prediction_heads(self, output, mask_features, task, extra,mask_dict, pred_mask=True, visual_P=False): decoder_output = self.decoder_norm(output) decoder_output = decoder_output.transpose(0, 1) # outputs_class = self.class_embed(decoder_output) conf_score = 
self.confidence_score(decoder_output) # if visual_P else None class_embed = decoder_output @ self.category_embed # [bz,num_q,projectdim] if task in ['grounding', 'rvos']: outputs_class = torch.einsum("bqc,bc->bq", class_embed, extra['grounding_class']).unsqueeze(-1) #[bz,numq,1] else: outputs_class = torch.einsum("bqc,nc->bqn", class_embed, extra['class_embeddings']) #[bz,n,80] outputs_mask = None if pred_mask: mask_embed = self.mask_embed(decoder_output) outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) return conf_score, outputs_class, outputs_mask, None @torch.jit.unused def _set_aux_loss(self, outputs_class, outputs_score, outputs_seg_masks, out_boxes, predictions_federate=None, visual_P=False): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. # if self.mask_classification: if predictions_federate is None: return [ {"pred_logits": a, "pred_scores": b, "pred_masks": c, "pred_boxes":d, 'visual_P': visual_P} for a, b, c, d in zip(outputs_class[:-1], outputs_score[:-1], outputs_seg_masks[:-1], out_boxes[:-1]) ] else: return [ {"pred_logits": a, "pred_scores": b, "pred_masks": c, "pred_boxes":d, 'pred_federat':e,'visual_P': visual_P} for a, b, c, d, e in zip(outputs_class[:-1], outputs_score[:-1], outputs_seg_masks[:-1], out_boxes[:-1], predictions_federate[:-1]) ] ================================================ FILE: thirdparty/GLEE/glee/models/vos_utils.py ================================================ import torch import torch.nn.functional as F from torch import nn from timm.models.layers import DropPath class VLFuse(torch.nn.Module): """ Early Fusion Module """ def __init__(self, ): super(VLFuse, self).__init__() self.init_configs() # early fusion module # bi-direction (text->image, image->text) self.b_attn = BiAttentionBlockForCheckpoint(v_dim=self.img_dim, # 256 l_dim=self.lang_dim, # 768 embed_dim=self.embed_dim, # 2048 num_heads=self.n_head, # 8 dropout=0.1, drop_path=.0, init_values=1.0 / 6, ) def init_configs(self, ): # common params self.img_dim = 256 self.max_query_len = 256 self.n_layers =1 # mha params self.n_head = 8 self.embed_dim = 2048 # 2048 by default self.lang_dim = 256 def forward(self, x, task=None): visual_features = x["visual"] language_dict_features = x["lang"] fused_visual_features, language_features = self.b_attn( visual_features, language_dict_features['hidden'], language_dict_features['masks'], task) language_dict_features['hidden'] = language_features fused_language_dict_features = language_dict_features features_dict = {"visual": fused_visual_features, "lang": fused_language_dict_features} return features_dict def masks_to_boxes(masks): """Compute the bounding boxes around the provided masks The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
Returns a [N, 4] tensors, with the boxes in xyxy format """ if masks.numel() == 0: return torch.zeros((0, 4), device=masks.device) h, w = masks.shape[-2:] y = torch.arange(0, h, dtype=torch.float, device=masks.device) x = torch.arange(0, w, dtype=torch.float, device=masks.device) y, x = torch.meshgrid(y, x) x_mask = (masks * x.unsqueeze(0)) x_max = x_mask.flatten(1).max(-1)[0] x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] y_mask = (masks * y.unsqueeze(0)) y_max = y_mask.flatten(1).max(-1)[0] y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] return torch.stack([x_min, y_min, x_max, y_max], 1) class FeatureFuser(nn.Module): """ Feature Fuser for SOT (inspired by CondInst) """ def __init__(self, in_channels, channels=256): super().__init__() self.refine = nn.ModuleList() for in_channel in in_channels: self.refine.append(nn.Conv2d(in_channel, channels, 3, padding=1)) def forward(self, features): # -4, -3, -2, -1 corresponds to P3, P4, P5, P6 for i, f in enumerate([-3, -2, -1]): if i == 0: x = self.refine[i](features[f]) else: x_p = self.refine[i](features[f]) target_h, target_w = x.size()[2:] h, w = x_p.size()[2:] assert target_h % h == 0 assert target_w % w == 0 factor_h, factor_w = target_h // h, target_w // w assert factor_h == factor_w x_p = aligned_bilinear(x_p, factor_h) x = x + x_p return x def aligned_bilinear(tensor, factor): assert tensor.dim() == 4 assert factor >= 1 assert int(factor) == factor if factor == 1: return tensor h, w = tensor.size()[2:] tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") oh = factor * h + 1 ow = factor * w + 1 tensor = F.interpolate( tensor, size=(oh, ow), mode='bilinear', align_corners=True ) tensor = F.pad( tensor, pad=(factor // 2, 0, factor // 2, 0), mode="replicate" ) return tensor[:, :, :oh - 1, :ow - 1] class BiMultiHeadAttention(nn.Module): def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1): super(BiMultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads self.v_dim = v_dim self.l_dim = l_dim assert ( self.head_dim * self.num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
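        # Bidirectional cross-attention: visual features act as queries against
        # language keys, and vice versa via the transposed attention weights.
        # `scale` below is the standard 1/sqrt(head_dim) factor, and the clamp
        # flags bound the logits so fp16 training neither under- nor overflows.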
self.scale = self.head_dim ** (-0.5) self.dropout = dropout self.v_proj = nn.Linear(self.v_dim, self.embed_dim) self.l_proj = nn.Linear(self.l_dim, self.embed_dim) self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) self.stable_softmax_2d = False self.clamp_min_for_underflow = True self.clamp_max_for_overflow = True self._reset_parameters() def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def _reset_parameters(self): nn.init.xavier_uniform_(self.v_proj.weight) self.v_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.l_proj.weight) self.l_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.values_v_proj.weight) self.values_v_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.values_l_proj.weight) self.values_l_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.out_v_proj.weight) self.out_v_proj.bias.data.fill_(0) nn.init.xavier_uniform_(self.out_l_proj.weight) self.out_l_proj.bias.data.fill_(0) def forward(self, v, l, attention_mask_l=None): bsz, tgt_len, embed_dim = v.size() query_states = self.v_proj(v) * self.scale key_states = self._shape(self.l_proj(l), -1, bsz) value_v_states = self._shape(self.values_v_proj(v), -1, bsz) value_l_states = self._shape(self.values_l_proj(l), -1, bsz) proj_shape = (bsz * self.num_heads, -1, self.head_dim) # (bs * 8, -1, embed_dim//8) query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) # (bs * 8, seq_len_img, embed_dim//8) key_states = key_states.view(*proj_shape) # (bs * 8, seq_len_text, embed_dim//8) value_v_states = value_v_states.view(*proj_shape) value_l_states = value_l_states.view(*proj_shape) src_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # (bs * 8, seq_len_img, seq_len_text) if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" ) # attn_weights_l = nn.functional.softmax(attn_weights.transpose(1, 2), dim=-1) if self.stable_softmax_2d: attn_weights = attn_weights - attn_weights.max() if self.clamp_min_for_underflow: attn_weights = torch.clamp(attn_weights, min=-50000) # Do not increase -50000, data type half has quite limited range if self.clamp_max_for_overflow: attn_weights = torch.clamp(attn_weights, max=50000) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose(1, 2) attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[ 0]) if self.clamp_min_for_underflow: attn_weights_l = torch.clamp(attn_weights_l, min=-50000) # Do not increase -50000, data type half has quite limited range if self.clamp_max_for_overflow: attn_weights_l = torch.clamp(attn_weights_l, max=50000) # Do not increase 50000, data type half has quite limited range attn_weights_l = attn_weights_l.softmax(dim=-1) # assert attention_mask_l.dtype == torch.int64 if attention_mask_l is not None: assert (attention_mask_l.dim() == 2) # (bs, seq_len) attention_mask = attention_mask_l.unsqueeze(1).unsqueeze(1) # (bs, 1, 1, seq_len) attention_mask = attention_mask.expand(bsz, 1, tgt_len, src_len) attention_mask = attention_mask.masked_fill(attention_mask == 0, -9e15) if attention_mask.size() != (bsz, 1, tgt_len, 
src_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}" ) attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights_v = nn.functional.softmax(attn_weights, dim=-1) attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) attn_output_v = torch.bmm(attn_probs_v, value_l_states) attn_output_l = torch.bmm(attn_probs_l, value_v_states) if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}" ) if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim): raise ValueError( f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}" ) attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output_v = attn_output_v.transpose(1, 2) attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim) attn_output_l = attn_output_l.transpose(1, 2) attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim) attn_output_v = self.out_v_proj(attn_output_v) attn_output_l = self.out_l_proj(attn_output_l) return attn_output_v, attn_output_l class BiAttentionBlockForCheckpoint(nn.Module): def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, drop_path=.0, init_values=1e-4, ): """ Inputs: embed_dim - Dimensionality of input and attention feature vectors num_heads - Number of heads to use in the Multi-Head Attention block dropout - Amount of dropout to apply in the feed-forward network """ super(BiAttentionBlockForCheckpoint, self).__init__() # pre layer norm self.layer_norm_v = nn.LayerNorm(v_dim) self.layer_norm_l = nn.LayerNorm(l_dim) self.attn = BiMultiHeadAttention(v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, ) # add layer scale for training stability self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True) self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True) def forward(self, v, l, attention_mask_l=None, task=None): # v: visual features, (bs, sigma(HW), 256) # l: language features, (bs, seq_len, 768) v = self.layer_norm_v(v) l = self.layer_norm_l(l) delta_v, delta_l = self.attn(v, l, attention_mask_l=attention_mask_l) # v, l = v + delta_v, l + delta_l v = v + self.drop_path(self.gamma_v * delta_v) l = l + self.drop_path(self.gamma_l * delta_l) return v, l ================================================ FILE: thirdparty/GLEE/glee/modules/__init__.py ================================================ from .position_encoding import * from .attention import * from .postprocessing import * from .point_features import * ================================================ FILE: thirdparty/GLEE/glee/modules/attention.py ================================================ # Code copy from PyTorch, modified by Xueyan Zou import warnings from typing import Optional, Tuple import torch import torch.nn as nn from torch import Tensor from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ from torch.nn.parameter import Parameter from torch.overrides import has_torch_function, handle_torch_function from torch.nn.functional import pad, linear, softmax, dropout def multi_head_attention_forward( query: Tensor, key: Tensor, value: Tensor, embed_dim_to_check: int, num_heads: int, in_proj_weight: Tensor, in_proj_bias: Tensor, bias_k: Optional[Tensor], bias_v: Optional[Tensor], add_zero_attn: bool, dropout_p: float, out_proj_weight: Tensor, out_proj_bias: Tensor, training: bool = True, key_padding_mask: Optional[Tensor] = None, need_weights: bool = True, attn_mask: Optional[Tensor] = None, use_separate_proj_weight: bool = False, q_proj_weight: Optional[Tensor] = None, k_proj_weight: Optional[Tensor] = None, v_proj_weight: Optional[Tensor] = None, static_k: Optional[Tensor] = None, static_v: Optional[Tensor] = None, ) -> Tuple[Tensor, Optional[Tensor]]: r""" Args: query, key, value: map a query and a set of key-value pairs to an output. See "Attention Is All You Need" for more details. embed_dim_to_check: total dimension of the model. num_heads: parallel attention heads. in_proj_weight, in_proj_bias: input projection weight and bias. bias_k, bias_v: bias of the key and value sequences to be added at dim=0. add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. dropout_p: probability of an element to be zeroed. out_proj_weight, out_proj_bias: the output projection weight and bias. training: apply dropout if is ``True``. key_padding_mask: if provided, specified padding elements in the key will be ignored by the attention. This is an binary mask. When the value is True, the corresponding value on the attention layer will be filled with -inf. need_weights: output attn_output_weights. attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all the batches while a 3D mask allows to specify a different mask for the entries of each batch. use_separate_proj_weight: the function accept the proj. weights for query, key, and value in different forms. If false, in_proj_weight will be used, which is a combination of q_proj_weight, k_proj_weight, v_proj_weight. q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. 
static_k, static_v: static key and value used for attention operators. Shape: Inputs: - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is the embedding dimension. - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions will be unchanged. If a BoolTensor is provided, the positions with the value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor is provided, it will be added to the attention weight. - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. Outputs: - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - attn_output_weights: :math:`(N, L, S)` where N is the batch size, L is the target sequence length, S is the source sequence length. 
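    Example (an illustrative self-attention call with made-up shapes; not part
    of the upstream documentation)::

        >>> L, N, E, nhead = 4, 2, 8, 2
        >>> q = torch.rand(L, N, E)
        >>> in_w, in_b = torch.rand(3 * E, E), torch.zeros(3 * E)
        >>> out_w, out_b = torch.rand(E, E), torch.zeros(E)
        >>> out, wts = multi_head_attention_forward(
        ...     q, q, q, E, nhead, in_w, in_b, None, None, False, 0.0, out_w, out_b)
        >>> out.shape, wts.shape
        (torch.Size([4, 2, 8]), torch.Size([2, 4, 4]))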
""" tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias) if has_torch_function(tens_ops): return handle_torch_function( multi_head_attention_forward, tens_ops, query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training=training, key_padding_mask=key_padding_mask, need_weights=need_weights, attn_mask=attn_mask, use_separate_proj_weight=use_separate_proj_weight, q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight, v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v, ) tgt_len, bsz, embed_dim = query.size() assert embed_dim == embed_dim_to_check # allow MHA to have different sizes for the feature dimension assert key.size(0) == value.size(0) and key.size(1) == value.size(1) head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 if not use_separate_proj_weight: if (query is key or torch.equal(query, key)) and (key is value or torch.equal(key, value)): # self-attention q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1) elif key is value or torch.equal(key, value): # encoder-decoder attention # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = 0 _end = embed_dim _w = in_proj_weight[_start:_end, :] if _b is not None: _b = _b[_start:_end] q = linear(query, _w, _b) if key is None: assert value is None k = None v = None else: # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim _end = None _w = in_proj_weight[_start:, :] if _b is not None: _b = _b[_start:] k, v = linear(key, _w, _b).chunk(2, dim=-1) else: # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = 0 _end = embed_dim _w = in_proj_weight[_start:_end, :] if _b is not None: _b = _b[_start:_end] q = linear(query, _w, _b) # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim _end = embed_dim * 2 _w = in_proj_weight[_start:_end, :] if _b is not None: _b = _b[_start:_end] k = linear(key, _w, _b) # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias _start = embed_dim * 2 _end = None _w = in_proj_weight[_start:, :] if _b is not None: _b = _b[_start:] v = linear(value, _w, _b) else: q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight) len1, len2 = q_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == query.size(-1) k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight) len1, len2 = k_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == key.size(-1) v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight) len1, len2 = v_proj_weight_non_opt.size() assert len1 == embed_dim and len2 == value.size(-1) if in_proj_bias is not None: q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim]) k = linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim : (embed_dim * 2)]) v = linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2) :]) else: q = linear(query, q_proj_weight_non_opt, in_proj_bias) k = linear(key, k_proj_weight_non_opt, in_proj_bias) v = linear(value, v_proj_weight_non_opt, in_proj_bias) q = q * scaling if attn_mask is not None: assert ( attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or attn_mask.dtype == torch.float16 or 
attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool ), "Only float, byte, and bool types are supported for attn_mask, not {}".format(attn_mask.dtype) if attn_mask.dtype == torch.uint8: warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.") attn_mask = attn_mask.to(torch.bool) if attn_mask.dim() == 2: attn_mask = attn_mask.unsqueeze(0) if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: raise RuntimeError("The size of the 2D attn_mask is not correct.") elif attn_mask.dim() == 3: if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]: raise RuntimeError("The size of the 3D attn_mask is not correct.") else: raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim())) # attn_mask's dim is 3 now. # convert ByteTensor key_padding_mask to bool if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: warnings.warn( "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead." ) key_padding_mask = key_padding_mask.to(torch.bool) if bias_k is not None and bias_v is not None: if static_k is None and static_v is None: k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) if attn_mask is not None: attn_mask = pad(attn_mask, (0, 1)) if key_padding_mask is not None: key_padding_mask = pad(key_padding_mask, (0, 1)) else: assert static_k is None, "bias cannot be added to static key." assert static_v is None, "bias cannot be added to static value." else: assert bias_k is None assert bias_v is None q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) if k is not None: k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) if v is not None: v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) if static_k is not None: assert static_k.size(0) == bsz * num_heads assert static_k.size(2) == head_dim k = static_k if static_v is not None: assert static_v.size(0) == bsz * num_heads assert static_v.size(2) == head_dim v = static_v src_len = k.size(1) if key_padding_mask is not None: # assert key_padding_mask.size(0) == bsz assert key_padding_mask.size(1) == src_len if add_zero_attn: src_len += 1 k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1) v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1) if attn_mask is not None: attn_mask = pad(attn_mask, (0, 1)) if key_padding_mask is not None: key_padding_mask = pad(key_padding_mask, (0, 1)) attn_output_weights = torch.bmm(q, k.transpose(1, 2)) assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] if attn_mask is not None: if attn_mask.dtype == torch.bool: attn_output_weights.masked_fill_(attn_mask, float("-inf")) else: attn_output_weights += attn_mask if key_padding_mask is not None: attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) attn_output_weights = attn_output_weights.masked_fill( key_padding_mask.unsqueeze(1), float("-inf"), ) attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len) attn_output_weights = softmax(attn_output_weights, dim=-1) attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training) attn_output = torch.bmm(attn_output_weights, v) assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim] attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, 
embed_dim) attn_output = linear(attn_output, out_proj_weight, out_proj_bias) if need_weights: # average attention weights over heads attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len) return attn_output, attn_output_weights.sum(dim=1) / num_heads else: return attn_output, None # This class exists solely for Transformer; it has an annotation stating # that bias is never None, which appeases TorchScript class _LinearWithBias(nn.Linear): bias: Tensor # type: ignore def __init__(self, in_features: int, out_features: int) -> None: super().__init__(in_features, out_features, bias=True) # type: ignore class MultiheadAttention(nn.Module): r"""Allows the model to jointly attend to information from different representation subspaces. See `Attention Is All You Need `_ .. math:: \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. Args: embed_dim: total dimension of the model. num_heads: parallel attention heads. dropout: a Dropout layer on attn_output_weights. Default: 0.0. bias: add bias as module parameter. Default: True. add_bias_kv: add bias to the key and value sequences at dim=0. add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. kdim: total number of features in key. Default: None. vdim: total number of features in value. Default: None. Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set to :attr:`embed_dim` such that query, key, and value have the same number of features. Examples:: >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) >>> attn_output, attn_output_weights = multihead_attn(query, key, value) """ bias_k: Optional[torch.Tensor] bias_v: Optional[torch.Tensor] def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None): super(MultiheadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim self.num_heads = num_heads self.dropout = dropout self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if self._qkv_same_embed_dim is False: self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) self.register_parameter('in_proj_weight', None) else: self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim)) self.register_parameter('q_proj_weight', None) self.register_parameter('k_proj_weight', None) self.register_parameter('v_proj_weight', None) if bias: self.in_proj_bias = Parameter(torch.empty(3 * embed_dim)) else: self.register_parameter('in_proj_bias', None) self.out_proj = _LinearWithBias(embed_dim, embed_dim) if add_bias_kv: self.bias_k = Parameter(torch.empty(1, 1, embed_dim)) self.bias_v = Parameter(torch.empty(1, 1, embed_dim)) else: self.bias_k = self.bias_v = None self.add_zero_attn = add_zero_attn self._reset_parameters() def _reset_parameters(self): if self._qkv_same_embed_dim: xavier_uniform_(self.in_proj_weight) else: xavier_uniform_(self.q_proj_weight) xavier_uniform_(self.k_proj_weight) xavier_uniform_(self.v_proj_weight) if self.in_proj_bias is not None: constant_(self.in_proj_bias, 0.) constant_(self.out_proj.bias, 0.) 
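        # bias_k / bias_v exist only when the module was built with
        # add_bias_kv=True; they get Xavier-normal initialization below.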
        if self.bias_k is not None:
            xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            xavier_normal_(self.bias_v)

    def __setstate__(self, state):
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        if '_qkv_same_embed_dim' not in state:
            state['_qkv_same_embed_dim'] = True

        super(MultiheadAttention, self).__setstate__(state)

    def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None,
                need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
        r"""
    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. When given a binary mask and a value is True,
            the corresponding value on the attention layer will be ignored. When given
            a byte mask and a value is non-zero, the corresponding value on the attention
            layer will be ignored.
        need_weights: output attn_output_weights.
        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
            the batches while a 3D mask allows to specify a different mask for the entries of each batch.

    Shapes for inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
            the embedding dimension.
        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
            the embedding dimension.
        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
            the embedding dimension.
        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
            If a ByteTensor is provided, the non-zero positions will be ignored while the zero
            positions will be unchanged. If a BoolTensor is provided, the positions with the
            value of ``True`` will be ignored while the positions with the value of ``False`` will be unchanged.
        - attn_mask: if a 2D mask: :math:`(L, S)` where L is the target sequence length, S is the
            source sequence length. If a 3D mask: :math:`(N\cdot\text{num\_heads}, L, S)` where N is the
            batch size, L is the target sequence length, S is the source sequence length.
            ``attn_mask`` ensures that position i is allowed to attend the unmasked positions.
            If a ByteTensor is provided, the non-zero positions are not allowed to attend
            while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
            are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
            is provided, it will be added to the attention weight.

    Shapes for outputs:
        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
            E is the embedding dimension.
        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
            L is the target sequence length, S is the source sequence length.
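        Example (a minimal illustrative call; the shapes are made up)::

            >>> attn = MultiheadAttention(embed_dim=8, num_heads=2)
            >>> x = torch.rand(4, 2, 8)  # (L, N, E)
            >>> out, avg_weights = attn(x, x, x)
            >>> out.shape, avg_weights.shape
            (torch.Size([4, 2, 8]), torch.Size([2, 4, 4]))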
""" if not self._qkv_same_embed_dim: return multi_head_attention_forward( query, key, value, self.embed_dim, self.num_heads, self.in_proj_weight, self.in_proj_bias, self.bias_k, self.bias_v, self.add_zero_attn, self.dropout, self.out_proj.weight, self.out_proj.bias, training=self.training, key_padding_mask=key_padding_mask, need_weights=need_weights, attn_mask=attn_mask, use_separate_proj_weight=True, q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, v_proj_weight=self.v_proj_weight) else: return multi_head_attention_forward( query, key, value, self.embed_dim, self.num_heads, self.in_proj_weight, self.in_proj_bias, self.bias_k, self.bias_v, self.add_zero_attn, self.dropout, self.out_proj.weight, self.out_proj.bias, training=self.training, key_padding_mask=key_padding_mask, need_weights=need_weights, attn_mask=attn_mask) ================================================ FILE: thirdparty/GLEE/glee/modules/point_features.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. import torch from torch.nn import functional as F from detectron2.layers import cat, shapes_to_tensor from detectron2.structures import BitMasks, Boxes # from ..layers import cat, shapes_to_tensor # from ..structures import BitMasks, Boxes """ Shape shorthand in this module: N: minibatch dimension size, i.e. the number of RoIs for instance segmenation or the number of images for semantic segmenation. R: number of ROIs, combined over all images, in the minibatch P: number of points """ def point_sample(input, point_coords, **kwargs): """ A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D point_coords tensors. Unlike :function:`torch.nn.functional.grid_sample` it assumes `point_coords` to lie inside [0, 1] x [0, 1] square. Args: input (Tensor): A tensor of shape (N, C, H, W) that contains features map on a H x W grid. point_coords (Tensor): A tensor of shape (N, P, 2) or (N, Hgrid, Wgrid, 2) that contains [0, 1] x [0, 1] normalized point coordinates. Returns: output (Tensor): A tensor of shape (N, C, P) or (N, C, Hgrid, Wgrid) that contains features for points in `point_coords`. The features are obtained via bilinear interplation from `input` the same way as :function:`torch.nn.functional.grid_sample`. """ add_dim = False if point_coords.dim() == 3: add_dim = True point_coords = point_coords.unsqueeze(2) output = F.grid_sample(input, 2.0 * point_coords - 1.0, **kwargs) if add_dim: output = output.squeeze(3) return output def generate_regular_grid_point_coords(R, side_size, device): """ Generate regular square grid of points in [0, 1] x [0, 1] coordinate space. Args: R (int): The number of grids to sample, one for each region. side_size (int): The side size of the regular grid. device (torch.device): Desired device of returned tensor. Returns: (Tensor): A tensor of shape (R, side_size^2, 2) that contains coordinates for the regular grids. """ aff = torch.tensor([[[0.5, 0, 0.5], [0, 0.5, 0.5]]], device=device) r = F.affine_grid(aff, torch.Size((1, 1, side_size, side_size)), align_corners=False) return r.view(1, -1, 2).expand(R, -1, -1) def get_uncertain_point_coords_with_randomness( coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio ): """ Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The unceratinties are calculated for each point using 'uncertainty_func' function that takes point's logit prediction as input. See PointRend paper for details. 
    Args:
        coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for
            class-specific or class-agnostic prediction.
        uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that
            contains logit predictions for P points and returns their uncertainties as a Tensor of
            shape (N, 1, P).
        num_points (int): The number of points P to sample.
        oversample_ratio (int): Oversampling parameter.
        importance_sample_ratio (float): Ratio of points that are sampled via importance sampling.

    Returns:
        point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P
            sampled points.
    """
    assert oversample_ratio >= 1
    assert importance_sample_ratio <= 1 and importance_sample_ratio >= 0
    num_boxes = coarse_logits.shape[0]
    num_sampled = int(num_points * oversample_ratio)
    point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device, dtype=coarse_logits.dtype)
    point_logits = point_sample(coarse_logits, point_coords, align_corners=False)
    # It is crucial to calculate uncertainty based on the sampled prediction value for the points.
    # Calculating uncertainties of the coarse predictions first and sampling them for points leads
    # to incorrect results.
    # To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between
    # two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value.
    # However, if we calculate uncertainties for the coarse predictions first,
    # both will have -1 uncertainty, and the sampled point will get -1 uncertainty.
    point_uncertainties = uncertainty_func(point_logits)
    num_uncertain_points = int(importance_sample_ratio * num_points)
    num_random_points = num_points - num_uncertain_points
    idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
    shift = num_sampled * torch.arange(num_boxes, dtype=torch.long, device=coarse_logits.device)
    idx += shift[:, None]
    point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
        num_boxes, num_uncertain_points, 2
    )
    if num_random_points > 0:
        point_coords = cat(
            [
                point_coords,
                torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device),
            ],
            dim=1,
        )
    return point_coords


def get_uncertain_point_coords_on_grid(uncertainty_map, num_points):
    """
    Find `num_points` most uncertain points from `uncertainty_map` grid.

    Args:
        uncertainty_map (Tensor): A tensor of shape (N, 1, H, W) that contains uncertainty
            values for a set of points on a regular H x W grid.
        num_points (int): The number of points P to select.

    Returns:
        point_indices (Tensor): A tensor of shape (N, P) that contains indices from
            [0, H x W) of the most uncertain points.
        point_coords (Tensor): A tensor of shape (N, P, 2) that contains [0, 1] x [0, 1] normalized
            coordinates of the most uncertain points from the H x W grid.
    """
    R, _, H, W = uncertainty_map.shape
    h_step = 1.0 / float(H)
    w_step = 1.0 / float(W)

    num_points = min(H * W, num_points)
    point_indices = torch.topk(uncertainty_map.view(R, H * W), k=num_points, dim=1)[1]
    point_coords = torch.zeros(R, num_points, 2, dtype=torch.float, device=uncertainty_map.device)
    point_coords[:, :, 0] = w_step / 2.0 + (point_indices % W).to(torch.float) * w_step
    point_coords[:, :, 1] = h_step / 2.0 + (point_indices // W).to(torch.float) * h_step
    return point_indices, point_coords


def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords):
    """
    Get features from feature maps in `features_list` that correspond to specific point coordinates
    inside each bounding box from `boxes`.
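    Point coordinates are converted from box-normalized to image-level coordinates,
    rescaled to each feature map's resolution before sampling, and the per-map
    features are concatenated along the channel dimension.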
    Args:
        features_list (list[Tensor]): A list of feature map tensors to get features from.
        feature_scales (list[float]): A list of scales for tensors in `features_list`.
        boxes (list[Boxes]): A list of I Boxes objects that contain R_1 + ... + R_I = R boxes all
            together.
        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.

    Returns:
        point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled
            from all feature maps in `features_list` for P sampled points for all R boxes in
            `boxes`.
        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level
            coordinates of P points.
    """
    cat_boxes = Boxes.cat(boxes)
    num_boxes = [b.tensor.size(0) for b in boxes]

    point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
    split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes)

    point_features = []
    for idx_img, point_coords_wrt_image_per_image in enumerate(split_point_coords_wrt_image):
        point_features_per_image = []
        for idx_feature, feature_map in enumerate(features_list):
            h, w = feature_map.shape[-2:]
            scale = shapes_to_tensor([w, h]) / feature_scales[idx_feature]
            point_coords_scaled = point_coords_wrt_image_per_image / scale.to(feature_map.device)
            point_features_per_image.append(
                point_sample(
                    feature_map[idx_img].unsqueeze(0),
                    point_coords_scaled.unsqueeze(0),
                    align_corners=False,
                )
                .squeeze(0)
                .transpose(1, 0)
            )
        point_features.append(cat(point_features_per_image, dim=1))

    return cat(point_features, dim=0), point_coords_wrt_image


def get_point_coords_wrt_image(boxes_coords, point_coords):
    """
    Convert box-normalized [0, 1] x [0, 1] point coordinates to image-level coordinates.

    Args:
        boxes_coords (Tensor): A tensor of shape (R, 4) that contains bounding box coordinates.
        point_coords (Tensor): A tensor of shape (R, P, 2) that contains
            [0, 1] x [0, 1] box-normalized coordinates of the P sampled points.

    Returns:
        point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains
            image-normalized coordinates of P sampled points.
    """
    with torch.no_grad():
        point_coords_wrt_image = point_coords.clone()
        point_coords_wrt_image[:, :, 0] = point_coords_wrt_image[:, :, 0] * (
            boxes_coords[:, None, 2] - boxes_coords[:, None, 0]
        )
        point_coords_wrt_image[:, :, 1] = point_coords_wrt_image[:, :, 1] * (
            boxes_coords[:, None, 3] - boxes_coords[:, None, 1]
        )
        point_coords_wrt_image[:, :, 0] += boxes_coords[:, None, 0]
        point_coords_wrt_image[:, :, 1] += boxes_coords[:, None, 1]
    return point_coords_wrt_image


def sample_point_labels(instances, point_coords):
    """
    Sample point labels from ground truth mask given point_coords.

    Args:
        instances (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. So, the i_th element of the list contains R_i objects and
            R_1 + ... + R_N is equal to R. The ground-truth gt_masks in each instance will be
            used to compute labels.
        point_coords (Tensor): A tensor of shape (R, P, 2), where R is the total number of
            instances and P is the number of points for each instance. The coordinates are in
            the absolute image pixel coordinate space, i.e. [0, H] x [0, W].

    Returns:
        Tensor: A tensor of shape (R, P) that contains the labels of P sampled points.
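        The labels are soft values in [0, 1], produced by bilinearly sampling the
        binary ground-truth masks; threshold at 0.5 if hard labels are required.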
""" with torch.no_grad(): gt_mask_logits = [] point_coords_splits = torch.split( point_coords, [len(instances_per_image) for instances_per_image in instances] ) for i, instances_per_image in enumerate(instances): if len(instances_per_image) == 0: continue assert isinstance( instances_per_image.gt_masks, BitMasks ), "Point head works with GT in 'bitmask' format. Set INPUT.MASK_FORMAT to 'bitmask'." gt_bit_masks = instances_per_image.gt_masks.tensor h, w = instances_per_image.gt_masks.image_size scale = torch.tensor([w, h], dtype=torch.float, device=gt_bit_masks.device) points_coord_grid_sample_format = point_coords_splits[i] / scale gt_mask_logits.append( point_sample( gt_bit_masks.to(torch.float32).unsqueeze(1), points_coord_grid_sample_format, align_corners=False, ).squeeze(1) ) point_labels = cat(gt_mask_logits) return point_labels ================================================ FILE: thirdparty/GLEE/glee/modules/position_encoding.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. ## Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py """ Various positional encodings for the transformer. """ import math import torch from torch import nn class PositionEmbeddingSine(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. """ def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, x, mask=None): if mask is None: mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) not_mask = ~mask y_embed = not_mask.cumsum(1, dtype=x.dtype) x_embed = not_mask.cumsum(2, dtype=x.dtype) if self.normalize: eps = 1e-6 y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=x.dtype, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) pos_x = x_embed[:, :, :, None] / dim_t pos_y = y_embed[:, :, :, None] / dim_t pos_x = torch.stack( (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos_y = torch.stack( (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 ).flatten(3) pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) return pos def __repr__(self, _repr_indent=4): head = "Positional encoding " + self.__class__.__name__ body = [ "num_pos_feats: {}".format(self.num_pos_feats), "temperature: {}".format(self.temperature), "normalize: {}".format(self.normalize), "scale: {}".format(self.scale), ] # _repr_indent = 4 lines = [head] + [" " * _repr_indent + line for line in body] return "\n".join(lines) ================================================ FILE: thirdparty/GLEE/glee/modules/postprocessing.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
import torch
from torch.nn import functional as F

from detectron2.structures import Instances, ROIMasks


# perhaps should rename to "resize_instance"
def detector_postprocess(
    results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5
):
    """
    Resize the output instances.
    The input images are often resized when entering an object detector.
    As a result, we often need the outputs of the detector in a different
    resolution from its inputs.

    This function will resize the raw outputs of an R-CNN detector
    to produce outputs according to the desired output resolution.

    Args:
        results (Instances): the raw outputs from the detector.
            `results.image_size` contains the input image resolution the detector sees.
            This object might be modified in-place.
        output_height, output_width: the desired output resolution.
    Returns:
        Instances: the resized output from the model, based on the output resolution
    """
    if isinstance(output_width, torch.Tensor):
        # This shape might (but not necessarily) be tensors during tracing.
        # Converts integer tensors to float temporaries to ensure true
        # division is performed when computing scale_x and scale_y.
        output_width_tmp = output_width.float()
        output_height_tmp = output_height.float()
        new_size = torch.stack([output_height, output_width])
    else:
        new_size = (output_height, output_width)
        output_width_tmp = output_width
        output_height_tmp = output_height

    scale_x, scale_y = (
        output_width_tmp / results.image_size[1],
        output_height_tmp / results.image_size[0],
    )
    results = Instances(new_size, **results.get_fields())

    if results.has("pred_boxes"):
        output_boxes = results.pred_boxes
    elif results.has("proposal_boxes"):
        output_boxes = results.proposal_boxes
    else:
        output_boxes = None
    assert output_boxes is not None, "Predictions must contain boxes!"

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(results.image_size)

    results = results[output_boxes.nonempty()]

    if results.has("pred_masks"):
        if isinstance(results.pred_masks, ROIMasks):
            roi_masks = results.pred_masks
        else:
            # pred_masks is a tensor of shape (N, 1, M, M)
            roi_masks = ROIMasks(results.pred_masks[:, 0, :, :])
        results.pred_masks = roi_masks.to_bitmasks(
            results.pred_boxes, output_height, output_width, mask_threshold
        ).tensor  # TODO return ROIMasks/BitMask object in the future

    if results.has("pred_keypoints"):
        results.pred_keypoints[:, :, 0] *= scale_x
        results.pred_keypoints[:, :, 1] *= scale_y

    return results


def bbox_postprocess(result, input_size, img_size, output_height, output_width):
    """
    Convert predicted boxes from normalized [xc, yc, w, h] in [0, 1] to absolute
    [x1, y1, x2, y2] coordinates in [0, w] x [0, h], clipped to the image and
    rescaled to the desired output resolution.
    """
    if result is None:
        return None

    scale = torch.tensor([input_size[1], input_size[0], input_size[1], input_size[0]])[None, :].to(result.device)
    result = result.sigmoid() * scale
    x1, y1, x2, y2 = result[:, 0] - result[:, 2]/2, result[:, 1] - result[:, 3]/2, result[:, 0] + result[:, 2]/2, result[:, 1] + result[:, 3]/2
    h, w = img_size
    x1 = x1.clamp(min=0, max=w)
    y1 = y1.clamp(min=0, max=h)
    x2 = x2.clamp(min=0, max=w)
    y2 = y2.clamp(min=0, max=h)

    box = torch.stack([x1, y1, x2, y2]).permute(1, 0)
    scale = torch.tensor([output_width/w, output_height/h, output_width/w, output_height/h])[None, :].to(result.device)
    box = box * scale
    return box


def sem_seg_postprocess(result, img_size, output_height, output_width):
    """
    Return semantic segmentation predictions in the original resolution.

    The input images are often resized when entering semantic segmentor. Moreover, in
    some cases, they are also padded inside the segmentor to be divisible by the
    maximum network stride.
As a result, we often need the predictions of the segmentor in a different resolution from its inputs. Args: result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W), where C is the number of classes, and H, W are the height and width of the prediction. img_size (tuple): image size that segmentor is taking as input. output_height, output_width: the desired output resolution. Returns: semantic segmentation prediction (Tensor): A tensor of the shape (C, output_height, output_width) that contains per-pixel soft predictions. """ result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1) result = F.interpolate( result, size=(output_height, output_width), mode="bilinear", align_corners=False )[0] return result ================================================ FILE: thirdparty/GLEE/glee/utils/__init__.py ================================================ from .config import * from .misc import * from .box_ops import * from .it_contrastive import * ================================================ FILE: thirdparty/GLEE/glee/utils/box_ops.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved """ Utilities for bounding box manipulation and GIoU. """ import torch from torchvision.ops.boxes import box_area def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return torch.stack(b, dim=-1) def box_xyxy_to_cxcywh(x): x0, y0, x1, y1 = x.unbind(-1) b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] return torch.stack(b, dim=-1) def box_xywh_to_xyxy(x): x0, y0, x1, y1 = x.unbind(-1) b = [x0, y0, (x0 + x1), (y0 + y1)] return torch.stack(b, dim=-1) # modified from torchvision to also return the union def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] wh = (rb - lt).clamp(min=0) # [N,M,2] inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] union = area1[:, None] + area2 - inter iou = inter / union return iou, union def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/ The boxes should be in [x0, y0, x1, y1] format Returns a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) """ # degenerate boxes gives inf / nan results # so do an early check assert (boxes1[:, 2:] >= boxes1[:, :2]).all() assert (boxes2[:, 2:] >= boxes2[:, :2]).all() iou, union = box_iou(boxes1, boxes2) lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) wh = (rb - lt).clamp(min=0) # [N,M,2] area = wh[:, :, 0] * wh[:, :, 1] return iou - (area - union) / area def masks_to_boxes(masks): """Compute the bounding boxes around the provided masks The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
    Returns a [N, 4] tensor, with the boxes in xyxy format
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    # Keep the coordinate grids on the same device as the masks so the
    # element-wise products below do not fail for CUDA inputs.
    y = torch.arange(0, h, dtype=torch.float, device=masks.device)
    x = torch.arange(0, w, dtype=torch.float, device=masks.device)
    y, x = torch.meshgrid(y, x)

    x_mask = (masks * x.unsqueeze(0))
    x_max = x_mask.flatten(1).max(-1)[0]
    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    y_mask = (masks * y.unsqueeze(0))
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)


================================================
FILE: thirdparty/GLEE/glee/utils/config.py
================================================
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.

import functools
import inspect


def configurable(init_func=None, *, from_config=None):
    """
    Decorate a function or a class's __init__ method so that it can be called
    with a :class:`CfgNode` object using a :func:`from_config` function that translates
    :class:`CfgNode` to arguments.

    Examples:
    ::
        # Usage 1: Decorator on __init__:
        class A:
            @configurable
            def __init__(self, a, b=2, c=3):
                pass

            @classmethod
            def from_config(cls, cfg):   # 'cfg' must be the first argument
                # Returns kwargs to be passed to __init__
                return {"a": cfg.A, "b": cfg.B}

        a1 = A(a=1, b=2)  # regular construction
        a2 = A(cfg)       # construct with a cfg
        a3 = A(cfg, b=3, c=4)  # construct with extra overwrite

        # Usage 2: Decorator on any function. Needs an extra from_config argument:
        @configurable(from_config=lambda cfg: {"a": cfg.A, "b": cfg.B})
        def a_func(a, b=2, c=3):
            pass

        a1 = a_func(a=1, b=2)  # regular call
        a2 = a_func(cfg)       # call with a cfg
        a3 = a_func(cfg, b=3, c=4)  # call with extra overwrite

    Args:
        init_func (callable): a class's ``__init__`` method in usage 1. The
            class must have a ``from_config`` classmethod which takes `cfg` as
            the first argument.
        from_config (callable): the from_config function in usage 2. It must take `cfg`
            as its first argument.
    """

    if init_func is not None:
        assert (
            inspect.isfunction(init_func)
            and from_config is None
            and init_func.__name__ == "__init__"
        ), "Incorrect use of @configurable. Check API documentation for examples."

        @functools.wraps(init_func)
        def wrapped(self, *args, **kwargs):
            try:
                from_config_func = type(self).from_config
            except AttributeError as e:
                raise AttributeError(
                    "Class with @configurable must have a 'from_config' classmethod."
                ) from e
            if not inspect.ismethod(from_config_func):
                raise TypeError("Class with @configurable must have a 'from_config' classmethod.")

            if _called_with_cfg(*args, **kwargs):
                explicit_args = _get_args_from_config(from_config_func, *args, **kwargs)
                init_func(self, **explicit_args)
            else:
                init_func(self, *args, **kwargs)

        return wrapped

    else:
        if from_config is None:
            return configurable  # @configurable() is made equivalent to @configurable
        assert inspect.isfunction(
            from_config
        ), "from_config argument of configurable must be a function!"

        def wrapper(orig_func):
            @functools.wraps(orig_func)
            def wrapped(*args, **kwargs):
                if _called_with_cfg(*args, **kwargs):
                    explicit_args = _get_args_from_config(from_config, *args, **kwargs)
                    return orig_func(**explicit_args)
                else:
                    return orig_func(*args, **kwargs)

            wrapped.from_config = from_config
            return wrapped

        return wrapper


def _called_with_cfg(*args, **kwargs):
    """
    Returns:
        bool: whether the arguments contain CfgNode and should be considered
            forwarded to from_config.
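    Example (illustrative; assumes a plain ``dict`` config, which is what the
    ``isinstance`` checks below accept)::

        _called_with_cfg({"A": 1}, b=2)   # True: positional cfg dict
        _called_with_cfg(cfg={"A": 1})    # True: keyword cfg dict
        _called_with_cfg(a=1, b=2)        # False: regular call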
""" from omegaconf import DictConfig if len(args) and isinstance(args[0], (dict)): return True if isinstance(kwargs.pop("cfg", None), (dict)): return True # `from_config`'s first argument is forced to be "cfg". # So the above check covers all cases. return False def _get_args_from_config(from_config_func, *args, **kwargs): """ Use `from_config` to obtain explicit arguments. Returns: dict: arguments to be used for cls.__init__ """ signature = inspect.signature(from_config_func) if list(signature.parameters.keys())[0] != "cfg": if inspect.isfunction(from_config_func): name = from_config_func.__name__ else: name = f"{from_config_func.__self__}.from_config" raise TypeError(f"{name} must take 'cfg' as the first argument!") support_var_arg = any( param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD] for param in signature.parameters.values() ) if support_var_arg: # forward all arguments to from_config, if from_config accepts them ret = from_config_func(*args, **kwargs) else: # forward supported arguments to from_config supported_arg_names = set(signature.parameters.keys()) extra_kwargs = {} for name in list(kwargs.keys()): if name not in supported_arg_names: extra_kwargs[name] = kwargs.pop(name) ret = from_config_func(*args, **kwargs) # forward the other arguments to __init__ ret.update(extra_kwargs) return ret ================================================ FILE: thirdparty/GLEE/glee/utils/it_contrastive.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F def is_dist_initialized(): return torch.distributed.is_initialized() def get_world_size(): if is_dist_initialized(): return torch.distributed.get_world_size() return 1 def all_gather_grad(x): if get_world_size() > 1: all_x = [torch.zeros_like(x) for _ in range(get_world_size())] torch.distributed.all_gather(all_x, x) all_x[torch.distributed.get_rank()] = x x = torch.cat(all_x, dim=0) return x @torch.no_grad() def all_gather_nograd(tensor): # from albef """ Performs all_gather operation on the provided tensors. *** Warning ***: torch.distributed.all_gather has no gradient. """ if get_world_size() > 1: tensors_gather = [torch.ones_like(tensor) for _ in range(torch.distributed.get_world_size())] torch.distributed.all_gather(tensors_gather, tensor, async_op=False) tensor = torch.cat(tensors_gather, dim=0) return tensor def image_text_contrastive_loss(image_feat, text_feat, temperature, image_id=None, text_id=None): # add the following 4 lines image_feat = all_gather_grad(image_feat) text_feat = all_gather_grad(text_feat) logits = torch.matmul(image_feat, text_feat.t()) logits /= temperature if image_id is None and text_id is None: gt = torch.arange(logits.shape[0], device=logits.device) loss1 = F.cross_entropy(logits, gt) loss2 = F.cross_entropy(logits.t(), gt) else: image_id = all_gather_grad(image_id) text_id = all_gather_grad(text_id) gt_image = image_id.reshape((-1, 1)) == image_id.reshape((1, -1)) gt_text = text_id.reshape((-1, 1)) == text_id.reshape((1, -1)) gt = torch.logical_or(gt_image, gt_text) loss1 = -torch.sum(gt * F.log_softmax(logits, dim=1)) / gt.sum() loss2 = -torch.sum(gt.t() * F.log_softmax(logits.t(), dim=1)) / gt.sum() return (loss1 + loss2) / 2 * get_world_size() # scale it up by the number of GPUs ================================================ FILE: thirdparty/GLEE/glee/utils/misc.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py # Modified by Xueyan Zou """ Misc functions, including distributed helpers. Mostly copy-paste from torchvision references. """ from typing import List, Optional import torch import torch.distributed as dist import torchvision from torch import Tensor def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] for sublist in the_list[1:]: for index, item in enumerate(sublist): maxes[index] = max(maxes[index], item) return maxes class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask def to(self, device): # type: (Device) -> NestedTensor # noqa cast_tensor = self.tensors.to(device) mask = self.mask if mask is not None: assert mask is not None cast_mask = mask.to(device) else: cast_mask = None return NestedTensor(cast_tensor, cast_mask) def decompose(self): return self.tensors, self.mask def __repr__(self): return str(self.tensors) def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): # TODO make this more general if tensor_list[0].ndim == 3: if torchvision._is_tracing(): # nested_tensor_from_tensor_list() does not export well to ONNX # call _onnx_nested_tensor_from_tensor_list() instead return _onnx_nested_tensor_from_tensor_list(tensor_list) # TODO make it support different-sized images max_size = _max_by_axis([list(img.shape) for img in tensor_list]) # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) batch_shape = [len(tensor_list)] + max_size b, c, h, w = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) mask = torch.ones((b, h, w), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], : img.shape[2]] = False elif tensor_list[0].ndim == 2: if torchvision._is_tracing(): # nested_tensor_from_tensor_list() does not export well to ONNX # call _onnx_nested_tensor_from_tensor_list() instead return _onnx_nested_tensor_from_tensor_list(tensor_list) # TODO make it support different-sized images max_size = _max_by_axis([list(txt.shape) for txt in tensor_list]) # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) batch_shape = [len(tensor_list)] + max_size b, c, l = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) mask = torch.ones((b, l), dtype=torch.bool, device=device) for txt, pad_txt, m in zip(tensor_list, tensor, mask): pad_txt[: txt.shape[0], : txt.shape[1]] = txt m[: txt.shape[1]] = False else: raise ValueError("not supported") return NestedTensor(tensor, mask) def _collate_and_pad_divisibility(tensor_list: list, div=32): max_size = [] for i in range(tensor_list[0].dim()): max_size_i = torch.max( torch.tensor([img.shape[i] for img in tensor_list]).to(torch.float32) ).to(torch.int64) max_size.append(max_size_i) max_size = tuple(max_size) c,h,w = max_size pad_h = (div - h % div) if h % div != 0 else 0 pad_w = (div - w % div) if w % div != 0 else 0 max_size = (c,h+pad_h,w+pad_w) # work around for # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) # m[: img.shape[1], :img.shape[2]] = False # which is not yet supported in onnx padded_imgs = [] padded_masks = [] for img in tensor_list: padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 
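        # F.pad's pad tuple runs from the last dimension backwards:
        # (left, right, top, bottom, front, back) for a (C, H, W) tensor,
        # so only the right, bottom and trailing-channel sides are padded.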
padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) padded_imgs.append(padded_img) m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) padded_masks.append(padded_mask.to(torch.bool)) return padded_imgs # _onnx_nested_tensor_from_tensor_list() is an implementation of # nested_tensor_from_tensor_list() that is supported by ONNX tracing. @torch.jit.unused def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: max_size = [] for i in range(tensor_list[0].dim()): max_size_i = torch.max( torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) ).to(torch.int64) max_size.append(max_size_i) max_size = tuple(max_size) # work around for # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) # m[: img.shape[1], :img.shape[2]] = False # which is not yet supported in onnx padded_imgs = [] padded_masks = [] for img in tensor_list: padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) padded_imgs.append(padded_img) m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) padded_masks.append(padded_mask.to(torch.bool)) tensor = torch.stack(padded_imgs) mask = torch.stack(padded_masks) return NestedTensor(tensor, mask=mask) def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True def get_iou(gt_masks, pred_masks, ignore_label=-1): rev_ignore_mask = ~(gt_masks == ignore_label) gt_masks = gt_masks.bool() n,h,w = gt_masks.shape intersection = ((gt_masks & pred_masks) & rev_ignore_mask).reshape(n,h*w).sum(dim=-1) union = ((gt_masks | pred_masks) & rev_ignore_mask).reshape(n,h*w).sum(dim=-1) ious = (intersection / union) return ious ================================================ FILE: thirdparty/GLEE/glee/utils/utils.py ================================================ import torch import copy from torch import nn, Tensor import os import math import torch.nn.functional as F from torch import nn class MLP(nn.Module): """ Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1/x2) def gen_encoder_output_proposals(memory:Tensor, memory_padding_mask:Tensor, spatial_shapes:Tensor): """ Input: - memory: bs, \sum{hw}, d_model - memory_padding_mask: bs, \sum{hw} - spatial_shapes: nlevel, 2 Output: - output_memory: bs, \sum{hw}, d_model - output_proposals: bs, \sum{hw}, 4 """ N_, S_, C_ = memory.shape base_scale = 4.0 proposals = [] _cur = 0 for lvl, (H_, W_) in enumerate(spatial_shapes): mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1) valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, 
device=memory.device), torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) proposals.append(proposal) _cur += (H_ * W_) output_proposals = torch.cat(proposals, 1) output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) output_proposals = torch.log(output_proposals / (1 - output_proposals)) output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf')) output_memory = memory output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) return output_memory, output_proposals def gen_sineembed_for_position(pos_tensor): # n_query, bs, _ = pos_tensor.size() # sineembed_tensor = torch.zeros(n_query, bs, 256) scale = 2 * math.pi dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) dim_t = 10000 ** (2 * (dim_t // 2) / 128) x_embed = pos_tensor[:, :, 0] * scale y_embed = pos_tensor[:, :, 1] * scale pos_x = x_embed[:, :, None] / dim_t pos_y = y_embed[:, :, None] / dim_t pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) if pos_tensor.size(-1) == 2: pos = torch.cat((pos_y, pos_x), dim=2) elif pos_tensor.size(-1) == 4: w_embed = pos_tensor[:, :, 2] * scale pos_w = w_embed[:, :, None] / dim_t pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) h_embed = pos_tensor[:, :, 3] * scale pos_h = h_embed[:, :, None] / dim_t pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) else: raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1))) return pos def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu if activation == "prelu": return nn.PReLU() if activation == "selu": return F.selu raise RuntimeError(F"activation should be relu/gelu, not {activation}.") def _get_clones(module, N, layer_share=False): if layer_share: return nn.ModuleList([module for i in range(N)]) else: return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def _get_clones_advanced(module, N, N_valid): assert N_valid <= N layers = [] for i in range(N): if i < N_valid: layers.append(copy.deepcopy(module)) else: layers.append(nn.Identity()) return nn.ModuleList(layers)
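# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original GLEE code):
# it exercises MLP, inverse_sigmoid and gen_sineembed_for_position with dummy
# shapes to document the expected tensor layouts.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    ffn = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
    x = torch.rand(2, 100, 256)               # (bs, num_queries, d_model)
    boxes = ffn(x).sigmoid()                  # (bs, num_queries, 4) in [0, 1]
    # inverse_sigmoid is the (clamped) inverse of torch.sigmoid
    assert torch.allclose(inverse_sigmoid(boxes).sigmoid(), boxes, atol=1e-4)
    # gen_sineembed_for_position expects (num_queries, bs, 2 or 4) and returns
    # 128 sine/cosine features per coordinate: 256-d for points, 512-d for boxes
    pos = gen_sineembed_for_position(boxes.transpose(0, 1))
    assert pos.shape == (100, 2, 512)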