Repository: woodfrog/maptracker Branch: main Commit: ee8321fa5dac Files: 161 Total size: 1.1 MB Directory structure: gitextract_kr8oe7q1/ ├── .gitignore ├── LICENSE ├── LICENSE_GPL ├── README.md ├── docs/ │ ├── data_preparation.md │ ├── getting_started.md │ └── installation.md ├── plugin/ │ ├── __init__.py │ ├── configs/ │ │ ├── _base_/ │ │ │ ├── datasets/ │ │ │ │ ├── coco_instance.py │ │ │ │ ├── kitti-3d-3class.py │ │ │ │ ├── kitti-3d-car.py │ │ │ │ ├── lyft-3d.py │ │ │ │ ├── nuim_instance.py │ │ │ │ ├── nus-3d.py │ │ │ │ ├── nus-mono3d.py │ │ │ │ ├── range100_lyft-3d.py │ │ │ │ ├── s3dis_seg-3d-13class.py │ │ │ │ ├── scannet-3d-18class.py │ │ │ │ ├── scannet_seg-3d-20class.py │ │ │ │ ├── sunrgbd-3d-10class.py │ │ │ │ ├── waymoD5-3d-3class.py │ │ │ │ └── waymoD5-3d-car.py │ │ │ ├── default_runtime.py │ │ │ ├── models/ │ │ │ │ ├── 3dssd.py │ │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ │ ├── centerpoint_01voxel_second_secfpn_nus.py │ │ │ │ ├── centerpoint_02pillar_second_secfpn_nus.py │ │ │ │ ├── fcos3d.py │ │ │ │ ├── groupfree3d.py │ │ │ │ ├── h3dnet.py │ │ │ │ ├── hv_pointpillars_fpn_lyft.py │ │ │ │ ├── hv_pointpillars_fpn_nus.py │ │ │ │ ├── hv_pointpillars_fpn_range100_lyft.py │ │ │ │ ├── hv_pointpillars_secfpn_kitti.py │ │ │ │ ├── hv_pointpillars_secfpn_waymo.py │ │ │ │ ├── hv_second_secfpn_kitti.py │ │ │ │ ├── hv_second_secfpn_waymo.py │ │ │ │ ├── imvotenet_image.py │ │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ │ ├── paconv_cuda_ssg.py │ │ │ │ ├── paconv_ssg.py │ │ │ │ ├── parta2.py │ │ │ │ ├── pointnet2_msg.py │ │ │ │ ├── pointnet2_ssg.py │ │ │ │ └── votenet.py │ │ │ └── schedules/ │ │ │ ├── cosine.py │ │ │ ├── cyclic_20e.py │ │ │ ├── cyclic_40e.py │ │ │ ├── mmdet_schedule_1x.py │ │ │ ├── schedule_2x.py │ │ │ ├── schedule_3x.py │ │ │ ├── seg_cosine_150e.py │ │ │ ├── seg_cosine_200e.py │ │ │ └── seg_cosine_50e.py │ │ └── maptracker/ │ │ ├── av2_newsplit/ │ │ │ ├── maptracker_av2_100x50_newsplit_5frame_span10_stage1_bev_pretrain.py │ │ │ ├── maptracker_av2_100x50_newsplit_5frame_span10_stage2_warmup.py │ │ │ ├── maptracker_av2_100x50_newsplit_5frame_span10_stage3_joint_finetune.py │ │ │ ├── maptracker_av2_newsplit_5frame_span10_stage1_bev_pretrain.py │ │ │ ├── maptracker_av2_newsplit_5frame_span10_stage2_warmup.py │ │ │ └── maptracker_av2_newsplit_5frame_span10_stage3_joint_finetune.py │ │ ├── av2_oldsplit/ │ │ │ ├── maptracker_av2_oldsplit_5frame_span10_stage1_bev_pretrain.py │ │ │ ├── maptracker_av2_oldsplit_5frame_span10_stage2_warmup.py │ │ │ └── maptracker_av2_oldsplit_5frame_span10_stage3_joint_finetune.py │ │ ├── nuscenes_newsplit/ │ │ │ ├── maptracker_nusc_newsplit_5frame_span10_stage1_bev_pretrain.py │ │ │ ├── maptracker_nusc_newsplit_5frame_span10_stage2_warmup.py │ │ │ └── maptracker_nusc_newsplit_5frame_span10_stage3_joint_finetune.py │ │ └── nuscenes_oldsplit/ │ │ ├── maptracker_nusc_oldsplit_5frame_span10_stage1_bev_pretrain.py │ │ ├── maptracker_nusc_oldsplit_5frame_span10_stage2_warmup.py │ │ └── maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py │ ├── core/ │ │ ├── apis/ │ │ │ ├── __init__.py │ │ │ ├── mmdet_train.py │ │ │ ├── test.py │ │ │ └── train.py │ │ └── evaluation/ │ │ ├── __init__.py │ │ └── eval_hooks.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── argo_dataset.py │ │ ├── base_dataset.py │ │ ├── builder.py │ │ ├── evaluation/ │ │ │ ├── AP.py │ │ │ ├── __init__.py │ │ │ ├── distance.py │ │ │ ├── raster_eval.py │ │ │ └── vector_eval.py │ │ ├── map_utils/ │ │ │ ├── av2map_extractor.py │ │ │ ├── nuscmap_extractor.py │ │ │ └── utils.py │ │ ├── nusc_dataset.py 
│ │ ├── pipelines/ │ │ │ ├── __init__.py │ │ │ ├── formating.py │ │ │ ├── loading.py │ │ │ ├── rasterize.py │ │ │ ├── transform.py │ │ │ └── vectorize.py │ │ ├── samplers/ │ │ │ ├── __init__.py │ │ │ ├── distributed_sampler.py │ │ │ ├── group_sampler.py │ │ │ └── sampler.py │ │ └── visualize/ │ │ └── renderer.py │ └── models/ │ ├── __init__.py │ ├── assigner/ │ │ ├── __init__.py │ │ ├── assigner.py │ │ └── match_cost.py │ ├── backbones/ │ │ ├── __init__.py │ │ ├── bevformer/ │ │ │ ├── __init__.py │ │ │ ├── custom_base_transformer_layer.py │ │ │ ├── encoder.py │ │ │ ├── grid_mask.py │ │ │ ├── multi_scale_deformable_attn_function.py │ │ │ ├── spatial_cross_attention.py │ │ │ ├── temporal_net.py │ │ │ ├── temporal_self_attention.py │ │ │ └── transformer.py │ │ └── bevformer_backbone.py │ ├── heads/ │ │ ├── MapDetectorHead.py │ │ ├── MapSegHead.py │ │ ├── __init__.py │ │ └── base_map_head.py │ ├── losses/ │ │ ├── __init__.py │ │ ├── detr_loss.py │ │ └── seg_loss.py │ ├── mapers/ │ │ ├── MapTracker.py │ │ ├── __init__.py │ │ ├── base_mapper.py │ │ └── vector_memory.py │ ├── necks/ │ │ ├── __init__.py │ │ └── gru.py │ ├── transformer_utils/ │ │ ├── CustomMSDeformableAttention.py │ │ ├── MapTransformer.py │ │ ├── __init__.py │ │ ├── base_transformer.py │ │ ├── deformable_transformer.py │ │ └── fp16_dattn.py │ └── utils/ │ ├── __init__.py │ ├── query_update.py │ └── renderer_track.py ├── requirements.txt └── tools/ ├── benchmark.py ├── data_converter/ │ ├── __init__.py │ ├── argoverse_converter.py │ ├── av2_train_split.txt │ ├── av2_val_split.txt │ ├── nusc_split.py │ └── nuscenes_converter.py ├── dist_test.sh ├── dist_train.sh ├── mmdet_test.py ├── mmdet_train.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── tracking/ │ ├── calculate_cmap.py │ ├── cmap_utils/ │ │ ├── __init__.py │ │ ├── data_utils.py │ │ ├── match_utils.py │ │ └── utils.py │ ├── prepare_gt_tracks.py │ └── prepare_pred_tracks.py ├── train.py └── visualization/ ├── vis_global.py └── vis_per_frame.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class *.ipynb # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # cython generated cpp data .vscode .idea # custom *.pkl *.gif *.pkl.json *.log.json work_dirs/ work_dirs_bak/ debug_img/ model_file/ exps/ *~ mmdet3d/.mim # Pytorch *.pth # demo demo/ *.jpg *.png *.obj *.ply *.zip *.tar *.tar.gz *.json # datasets /datasets /data_ann /datasets_local # softlinks av2 nuScenes # viz viz viz_bak *pkl* demo mmdetection3d work_dirs vis_global vis_local ================================================ FILE: LICENSE ================================================ The code, data, and model weights in this repository are not allowed for commercial usage. For research purposes, the terms follow the GPLv3 as in the separate file "LICENSE_GPL". -- Authors of the paper "MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD Mapping". ================================================ FILE: LICENSE_GPL ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 
Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. 
Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. 
You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. 
"Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. 
Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. 
It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . ================================================ FILE: README.md ================================================

MapTracker: Tracking with Strided Memory Fusion for
Consistent Vector HD Mapping

ECCV 2024 (Oral)

[Jiacheng Chen*1](https://jcchen.me), [Yuefan Wu*1](https://ivenwu.com/), [Jiaqi Tan*1](https://www.linkedin.com/in/jiaqi-christina-tan-800697158/), [Hang Ma1](https://www.cs.sfu.ca/~hangma/), [Yasutaka Furukawa1,2](https://www2.cs.sfu.ca/~furukawa/) (1 Simon Fraser University, 2 Wayve) ([arXiv](https://arxiv.org/abs/2403.15951), [Project page](https://map-tracker.github.io/))
https://github.com/woodfrog/maptracker/assets/13405255/1c0e072a-cb77-4000-b81b-5b9fd40f8f39 This repository provides the official implementation of the paper [MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD Mapping](https://arxiv.org/abs/2403.15951). MapTracker reconstructs temporally consistent vector HD maps, and the local maps can be progressively merged into a global reconstruction. This repository is built upon [StreamMapNet](https://github.com/yuantianyuan01/StreamMapNet). ## Table of Contents - [Introduction](#introduction) - [Model Architecture](#model-architecture) - [Installation](#installation) - [Data preparation](#data-preparation) - [Getting Started](#getting-started) - [Acknowledgements](#acknowledgements) - [Citation](#citation) - [License](#license) ## Introduction This paper presents a vector HD-mapping algorithm that formulates mapping as a tracking task and uses a history of memory latents to ensure consistent reconstructions over time. Our method, MapTracker, accumulates a sensor stream into memory buffers of two latent representations: 1) raster latents in the bird's-eye-view (BEV) space and 2) vector latents over the road elements (i.e., pedestrian crossings, lane dividers, and road boundaries). The approach borrows the query propagation paradigm from the tracking literature, which explicitly associates tracked road elements from the previous frame to the current one, while fusing a subset of memory latents selected with distance strides to further enhance temporal consistency. A vector latent is decoded to reconstruct the geometry of a road element. The paper further makes benchmark contributions by 1) improving the processing code for existing datasets to produce consistent ground truth with temporal alignments and 2) augmenting existing mAP metrics with consistency checks. MapTracker significantly outperforms existing methods on both the nuScenes and Argoverse2 datasets, by over 8% and 19% on the conventional and the new consistency-aware metrics, respectively. ## Model Architecture ![visualization](docs/fig/arch.png) (Top) The architecture of MapTracker, consisting of the BEV and VEC modules and their memory buffers. (Bottom) Close-up views of the BEV and vector fusion layers. The **BEV Module** takes ConvNet features of the onboard perspective images, the BEV memory buffer $\{M_{\text{BEV}}(t-1), M_{\text{BEV}}(t-2), \dots\}$, and vehicle motions $\{P^t_{t-1}, P^t_{t-2}, \dots\}$ as input. It propagates the previous BEV memory $M_{\text{BEV}}(t-1)$ based on the vehicle motion to initialize $M_{\text{BEV}}(t)$. In the BEV Memory Fusion layer, $M_{\text{BEV}}(t)$ is integrated with selected history BEV memories $\{M_{\text{BEV}}^{*}(t'), t'\in \pi(t)\}$, which is used for semantic segmentation and passed to the VEC Module. The **VEC Module** propagates the previous vector latent memory $M_{\text{VEC}}(t-1)$ with a PropMLP to initialize the vector queries $M_{\text{VEC}}(t)$. In the Vector Memory Fusion layer, each propagated $M_{\text{VEC}}(t)$ is fused with its selected history vector memories $\{M_{\text{VEC}}^{*}(t'), t' \in \pi(t)\}$. The final vector latents are decoded to reconstruct the road elements. ## Installation Please refer to the [installation guide](docs/installation.md) to set up the environment. ## Data preparation For instructions on downloading and preparing the nuScenes and Argoverse2 datasets, as well as downloading our checkpoints, please see the [data preparation guide](docs/data_preparation.md).
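To make the distance-strided memory selection $\pi(t)$ described in the Model Architecture section above more concrete, here is a minimal, self-contained sketch of one way history frames could be picked by distance strides. The function name `select_strided_memories`, the stride values, and the use of raw ego positions are illustrative assumptions for this README rather than the repository's actual API; the real selection and fusion logic lives in the model code under `plugin/models/` and may differ in detail.

```python
import numpy as np

def select_strided_memories(ego_positions, strides=(1.0, 5.0, 10.0, 20.0)):
    """Toy example of picking history frames by distance strides.

    ego_positions: (T, 2) array of ego x/y positions; index T-1 is the current frame.
    strides: target travelled distances in meters (values here are illustrative).
    Returns a sorted list of past frame indices, i.e. one possible pi(t).
    """
    ego_positions = np.asarray(ego_positions, dtype=float)
    cur = ego_positions[-1]
    # Distance of every *past* frame's ego position from the current one.
    dists = np.linalg.norm(ego_positions[:-1] - cur, axis=1)
    if dists.size == 0:
        return []
    # For each target stride, keep the past frame whose distance matches it best.
    selected = {int(np.argmin(np.abs(dists - s))) for s in strides}
    return sorted(selected)

# Example: a vehicle moving forward roughly 2 m per frame.
positions = [(2.0 * i, 0.0) for i in range(12)]
print(select_strided_memories(positions))  # -> [1, 6, 8, 10]
```

Note that this sketch only covers the choice of which past frames to fuse; the warping of BEV memories by relative poses and the attention-based fusion itself are separate steps handled inside the model.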
## Getting Started For instructions on how to run training, inference, evaluation, and visualization, please follow the [getting started guide](docs/getting_started.md). ## Acknowledgements We're grateful to the open-source projects below; their great work made our project possible: * BEV perception: [BEVFormer](https://github.com/fundamentalvision/BEVFormer) ![GitHub stars](https://img.shields.io/github/stars/fundamentalvision/BEVFormer.svg?style=flat&label=Star) * Vector HD mapping: [StreamMapNet](https://github.com/yuantianyuan01/StreamMapNet) ![GitHub stars](https://img.shields.io/github/stars/yuantianyuan01/StreamMapNet.svg?style=flat&label=Star), [MapTR](https://github.com/hustvl/MapTR) ![GitHub stars](https://img.shields.io/github/stars/hustvl/MapTR.svg?style=flat&label=Star) ## Citation If you find MapTracker useful in your research or applications, please consider citing: ``` @inproceedings{chen2024maptracker, author = {Chen, Jiacheng and Wu, Yuefan and Tan, Jiaqi and Ma, Hang and Furukawa, Yasutaka}, title = {MapTracker: Tracking with Strided Memory Fusion for Consistent Vector HD Mapping}, journal = {arXiv preprint arXiv:2403.15951}, year = {2024} } ``` ## License This project is licensed under the GPL; see the [license file](LICENSE) for details. ================================================ FILE: docs/data_preparation.md ================================================ # Data Preparation Compared to the data preparation procedure of StreamMapNet or MapTR, we have one more step to generate the ground-truth tracking information (Step 3). We noticed that the track generation results can be slightly different when running on different machines (likely because Shapely behaves slightly differently across machines), **so please always run Step 3 below on the training machine to generate the gt tracking information**. ## nuScenes **Step 1.** Download the [nuScenes](https://www.nuscenes.org/download) dataset to `./datasets/nuscenes`. **Step 2.** Generate annotation files for the nuScenes dataset (same as StreamMapNet) ``` python tools/data_converter/nuscenes_converter.py --data-root ./datasets/nuscenes ``` Add ``--newsplit`` to generate the metadata for the new split (geography-based split) provided by StreamMapNet. **Step 3.** Generate the tracking ground truth by ``` python tools/tracking/prepare_gt_tracks.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py --out-dir tracking_gts/nuscenes --visualize ``` Add the ``--visualize`` flag to visualize the data with element IDs derived from our track generation process, or remove it to save disk space. To generate the ground-truth tracks for the new split, change the config file accordingly. ## Argoverse2 **Step 1.** Download the [Argoverse2 (sensor)](https://argoverse.github.io/user-guide/getting_started.html#download-the-datasets) dataset to `./datasets/av2`. **Step 2.** Generate annotation files for the Argoverse2 dataset.
``` python tools/data_converter/argoverse_converter.py --data-root ./datasets/av2 ``` **Step 3.** Generate the tracking ground truth by ``` python tools/tracking/prepare_gt_tracks.py plugin/configs/maptracker/av2_oldsplit/maptracker_av2_oldsplit_5frame_span10_stage3_joint_finetune.py --out-dir tracking_gts/av2 --visualize ``` ## Checkpoints We provide the checkpoints at [this Dropbox link](https://www.dropbox.com/scl/fo/miulg8q9oby7q2x5vemme/ALoxX1HyxGlfR9y3xlqfzeE?rlkey=i3rw4mbq7lacblc7xsnjkik1u&dl=0) or [this HuggingFace repo](https://huggingface.co/cccjc/maptracker/tree/main). Please download and place them under ``./work_dirs/pretrained_ckpts``. ## File structures Make sure the final file structure looks like the one below: ``` maptracker ├── mmdetection3d ├── tools ├── plugin │ ├── configs │ ├── models │ ├── datasets │ ├── ... ├── work_dirs │ ├── pretrained_ckpts │ │ ├── maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune │ │ │ ├── latest.pth │ │ ├── ... │ ├── .... ├── datasets │ ├── nuscenes │ │ ├── maps <-- used │ │ ├── samples <-- key frames │ │ ├── v1.0-test <-- metadata │ │ ├── v1.0-trainval <-- metadata and annotations │ │ ├── nuscenes_map_infos_train_{newsplit}.pkl <-- train annotations │ │ ├── nuscenes_map_infos_train_{newsplit}_gt_tracks.pkl <-- train gt tracks │ │ ├── nuscenes_map_infos_val_{newsplit}.pkl <-- val annotations │ │ ├── nuscenes_map_infos_val_{newsplit}_gt_tracks.pkl <-- val gt tracks │ ├── av2 │ │ ├── train │ │ ├── val │ │ ├── test │ │ ├── maptrv2_val_samples_info.pkl <-- MapTR's av2 metadata, used to align the val set │ │ ├── av2_map_infos_train_{newsplit}.pkl <-- train annotations │ │ ├── av2_map_infos_train_{newsplit}_gt_tracks.pkl <-- train gt tracks │ │ ├── av2_map_infos_val_{newsplit}.pkl <-- val annotations │ │ ├── av2_map_infos_val_{newsplit}_gt_tracks.pkl <-- val gt tracks ``` ================================================ FILE: docs/getting_started.md ================================================ # Getting started with MapTracker In this document, we provide the commands for running inference/evaluation, training, and visualization. ## Inference and evaluation ### Inference and evaluate with Chamfer-based mAP Run the following command to do inference and evaluation using the pretrained checkpoints, assuming 8 GPUs are used. ``` CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash tools/dist_test.sh plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py work_dirs/pretrained_ckpts/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/latest.pth 8 --eval --eval-options save_semantic=True ``` Set ``--eval-options save_semantic=True`` to also save the semantic segmentation results of the BEV module.
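The evaluation and tracking scripts in this guide exchange results as pickle files (e.g. the ``pos_predictions.pkl`` used in the C-mAP and visualization examples below). If you want to inspect such a file programmatically, it can usually be loaded with ``mmcv``. This is only a quick-inspection sketch: the path is copied from the C-mAP example and will differ for your run, and the internal key layout of the pickle is not documented here, so the snippet only peeks at the top level.

```python
import mmcv

# Path from the C-mAP example below; substitute the file produced by your own run.
result_path = ('work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/'
               'pos_predictions.pkl')

results = mmcv.load(result_path)  # mmcv picks the pickle handler from the .pkl suffix
print(type(results))
if isinstance(results, dict):
    print(list(results.keys())[:10])       # peek at the top-level keys
elif isinstance(results, (list, tuple)):
    print(len(results), type(results[0]))  # number of entries and their type
```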
### Evaluate with C-mAP Generate the prediction matching information by ``` python tools/tracking/prepare_pred_tracks.py ${CONFIG} --result_path ${SUBMISSION_FILE} --cons_frames ${COMEBACK_FRAMES} ``` Then evaluate with C-mAP by ``` python tools/tracking/calculate_cmap.py ${CONFIG} --result_path ${PRED_MATCHING_INFO} ``` An example evaluation: ``` python tools/tracking/calculate_cmap.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py --result_path ./work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/pos_predictions.pkl ``` ### Results By running with the checkpoints we provided in the [data preparation guide](data_preparation.md), the expected results are: | Dataset | Split | Divider | Crossing | Boundary | mAP | C-mAP | |:----------:|:-----:|:-------:|:--------:|:--------:|:-----:|:-----:| | nuScenes | old | 74.14 | 80.04 | 74.06 | 76.08 | 69.13 | | nuScenes | new | 30.10 | 45.86 | 45.06 | 40.34 | 32.50 | | Argoverse2 | old | 76.99 | 79.97 | 73.66 | 76.87 | 68.35 | | Argoverse2 | new | 75.11 | 69.96 | 68.95 | 71.34 | 63.11 | ## Training The training consists of three stages as detailed in the paper. We train the models on 8 NVIDIA RTX A5000 GPUs. **Stage 1**: BEV pretraining with semantic segmentation losses: ``` bash ./tools/dist_train.sh plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage1_bev_pretrain.py 8 ``` **Stage 2**: Vector module warmup with a large batch size while freezing the BEV module: ``` bash ./tools/dist_train.sh plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage2_warmup.py 8 ``` Set ``load_from=...`` in the config file so that it loads the checkpoint from stage 1. **Stage 3**: Joint finetuning: ``` bash ./tools/dist_train.sh plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py 8 ``` Set ``load_from=...`` in the config file so that it loads the checkpoint from stage 2. ## Visualization ### Global merged reconstruction (merged from local HD maps) ```bash python tools/visualization/vis_global.py [path to method configuration file under plugin/configs] \ --data_path [path to the .pkl file] \ --out_dir [path to the output folder] \ --option [vis-pred / vis-gt: visualize predicted vectors / visualize ground truth vectors] \ --per_frame_result 1 ``` Set ``--per_frame_result`` to 1 to generate the per-frame video (this visualization is a bit slow); set it to 0 to only produce the final merged global reconstruction.
Examples: ```bash # Visualize MapTracker's prediction python tools/visualization/vis_global.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py \ --data_path work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/pos_predictions.pkl \ --out_dir vis_global/nuscenes_old/maptracker \ --option vis-pred --per_frame_result 1 # Visualize ground truth data python tools/visualization/vis_global.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py \ --data_path datasets/nuscenes/nuscenes_map_infos_val_gt_tracks.pkl \ --out_dir vis_global/nuscenes_old/gt \ --option vis-gt --per_frame_result 0 ``` ### Local HD map reconstruction ```bash python tools/visualization/vis_per_frame.py [path to method configuration file under plugin/configs] \ --data_path [path to the .pkl file] \ --out_dir [path to the output folder] \ --option [vis-pred / vis-gt: visualize predicted vectors / visualize ground truth vectors and input video streams] ``` Note that the input perspective-view videos will be saved when generating the ground truth visualization. Examples: ```bash # Visualize MapTracker's prediction python tools/visualization/vis_per_frame.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py \ --data_path work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune/pos_predictions.pkl \ --out_dir vis_local/nuscenes_old/maptracker \ --option vis-pred # Visualize ground truth data python tools/visualization/vis_per_frame.py plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py \ --data_path datasets/nuscenes/nuscenes_map_infos_val_gt_tracks.pkl \ --out_dir vis_local/nuscenes_old/gt \ --option vis-gt ``` ================================================ FILE: docs/installation.md ================================================ # Environment Setup We use the same environment as StreamMapNet, and the setup instructions below are largely borrowed from their repo. **Step 1.** Create the conda environment and activate it: ``` conda create --name maptracker python=3.8 -y conda activate maptracker ``` **Step 2.** Install PyTorch. ``` pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html ``` **Step 3.** Install the MMCV series. ``` # Install mmcv-series pip install mmcv-full==1.6.0 pip install mmdet==2.28.2 pip install mmsegmentation==0.30.0 git clone https://github.com/open-mmlab/mmdetection3d.git cd mmdetection3d git checkout v1.0.0rc6 pip install -e . ``` **Step 4.** Install other requirements.
``` pip install -r requirements.txt ``` ================================================ FILE: plugin/__init__.py ================================================ from .models import * from .datasets import * ================================================ FILE: plugin/configs/_base_/datasets/coco_instance.py ================================================ dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) evaluation = dict(metric=['bbox', 'segm']) ================================================ FILE: plugin/configs/_base_/datasets/kitti-3d-3class.py ================================================ # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] point_cloud_range = [0, -40, -3, 70.4, 40, 1] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=6, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=1, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/kitti-3d-car.py ================================================ # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Car'] point_cloud_range = [0, -40, -3, 70.4, 40, 1] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15)) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=6, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=1, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/lyft-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-80, -80, -5, 80, 80, 3] # For Lyft we usually do 9-class detection class_names = [ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal' ] dataset_type = 'LyftDataset' data_root = 'data/lyft/' # Input modality for Lyft dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/lyft/': 's3://lyft/lyft/', # 'data/lyft/': 's3://lyft/lyft/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_test.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True)) # For Lyft dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/nuim_instance.py ================================================ dataset_type = 'CocoDataset' data_root = 'data/nuimages/' class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict( type='Resize', img_scale=[(1280, 720), (1920, 1080)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1600, 900), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-train.json', img_prefix=data_root, classes=class_names, pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-val.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-val.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline)) evaluation = dict(metric=['bbox', 'segm']) ================================================ FILE: plugin/configs/_base_/datasets/nus-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-50, -50, -5, 50, 50, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/nuscenes/': 's3://nuscenes/nuscenes/', # 'data/nuscenes/': 's3://nuscenes/nuscenes/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/nus-mono3d.py ================================================ dataset_type = 'NuScenesMonoDataset' data_root = 'data/nuscenes/' class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='LoadAnnotations3D', with_bbox=True, with_label=True, with_attr_label=True, with_bbox_3d=True, with_label_3d=True, with_bbox_depth=True), dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', 'gt_labels_3d', 'centers2d', 'depths' ]), ] test_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='MultiScaleFlipAug', scale_factor=1.0, flip=False, transforms=[ dict(type='RandomFlip3D'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img']), ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=train_pipeline, modality=input_modality, test_mode=False, box_type_3d='Camera'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline, modality=input_modality, test_mode=True, box_type_3d='Camera'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline, modality=input_modality, test_mode=True, box_type_3d='Camera')) evaluation = dict(interval=2) ================================================ FILE: plugin/configs/_base_/datasets/range100_lyft-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-100, -100, -5, 100, 100, 3] # For Lyft we usually do 9-class detection class_names = [ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal' ] dataset_type = 'LyftDataset' data_root = 'data/lyft/' # Input modality for Lyft dataset, this is consistent with the submission # format which requires the information in input_modality. 
input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/lyft/': 's3://lyft/lyft/', # 'data/lyft/': 's3://lyft/lyft/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_test.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True)) # For Lyft dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/s3dis_seg-3d-13class.py ================================================ # dataset settings dataset_type = 'S3DISSegDataset' data_root = './data/s3dis/' class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') num_points = 4096 train_area = [1, 2, 3, 4, 6] test_area = 5 train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=tuple(range(len(class_names))), max_cat_id=13), dict( type='IndoorPatchPointSample', num_points=num_points, block_size=1.0, ignore_index=len(class_names), use_normalized_coord=True, enlarge_size=0.2, min_unique_num=None), dict(type='NormalizePointsColor', color_mean=None), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict(type='NormalizePointsColor', color_mean=None), dict( # a wrapper in order to successfully call test function # actually we don't perform test-time-aug type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.0, flip_ratio_bev_vertical=0.0), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) # we need to load gt seg_mask! 
eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=tuple(range(len(class_names))), max_cat_id=13), dict( type='DefaultFormatBundle3D', with_label=False, class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, # train on area 1, 2, 3, 4, 6 # test on area 5 train=dict( type=dataset_type, data_root=data_root, ann_files=[ data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area ], pipeline=train_pipeline, classes=class_names, test_mode=False, ignore_index=len(class_names), scene_idxs=[ data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area ]), val=dict( type=dataset_type, data_root=data_root, ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names), scene_idxs=data_root + f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'), test=dict( type=dataset_type, data_root=data_root, ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names))) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/scannet-3d-18class.py ================================================ # dataset settings dataset_type = 'ScanNetDataset' data_root = './data/scannet/' class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin') train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_mask_3d=True, with_seg_3d=True), dict(type='GlobalAlignment', rotation_axis=2), dict( type='PointSegClassMapping', valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict(type='IndoorPointSample', num_points=40000), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.087266, 0.087266], scale_ratio_range=[1.0, 1.0], shift_height=True), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', 'pts_instance_mask' ]) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict(type='GlobalAlignment', rotation_axis=2), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='IndoorPointSample', num_points=40000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2]), dict(type='GlobalAlignment', rotation_axis=2), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_train.pkl', pipeline=train_pipeline, filter_empty_gt=False, classes=class_names, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='Depth')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth')) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/scannet_seg-3d-20class.py ================================================ # dataset settings dataset_type = 'ScanNetSegDataset' data_root = './data/scannet/' class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'otherfurniture') num_points = 8192 train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict( type='IndoorPatchPointSample', num_points=num_points, block_size=1.5, ignore_index=len(class_names), use_normalized_coord=False, enlarge_size=0.2, min_unique_num=None), dict(type='NormalizePointsColor', color_mean=None), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict(type='NormalizePointsColor', color_mean=None), dict( # a wrapper in order to successfully call test function # actually we don't perform test-time-aug type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.0, flip_ratio_bev_vertical=0.0), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) # we need to load gt seg_mask! 
eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict( type='DefaultFormatBundle3D', with_label=False, class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_train.pkl', pipeline=train_pipeline, classes=class_names, test_mode=False, ignore_index=len(class_names), scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names)), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names))) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/sunrgbd-3d-10class.py ================================================ dataset_type = 'SUNRGBDDataset' data_root = 'data/sunrgbd/' class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 'night_stand', 'bookshelf', 'bathtub') train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict(type='LoadAnnotations3D'), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict( type='GlobalRotScaleTrans', rot_range=[-0.523599, 0.523599], scale_ratio_range=[0.85, 1.15], shift_height=True), dict(type='IndoorPointSample', num_points=20000), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict(type='IndoorPointSample', num_points=20000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2]), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=16, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_train.pkl', pipeline=train_pipeline, classes=class_names, filter_empty_gt=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='Depth')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth')) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/waymoD5-3d-3class.py ================================================ # dataset settings # D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://waymo_data/')) class_names = ['Car', 'Pedestrian', 'Cyclist'] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/datasets/waymoD5-3d-car.py ================================================ # dataset settings # D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://waymo_data/')) class_names = ['Car'] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: plugin/configs/_base_/default_runtime.py ================================================ checkpoint_config = dict(interval=1) # yapf:disable push # By default we use textlogger hook and tensorboard # For more loggers see # https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = None resume_from = None workflow = [('train', 1)] ================================================ FILE: plugin/configs/_base_/models/3dssd.py ================================================ model = dict( type='SSD3DNet', backbone=dict( type='PointNet2SAMSG', in_channels=4, num_points=(4096, 512, (256, 256)), radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 64, 128), (64, 96, 128)), ((128, 128, 256), (128, 192, 256), (128, 256, 256))), aggregation_channels=(64, 128, 256), fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), fps_sample_range_lists=((-1), (-1), (512, -1)), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)), bbox_head=dict( type='SSD3DHead', in_channels=256, vote_module_cfg=dict( in_channels=256, num_points=256, gt_per_seed=1, conv_channels=(128, ), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), with_res_feat=False, vote_xyz_range=(3.0, 3.0, 2.0)), vote_aggregation_cfg=dict( type='PointSAModuleMSG', num_point=256, radii=(4.8, 6.4), sample_nums=(16, 32), mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), use_xyz=True, normalize_xyz=False, bias=True), pred_layer_cfg=dict( in_channels=1536, shared_conv_channels=(512, 128), cls_conv_channels=(128, ), reg_conv_channels=(128, ), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), objectness_loss=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0), center_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), corner_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict( sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), test_cfg=dict( nms_cfg=dict(type='nms', 
iou_thr=0.1), sample_mod='spec', score_thr=0.0, per_class_proposal=True, max_output_num=100)) ================================================ FILE: plugin/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py ================================================ # model settings model = dict( type='CascadeRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), roi_head=dict( type='CascadeRoIHead', num_stages=3, stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=80, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), 
mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ]), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: plugin/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py ================================================ voxel_size = [0.1, 0.1, 0.2] model = dict( type='CenterPoint', pts_voxel_layer=dict( max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[41, 1024, 1024], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='CenterHead', in_channels=sum([256, 256]), tasks=[ dict(num_class=1, class_names=['car']), dict(num_class=2, class_names=['truck', 'construction_vehicle']), dict(num_class=2, class_names=['bus', 'trailer']), dict(num_class=1, class_names=['barrier']), dict(num_class=2, class_names=['motorcycle', 'bicycle']), dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), ], common_heads=dict( reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), share_conv_channel=64, bbox_coder=dict( type='CenterPointBBoxCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_num=500, score_threshold=0.1, out_size_factor=8, voxel_size=voxel_size[:2], code_size=9), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), norm_bbox=True), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[1024, 1024, 40], voxel_size=voxel_size, out_size_factor=8, dense_reg=1, gaussian_overlap=0.1, max_objs=500, min_radius=2, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), test_cfg=dict( pts=dict( post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], score_threshold=0.1, out_size_factor=8, voxel_size=voxel_size[:2], nms_type='rotate', pre_max_size=1000, post_max_size=83, nms_thr=0.2))) ================================================ FILE: 
plugin/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py ================================================ voxel_size = [0.2, 0.2, 8] model = dict( type='CenterPoint', pts_voxel_layer=dict( max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), pts_voxel_encoder=dict( type='PillarFeatureNet', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=(0.2, 0.2, 8), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), legacy=False), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), pts_backbone=dict( type='SECOND', in_channels=64, out_channels=[64, 128, 256], layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], out_channels=[128, 128, 128], upsample_strides=[0.5, 1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='CenterHead', in_channels=sum([128, 128, 128]), tasks=[ dict(num_class=1, class_names=['car']), dict(num_class=2, class_names=['truck', 'construction_vehicle']), dict(num_class=2, class_names=['bus', 'trailer']), dict(num_class=1, class_names=['barrier']), dict(num_class=2, class_names=['motorcycle', 'bicycle']), dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), ], common_heads=dict( reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), share_conv_channel=64, bbox_coder=dict( type='CenterPointBBoxCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_num=500, score_threshold=0.1, out_size_factor=4, voxel_size=voxel_size[:2], code_size=9), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), norm_bbox=True), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, out_size_factor=4, dense_reg=1, gaussian_overlap=0.1, max_objs=500, min_radius=2, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), test_cfg=dict( pts=dict( post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], score_threshold=0.1, pc_range=[-51.2, -51.2], out_size_factor=4, voxel_size=voxel_size[:2], nms_type='rotate', pre_max_size=1000, post_max_size=83, nms_thr=0.2))) ================================================ FILE: plugin/configs/_base_/models/fcos3d.py ================================================ model = dict( type='FCOSMono3D', pretrained='open-mmlab://detectron2/resnet101_caffe', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs='on_output', num_outs=5, relu_before_extra_convs=True), bbox_head=dict( type='FCOSMono3DHead', num_classes=10, in_channels=256, stacked_convs=2, feat_channels=256, use_direction_classifier=True, diff_rad_by_sin=True, pred_attrs=True, pred_velo=True, dir_offset=0.7854, # pi/4 strides=[8, 16, 32, 64, 128], group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo cls_branch=(256, ), reg_branch=( (256, ), # offset (256, ), # depth (256, ), # size (256, ), # rot () # velo ), 
dir_branch=(256, ), attr_branch=(256, ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_attr=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_centerness=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), norm_on_bbox=True, centerness_on_reg=True, center_sampling=True, conv_bias=True, dcn_on_last_conv=True), train_cfg=dict( allowed_border=0, code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.8, score_thr=0.05, min_bbox_size=0, max_per_img=200)) ================================================ FILE: plugin/configs/_base_/models/groupfree3d.py ================================================ model = dict( type='GroupFree3DNet', backbone=dict( type='PointNet2SASSG', in_channels=3, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 288)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)), bbox_head=dict( type='GroupFree3DHead', in_channels=288, num_decoder_layers=6, num_proposal=256, transformerlayers=dict( type='BaseTransformerLayer', attn_cfgs=dict( type='GroupFree3DMHA', embed_dims=288, num_heads=8, attn_drop=0.1, dropout_layer=dict(type='Dropout', drop_prob=0.1)), ffn_cfgs=dict( embed_dims=288, feedforward_channels=2048, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True)), operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')), pred_layer_cfg=dict( in_channels=288, shared_conv_channels=(288, 288), bias=True), sampling_objectness_loss=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=8.0), objectness_loss=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), center_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict(sample_mod='kps'), test_cfg=dict( sample_mod='kps', nms_thr=0.25, score_thr=0.0, per_class_proposal=True, prediction_stages='last')) ================================================ FILE: plugin/configs/_base_/models/h3dnet.py ================================================ primitive_z_cfg = dict( type='PrimitiveHead', num_dims=2, num_classes=18, primitive_mode='z', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), 
conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) primitive_xy_cfg = dict( type='PrimitiveHead', num_dims=1, num_classes=18, primitive_mode='xy', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) primitive_line_cfg = dict( type='PrimitiveHead', num_dims=0, num_classes=18, primitive_mode='line', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) model = dict( type='H3DNet', backbone=dict( type='MultiBackbone', num_streams=4, suffixes=['net0', 'net1', 'net2', 'net3'], conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), act_cfg=dict(type='ReLU'), backbones=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( 
type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True))), rpn_head=dict( type='VoteHead', vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), roi_head=dict( type='H3DRoIHead', primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], bbox_head=dict( type='H3DBboxHead', gt_per_seed=3, num_proposal=256, suface_matching_cfg=dict( type='PointSAModule', num_point=256 * 6, radius=0.5, num_sample=32, mlp_channels=[128 + 6, 128, 64, 32], use_xyz=True, normalize_xyz=True), line_matching_cfg=dict( type='PointSAModule', num_point=256 * 12, radius=0.5, num_sample=32, mlp_channels=[128 + 12, 128, 64, 32], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), primitive_refine_channels=[128, 128, 128], upper_thresh=100.0, surface_thresh=0.5, line_thresh=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), cues_objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.3, 0.7], reduction='mean', loss_weight=5.0), cues_semantic_loss=dict( type='CrossEntropyLoss', class_weight=[0.3, 0.7], reduction='mean', loss_weight=5.0), proposal_objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='none', loss_weight=5.0), primitive_center_loss=dict( type='MSELoss', reduction='none', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), rpn_proposal=dict(use_nms=False), rcnn=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote', far_threshold=0.6, near_threshold=0.3, mask_surface_threshold=0.3, label_surface_threshold=0.3, mask_line_threshold=0.3, label_line_threshold=0.3)), test_cfg=dict( rpn=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True, use_nms=False), rcnn=dict( sample_mod='seed', 
nms_thr=0.25, score_thr=0.05, per_class_proposal=True))) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_fpn_lyft.py ================================================ _base_ = './hv_pointpillars_fpn_nus.py' # model settings (based on nuScenes model settings) # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. model = dict( pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-80, -80, -5, 80, 80, 3], max_voxels=(60000, 60000)), pts_voxel_encoder=dict( feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), pts_middle_encoder=dict(output_shape=[640, 640]), pts_bbox_head=dict( num_classes=9, anchor_generator=dict( ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), # model training settings (based on nuScenes model settings) train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_fpn_nus.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. voxel_size = [0.25, 0.25, 8] model = dict( type='MVXFasterRCNN', pts_voxel_layer=dict( max_num_points=64, point_cloud_range=[-50, -50, -5, 50, 50, 3], voxel_size=voxel_size, max_voxels=(30000, 40000)), pts_voxel_encoder=dict( type='HardVFE', in_channels=4, feat_channels=[64, 64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=[-50, -50, -5, 50, 50, 3], norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), pts_backbone=dict( type='SECOND', in_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), pts_neck=dict( type='FPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), act_cfg=dict(type='ReLU'), in_channels=[64, 128, 256], out_channels=256, start_level=0, num_outs=3), pts_bbox_head=dict( type='Anchor3DHead', num_classes=10, in_channels=256, feat_channels=256, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ [0.8660, 2.5981, 1.], # 1.5/sqrt(3) [0.5774, 1.7321, 1.], # 1/sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], custom_values=[0, 0], rotations=[0, 1.57], reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], pos_weight=-1, 
debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.2, score_thr=0.05, min_bbox_size=0, max_num=500))) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py ================================================ _base_ = './hv_pointpillars_fpn_nus.py' # model settings (based on nuScenes model settings) # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. model = dict( pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-100, -100, -5, 100, 100, 3], max_voxels=(60000, 60000)), pts_voxel_encoder=dict( feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), pts_middle_encoder=dict(output_shape=[800, 800]), pts_bbox_head=dict( num_classes=9, anchor_generator=dict( ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), # model training settings (based on nuScenes model settings) train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_secfpn_kitti.py ================================================ voxel_size = [0.16, 0.16, 4] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=32, # max_points_per_voxel point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], voxel_size=voxel_size, max_voxels=(16000, 40000) # (training, testing) max_voxels ), voxel_encoder=dict( type='PillarFeatureNet', in_channels=4, feat_channels=[64], with_distance=False, voxel_size=voxel_size, point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), backbone=dict( type='SECOND', in_channels=64, layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -39.68, -0.6, 70.4, 39.68, -0.6], [0, -39.68, -0.6, 70.4, 39.68, -0.6], [0, -39.68, -1.78, 70.4, 39.68, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, 
score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) ================================================ FILE: plugin/configs/_base_/models/hv_pointpillars_secfpn_waymo.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. voxel_size = [0.32, 0.32, 6] model = dict( type='MVXFasterRCNN', pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], voxel_size=voxel_size, max_voxels=(32000, 32000)), pts_voxel_encoder=dict( type='HardVFE', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), pts_backbone=dict( type='SECOND', in_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[3, 5, 5], layer_strides=[1, 2, 2], out_channels=[64, 128, 256]), pts_neck=dict( type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], [-74.88, -74.88, 0, 74.88, 74.88, 0]], sizes=[ [2.08, 4.73, 1.77], # car [0.84, 1.81, 1.77], # cyclist [0.84, 0.91, 1.74] # pedestrian ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=[ dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.25, score_thr=0.1, min_bbox_size=0, max_num=500))) ================================================ FILE: plugin/configs/_base_/models/hv_second_secfpn_kitti.py ================================================ voxel_size = [0.05, 0.05, 0.1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=5, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=voxel_size, max_voxels=(16000, 40000)), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseEncoder', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 
'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) ================================================ FILE: plugin/configs/_base_/models/hv_second_secfpn_waymo.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. 
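# For reference, a consistency check using only the values defined below: the
# point cloud range [-76.8, -51.2, -2, 76.8, 51.2, 4] spans 153.6 m x 102.4 m x 6 m,
# so with voxel_size=[0.08, 0.08, 0.1] the grid is 153.6/0.08 = 1920 voxels in x,
# 102.4/0.08 = 1280 in y and 6/0.1 = 60 in z. The middle encoder's
# sparse_shape=[61, 1280, 1920] lists these as [z, y, x], with one extra voxel
# along z as in the other SparseEncoder configs here (e.g. 41 = 40 + 1 for the
# KITTI range [0, -40, -3, 70.4, 40, 1] with 0.1 m z-voxels).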
voxel_size = [0.08, 0.08, 0.1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=10, point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], voxel_size=voxel_size, max_voxels=(80000, 90000)), voxel_encoder=dict(type='HardSimpleVFE', num_features=5), middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[61, 1280, 1920], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=384, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], [-76.8, -51.2, 0, 76.8, 51.2, 0], [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], sizes=[ [2.08, 4.73, 1.77], # car [0.84, 0.91, 1.74], # pedestrian [0.84, 1.81, 1.77] # cyclist ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1) ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.25, score_thr=0.1, min_bbox_size=0, max_num=500)) ================================================ FILE: plugin/configs/_base_/models/imvotenet_image.py ================================================ model = dict( type='ImVoteNet', img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), img_rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), img_roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( 
type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0))), # model training and testing settings train_cfg=dict( img_rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), img_rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), img_rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)), test_cfg=dict( img_rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), img_rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))) ================================================ FILE: plugin/configs/_base_/models/mask_rcnn_r50_fpn.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=80, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_num=1000, 
nms_thr=0.7, min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: plugin/configs/_base_/models/paconv_cuda_ssg.py ================================================ _base_ = './paconv_ssg.py' model = dict( backbone=dict( sa_cfg=dict( type='PAConvCUDASAModule', scorenet_cfg=dict(mlp_channels=[8, 16, 16])))) ================================================ FILE: plugin/configs/_base_/models/paconv_ssg.py ================================================ # model settings model = dict( type='EncoderDecoder3D', backbone=dict( type='PointNet2SASSG', in_channels=9, # [xyz, rgb, normalized_xyz] num_points=(1024, 256, 64, 16), radius=(None, None, None, None), # use kNN instead of ball query num_samples=(32, 32, 32, 32), sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, 512)), fp_channels=(), norm_cfg=dict(type='BN2d', momentum=0.1), sa_cfg=dict( type='PAConvSAModule', pool_mod='max', use_xyz=True, normalize_xyz=False, paconv_num_kernels=[16, 16, 16], paconv_kernel_input='w_neighbor', scorenet_input='w_neighbor_dist', scorenet_cfg=dict( mlp_channels=[16, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False))), decode_head=dict( type='PAConvHead', # PAConv model's decoder takes skip connections from beckbone # different from PointNet++, it also concats input features in the last # level of decoder, leading to `128 + 6` as the channel number fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), (128 + 6, 128, 128, 128)), channels=128, dropout_ratio=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, class_weight=None, # should be modified with dataset loss_weight=1.0)), # correlation loss to regularize PAConv's kernel weights loss_regularization=dict( type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide')) ================================================ FILE: plugin/configs/_base_/models/parta2.py ================================================ # model settings voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] model = dict( type='PartA2', voxel_layer=dict( max_num_points=5, # max_points_per_voxel point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(16000, 40000) # (training, testing) max_voxels ), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseUNet', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), rpn_head=dict( type='PartA2RPNHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, 
-40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78]], sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, assigner_per_size=True, assign_per_class=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), roi_head=dict( type='PartAggregationROIHead', num_classes=3, semantic_head=dict( type='PointwiseSemanticHead', in_channels=16, extra_width=0.2, seg_score_thr=0.3, num_classes=3, loss_seg=dict( type='FocalLoss', use_sigmoid=True, reduction='sum', gamma=2.0, alpha=0.25, loss_weight=1.0), loss_part=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), seg_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='max')), part_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='avg')), bbox_head=dict( type='PartA2BboxHead', num_classes=3, seg_in_channels=16, part_in_channels=4, seg_conv_channels=[64, 64], part_conv_channels=[64, 64], merge_conv_channels=[128, 128], down_conv_channels=[128, 256], bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), shared_fc_channels=[256, 512, 512, 512], cls_channels=[256, 256], reg_channels=[256, 256], dropout_ratio=0.1, roi_feat_size=14, with_corner_loss=True, loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, reduction='sum', loss_weight=1.0), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1) ], allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=9000, nms_post=512, max_num=512, nms_thr=0.8, score_thr=0, use_rotate_nms=False), rcnn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1) ], sampler=dict( type='IoUNegPiecewiseSampler', num=128, pos_fraction=0.55, neg_piece_fractions=[0.8, 0.2], neg_iou_piece_thrs=[0.55, 0.1], neg_pos_ub=-1, add_gt_as_proposals=False, return_iou=True), cls_pos_thr=0.75, cls_neg_thr=0.25)), test_cfg=dict( rpn=dict( nms_pre=1024, nms_post=100, max_num=100, nms_thr=0.7, score_thr=0, use_rotate_nms=True), rcnn=dict( use_rotate_nms=True, use_raw_score=True, nms_thr=0.01, score_thr=0.1))) 
================================================ FILE: plugin/configs/_base_/models/pointnet2_msg.py ================================================ _base_ = './pointnet2_ssg.py' # model settings model = dict( backbone=dict( _delete_=True, type='PointNet2SAMSG', in_channels=6, # [xyz, rgb], should be modified with dataset num_points=(1024, 256, 64, 16), radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)), num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, 128)), ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), (256, 384, 512))), aggregation_channels=(None, None, None, None), fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), fps_sample_range_lists=((-1), (-1), (-1), (-1)), dilated_group=(False, False, False, False), out_indices=(0, 1, 2, 3), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)), decode_head=dict( fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128), (128, 128, 128, 128)))) ================================================ FILE: plugin/configs/_base_/models/pointnet2_ssg.py ================================================ # model settings model = dict( type='EncoderDecoder3D', backbone=dict( type='PointNet2SASSG', in_channels=6, # [xyz, rgb], should be modified with dataset num_points=(1024, 256, 64, 16), radius=(0.1, 0.2, 0.4, 0.8), num_samples=(32, 32, 32, 32), sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, 512)), fp_channels=(), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=False)), decode_head=dict( type='PointNet2Head', fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), (128, 128, 128, 128)), channels=128, dropout_ratio=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, class_weight=None, # should be modified with dataset loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide')) ================================================ FILE: plugin/configs/_base_/models/votenet.py ================================================ model = dict( type='VoteNet', backbone=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)), bbox_head=dict( type='VoteHead', vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', 
loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), test_cfg=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)) ================================================ FILE: plugin/configs/_base_/schedules/cosine.py ================================================ # This schedule is mainly used by models with dynamic voxelization # optimizer lr = 0.003 # max learning rate optimizer = dict( type='AdamW', lr=lr, betas=(0.95, 0.99), # the momentum is changed during training weight_decay=0.001) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 10, min_lr_ratio=1e-5) momentum_config = None runner = dict(type='EpochBasedRunner', max_epochs=40) ================================================ FILE: plugin/configs/_base_/schedules/cyclic_20e.py ================================================ # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained for 24 epochs by default, we set evaluation # interval to be 20. Please change the interval accordingly if you do not # use a default schedule. # optimizer # This schedule is mainly used by models on nuScenes dataset optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, ) # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=20) ================================================ FILE: plugin/configs/_base_/schedules/cyclic_40e.py ================================================ # The schedule is usually used by models trained on KITTI dataset # The learning rate set in the cyclic schedule is the initial learning rate # rather than the max learning rate. Since the target_ratio is (10, 1e-4), # the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4 lr = 0.0018 # The optimizer follows the setting in SECOND.Pytorch, but here we use # the official AdamW optimizer implemented by PyTorch.
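# Worked example of the schedule described above, using only values from this
# file: the base lr is 0.0018, so with target_ratio=(10, 1e-4) the learning rate
# ramps up to 0.0018 * 10 = 0.018 during the first 40% of training
# (step_ratio_up=0.4) and then anneals down to 0.0018 * 1e-4 = 1.8e-7 over the
# remaining 60%.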
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) # We use cyclic learning rate and momentum schedule following SECOND.Pytorch # https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa # We implement them in mmcv, for more details, please refer to # https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa # https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, ) # Although the max_epochs is 40, this schedule is usually used together with # RepeatDataset with repeat ratio N, thus the actual max epoch # number could be Nx40 runner = dict(type='EpochBasedRunner', max_epochs=40) ================================================ FILE: plugin/configs/_base_/schedules/mmdet_schedule_1x.py ================================================ # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, step=[8, 11]) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: plugin/configs/_base_/schedules/schedule_2x.py ================================================ # optimizer # This schedule is mainly used by models on nuScenes dataset optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 1000, step=[20, 23]) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=24) ================================================ FILE: plugin/configs/_base_/schedules/schedule_3x.py ================================================ # optimizer # This schedule is mainly used by models on indoor datasets, # e.g., VoteNet on SUNRGBD and ScanNet lr = 0.008 # max learning rate optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict(policy='step', warmup=None, step=[24, 32]) # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=36) ================================================ FILE: plugin/configs/_base_/schedules/seg_cosine_150e.py ================================================ # optimizer # This schedule is mainly used on S3DIS dataset in segmentation task optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9) optimizer_config = dict(grad_clip=None) lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=150) ================================================ FILE: plugin/configs/_base_/schedules/seg_cosine_200e.py ================================================ # optimizer # This schedule is mainly used on ScanNet dataset in segmentation task optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01) optimizer_config = dict(grad_clip=None)
lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=200) ================================================ FILE: plugin/configs/_base_/schedules/seg_cosine_50e.py ================================================ # optimizer # This schedule is mainly used on S3DIS dataset in segmentation task optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001) optimizer_config = dict(grad_clip=None) lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=50) ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_100x50_newsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 1 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 12 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (100, 50) # bev range, 100m in x-axis, 50m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, 
num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), 
dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
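# For reference, the numbers implied by the settings above: with num_gpus=8 and
# batch_size=1, num_iters_per_epoch = 29293 // 8 = 3661, total_iters =
# 12 * 3661 = 43932, and the evaluation hook above (and the checkpoint hook
# below) fire every num_epochs_interval * num_iters_per_epoch = 2 * 3661 = 7322
# iterations. The paramwise_cfg above also scales the image backbone's learning
# rate to 5e-4 * 0.1 = 5e-5, while the rest of the model trains at 5e-4.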
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_100x50_newsplit_5frame_span10_stage2_warmup.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 6 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 3 num_epochs_interval = num_epochs total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (100, 50) # bev range, 100m in x-axis, 50m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=True, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, 
operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='PV_Map', img_shape=img_size, feat_down_sample=8, thickness=1, coords_dim=coords_dim, pv_mask=True, num_cams=num_cams, num_coords=3, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask', 'pv_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 
'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
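# NOTE (warmup schedule): with num_gpus=8 and batch_size=6,
#   num_iters_per_epoch = 29293 // (8 * 6) = 610
#   total_iters         = 3 * 610          = 1830
# so this warmup stage is short, and min_lr_ratio=0.95 keeps the learning rate nearly
# constant over it. freeze_bev=True (in the model config above) freezes the BEV backbone
# trained in stage 1, whose weights are loaded via load_from at the end of this file;
# mem_warmup_iters=500 presumably delays reliance on the memory for the first 500 iterations.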
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_100x50_newsplit_5frame_span10_stage1_bev_pretrain/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_100x50_newsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 2 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 20 num_epochs_interval = num_epochs // 5 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (100, 50) # bev range, 100m in x-axis, 50m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), 
embed_dims=bev_embed_dims, num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 
'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
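# NOTE (joint finetuning): unlike stages 1-2, the paramwise_cfg above assigns reduced
# learning-rate multipliers to the modules that were already trained earlier
# (image backbone x0.1; neck, BEV transformer, positional encoding and segmentation
# decoder x0.5), while the remaining vector/tracking components keep the full base
# lr of 5e-4. With num_epochs_interval = 20 // 5 = 4, evaluation and checkpointing
# run every 4 epochs' worth of iterations.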
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_100x50_newsplit_5frame_span10_stage2_warmup/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_newsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 1 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 12 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, 
num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 
'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
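# NOTE: this is the standard-range (60 m x 30 m) counterpart of the 100x50 configs above.
# In this BEV-pretraining stage the vector head is skipped and the memory is disabled
# (skip_vector_head=True, use_memory=False in the model config), so training effectively
# supervises only the BEV features and segmentation head; accordingly the eval pipeline
# keeps RasterizeMap and sets eval_semantic=True so the semantic masks can be scored.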
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_newsplit_5frame_span10_stage2_warmup.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 6 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 3 num_epochs_interval = num_epochs total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=True, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 
'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='PV_Map', img_shape=img_size, feat_down_sample=8, thickness=1, coords_dim=coords_dim, pv_mask=True, num_cams=num_cams, num_coords=3, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask', 'pv_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 
'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
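# NOTE (auxiliary PV supervision): the warmup train_pipeline above inserts a PV_Map step
# (feat_down_sample=8, thickness=1, num_cams=7, num_coords=3) which, judging from its
# parameters, rasterizes the map elements into each camera's perspective view at 1/8 of
# the input resolution and adds the result as the 'pv_mask' target collected by Collect3D.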
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_newsplit_5frame_span10_stage1_bev_pretrain/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_newsplit/maptracker_av2_newsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 2 num_iters_per_epoch = 29293 // (num_gpus * batch_size) num_epochs = 20 num_epochs_interval = num_epochs // 5 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, 
num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 
'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
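# NOTE (memory settings): compared with the warmup stage, mem_warmup_iters is -1 here,
# i.e. the memory is used from the first iteration of joint finetuning. The other
# memory-related fields (use_memory=True, mem_len=4, history_steps=4,
# test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15]) appear to control
# how many past frames are buffered during training vs. testing and at which travelled
# distances memory entries are selected.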
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_newsplit_5frame_span10_stage2_warmup/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_oldsplit/maptracker_av2_oldsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 1 num_iters_per_epoch = 27243 // (num_gpus * batch_size) num_epochs = 12 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, num_cams=num_cams, ), ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for 
evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
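# NOTE: the oldsplit configs differ from the newsplit ones in the annotation files
# (av2_map_infos_train.pkl / av2_map_infos_val.pkl instead of the *_newsplit.pkl files)
# and in the training-set size used for the schedule (27243 samples here vs. 29293 in
# the new split); otherwise this file follows the same stage-1 BEV-pretraining recipe
# as the newsplit configs above.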
find_unused_parameters = True  # NOTE: when using gradient checkpointing (with_cp=True), find_unused_parameters must be False
checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch)

runner = dict(
    type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch)

log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

SyncBN = True

================================================
FILE: plugin/configs/maptracker/av2_oldsplit/maptracker_av2_oldsplit_5frame_span10_stage2_warmup.py
================================================
_base_ = [
    '../../_base_/default_runtime.py'
]

# model type
type = 'Mapper'
plugin = True

# plugin code dir
plugin_dir = 'plugin/'

# img configs
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)

img_h = 608
img_w = 608
img_size = (img_h, img_w)

num_cams = 7

num_gpus = 8
batch_size = 6
num_iters_per_epoch = 27243 // (num_gpus * batch_size)
num_epochs = 3
num_epochs_interval = num_epochs
total_iters = num_epochs * num_iters_per_epoch

num_queries = 100

# category configs
cat2id = {
    'ped_crossing': 0,
    'divider': 1,
    'boundary': 2,
}
num_class = max(list(cat2id.values())) + 1

# bev configs
roi_size = (60, 30)  # bev range, 60m in x-axis, 30m in y-axis
bev_h = 50
bev_w = 100
pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5]

# vectorize params
coords_dim = 2
sample_dist = -1
sample_num = -1
simplify = True

# rasterize params (for temporal matching use)
canvas_size = (200, 100)  # bev feature size
thickness = 3  # thickness of rasterized polylines

# meta info for submission pkl
meta = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
    output_format='vector')

# model configs
bev_embed_dims = 256
embed_dims = 512
num_feat_levels = 3
norm_cfg = dict(type='BN2d')
num_class = max(list(cat2id.values()))+1
num_points = 20
permute = True

model = dict(
    type='MapTracker',
    roi_size=roi_size,
    bev_h=bev_h,
    bev_w=bev_w,
    history_steps=4,
    test_time_history_steps=20,
    mem_select_dist_ranges=[1, 5, 10, 15],
    skip_vector_head=False,
    freeze_bev=True,
    track_fp_aug=False,
    use_memory=True,
    mem_len=4,
    mem_warmup_iters=500,
    backbone_cfg=dict(
        type='BEVFormerBackbone',
        roi_size=roi_size,
        bev_h=bev_h,
        bev_w=bev_w,
        use_grid_mask=True,
        history_steps=4,
        img_backbone=dict(
            type='ResNet',
            with_cp=False,
            # pretrained='./resnet50_checkpoint.pth',
            pretrained='open-mmlab://detectron2/resnet50_caffe',
            # pretrained='torchvision://resnet18',
            depth=50,
            # depth=18,
            num_stages=4,
            out_indices=(1, 2, 3),
            frozen_stages=-1,
            norm_cfg=norm_cfg,
            norm_eval=True,
            style='caffe',
            dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
            stage_with_dcn=(False, False, True, True)
        ),
        img_neck=dict(
            type='FPN',
            in_channels=[512, 1024, 2048],
            # in_channels=[128, 256, 512],
            out_channels=bev_embed_dims,
            start_level=0,
            add_extra_convs=True,
            num_outs=num_feat_levels,
            norm_cfg=norm_cfg,
            relu_before_extra_convs=True),
        transformer=dict(
            type='PerceptionTransformer',
            embed_dims=bev_embed_dims,
            num_cams=num_cams,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=2,
                pc_range=pc_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=bev_embed_dims,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=bev_embed_dims,
                                num_points=8,
                                num_levels=num_feat_levels),
                            embed_dims=bev_embed_dims,
                            num_cams=num_cams,
                        ),
                    ],
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='PV_Map', img_shape=img_size, feat_down_sample=8, thickness=1, coords_dim=coords_dim, pv_mask=True, num_cams=num_cams, num_coords=3, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask', 'pv_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), 
dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
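# Note on this warmup stage: the BEV module is frozen (freeze_bev=True above)
# and min_lr_ratio=0.95, so the cosine schedule barely decays the learning
# rate from the base 5e-4 over the short 3-epoch run; the intent appears to be
# warming up the newly attached vector/tracking heads on top of the stage-1
# BEV features before the joint fine-tuning stage.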
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_oldsplit_5frame_span10_stage1_bev_pretrain/latest.pth' ================================================ FILE: plugin/configs/maptracker/av2_oldsplit/maptracker_av2_oldsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 608 img_w = 608 img_size = (img_h, img_w) num_cams = 7 num_gpus = 8 batch_size = 2 num_iters_per_epoch = 27243 // (num_gpus * batch_size) num_epochs = 20 num_epochs_interval = num_epochs // 5 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, num_cams=num_cams, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, 
num_cams=num_cams, ), ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img',], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 
'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) match_config = dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=4, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, interval=4, ), val=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), test=dict( type='AV2Dataset', ann_file='./datasets/av2/av2_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, interval=4, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
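# Rough effective learning rates implied by paramwise_cfg above (custom_keys
# are matched against parameter names by mmcv's default optimizer constructor):
#   backbone.img_backbone                      -> 5e-4 * 0.1 = 5e-5
#   backbone.img_neck / backbone.transformer /
#   backbone.positional_encoding / seg_decoder -> 5e-4 * 0.5 = 2.5e-4
#   all remaining parameters                   -> base lr 5e-4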
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = 'work_dirs/maptracker_av2_oldsplit_5frame_span10_stage2_warmup/latest.pth' ================================================ FILE: plugin/configs/maptracker/nuscenes_newsplit/maptracker_nusc_newsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_cams = 6 num_gpus = 8 batch_size = 3 num_iters_per_epoch = 27846 // (num_gpus * batch_size) num_epochs = 18 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims), ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for 
evaluation code # DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'ego2cam', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name', 'img_filenames', 'cam_intrinsics', 'cam_extrinsics', 'lidar2ego_translation', 'lidar2ego_rotation']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, eval_semantic=True, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
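# Schedule sketch for this stage: the lr warms up linearly from roughly
# warmup_ratio * base_lr = (1/3) * 5e-4 over the first 500 iterations, then
# follows cosine annealing down to about
# min_lr = min_lr_ratio * base_lr = 5e-2 * 5e-4 = 2.5e-5 by `max_iters`.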
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True ================================================ FILE: plugin/configs/maptracker/nuscenes_newsplit/maptracker_nusc_newsplit_5frame_span10_stage2_warmup.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_gpus = 8 batch_size = 8 num_iters_per_epoch = 27846 // (num_gpus * batch_size) num_epochs = 4 num_epochs_interval = num_epochs total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=True, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, use_grid_mask=True, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], #in_channels=[128, 256, 512], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, ) ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 
'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # 
DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=10, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
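# As in the corresponding AV2 warmup config, this stage keeps the learning
# rate nearly flat (min_lr_ratio=0.95) and starts from the stage-1 BEV
# pretraining checkpoint via `load_from` below; with freeze_bev=True, only the
# vector/tracking components are effectively being trained here.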
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = "work_dirs/maptracker_nusc_newsplit_5frame_span10_stage1_bev_pretrain/latest.pth" ================================================ FILE: plugin/configs/maptracker/nuscenes_newsplit/maptracker_nusc_newsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_gpus = 8 batch_size = 4 num_iters_per_epoch = 27846 // (num_gpus * batch_size) num_epochs = 36 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, use_grid_mask=True, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, ) ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 
'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val_newsplit.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
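# Stage 3 unfreezes the BEV backbone (freeze_bev=False) and fine-tunes the
# whole model for 36 epochs; the pretrained parts (image backbone, FPN neck,
# BEV transformer, positional encoding, segmentation decoder) use reduced lr
# multipliers, and mem_warmup_iters=-1 presumably means the tracking memory is
# active from the first iteration instead of after a warmup period.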
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = "work_dirs/maptracker_nusc_newsplit_5frame_span10_stage2_warmup/latest.pth" ================================================ FILE: plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage1_bev_pretrain.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_cams = 6 num_gpus = 8 batch_size = 1 num_iters_per_epoch = 27968 // (num_gpus * batch_size) num_epochs = 18 num_epochs_interval = num_epochs // 6 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=True, freeze_bev=False, track_fp_aug=False, use_memory=False, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, use_grid_mask=True, history_steps=4, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims), ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 
'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'ego2cam', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name', 'img_filenames', 'cam_intrinsics', 'cam_extrinsics', 'lidar2ego_translation', 'lidar2ego_rotation']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, eval_semantic=True, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, eval_semantic=True, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=5e-2) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
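# For this BEV-pretraining stage the val/test splits set eval_semantic=True,
# and the evaluation pipeline rasterizes the ground-truth map (RasterizeMap
# with semantic_mask=True), so both the vectorized elements and the rasterized
# semantic masks are collected for evaluation.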
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True ================================================ FILE: plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage2_warmup.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_gpus = 8 batch_size = 6 num_iters_per_epoch = 27968 // (num_gpus * batch_size) num_epochs = 4 num_epochs_interval = num_epochs total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=True, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=500, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, use_grid_mask=True, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, ) ], feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), 
), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( 
type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=0.95) # only slightly decay evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
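# NOTE: with the settings above (num_gpus=8, batch_size=6, num_epochs=4) this
# gives num_iters_per_epoch = 27968 // 48 = 582, so the eval/checkpoint interval
# is 4 * 582 = 2328 iterations, equal to max_iters; evaluation and checkpointing
# therefore effectively run once, at the end of this short warmup stage.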
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = "work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage1_bev_pretrain/latest.pth" ================================================ FILE: plugin/configs/maptracker/nuscenes_oldsplit/maptracker_nusc_oldsplit_5frame_span10_stage3_joint_finetune.py ================================================ _base_ = [ '../../_base_/default_runtime.py' ] # model type type = 'Mapper' plugin = True # plugin code dir plugin_dir = 'plugin/' [] # img configs img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) img_h = 480 img_w = 800 img_size = (img_h, img_w) num_gpus = 8 batch_size = 2 num_iters_per_epoch = 27968 // (num_gpus * batch_size) num_epochs = 48 num_epochs_interval = num_epochs // 8 total_iters = num_epochs * num_iters_per_epoch num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 # bev configs roi_size = (60, 30) # bev range, 60m in x-axis, 30m in y-axis bev_h = 50 bev_w = 100 pc_range = [-roi_size[0]/2, -roi_size[1]/2, -3, roi_size[0]/2, roi_size[1]/2, 5] # vectorize params coords_dim = 2 sample_dist = -1 sample_num = -1 simplify = True # rasterize params (for temporal matching use) canvas_size = (200, 100) # bev feature size thickness = 3 # thickness of rasterized polylines # meta info for submission pkl meta = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False, output_format='vector') # model configs bev_embed_dims = 256 embed_dims = 512 num_feat_levels = 3 norm_cfg = dict(type='BN2d') num_class = max(list(cat2id.values()))+1 num_points = 20 permute = True model = dict( type='MapTracker', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, test_time_history_steps=20, mem_select_dist_ranges=[1, 5, 10, 15], skip_vector_head=False, freeze_bev=False, track_fp_aug=False, use_memory=True, mem_len=4, mem_warmup_iters=-1, backbone_cfg=dict( type='BEVFormerBackbone', roi_size=roi_size, bev_h=bev_h, bev_w=bev_w, history_steps=4, use_grid_mask=True, img_backbone=dict( type='ResNet', with_cp=False, # pretrained='./resnet50_checkpoint.pth', pretrained='open-mmlab://detectron2/resnet50_caffe', depth=50, num_stages=4, out_indices=(1, 2, 3), frozen_stages=-1, norm_cfg=norm_cfg, norm_eval=True, style='caffe', dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True) ), img_neck=dict( type='FPN', in_channels=[512, 1024, 2048], out_channels=bev_embed_dims, start_level=0, add_extra_convs=True, num_outs=num_feat_levels, norm_cfg=norm_cfg, relu_before_extra_convs=True), transformer=dict( type='PerceptionTransformer', embed_dims=bev_embed_dims, encoder=dict( type='BEVFormerEncoder', num_layers=2, pc_range=pc_range, num_points_in_pillar=4, return_intermediate=False, transformerlayers=dict( type='BEVFormerLayer', attn_cfgs=[ dict( type='TemporalSelfAttention', embed_dims=bev_embed_dims, num_levels=1), dict( type='SpatialCrossAttention', deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=bev_embed_dims, num_points=8, num_levels=num_feat_levels), embed_dims=bev_embed_dims, ) ], 
feedforward_channels=bev_embed_dims*2, ffn_dropout=0.1, operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ), ), positional_encoding=dict( type='LearnedPositionalEncoding', num_feats=bev_embed_dims//2, row_num_embed=bev_h, col_num_embed=bev_w, ), ), head_cfg=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=bev_embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, trans_loss_weight=0.1, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, prop_add_stage=1, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, ## an addtional cross attention for vector memory fusion operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=5.0 ), loss_reg=dict( type='LinesL1Loss', loss_weight=50.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=5.0), reg_cost=dict(type='LinesL1Cost', weight=50.0, beta=0.01, permute=permute), ), ), ), seg_cfg=dict( type='MapSegHead', num_classes=num_class, in_channels=bev_embed_dims, embed_dims=bev_embed_dims, bev_size=(bev_w, bev_h), canvas_size=canvas_size, loss_seg=dict( type='MaskFocalLoss', use_sigmoid=True, loss_weight=10.0, ), loss_dice=dict( type='MaskDiceLoss', loss_weight=1.0, ) ), model_name='SingleStage' ) # data processing pipelines train_pipeline = [ dict( type='VectorizeMap', coords_dim=coords_dim, roi_size=roi_size, sample_num=num_points, normalize=True, permute=permute, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, semantic_mask=True, ), dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='PhotoMetricDistortionMultiViewImage'), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img', 'vectors', 'semantic_mask'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name')) ] # data processing pipelines test_pipeline = [ dict(type='LoadMultiViewImagesFromFiles', to_float32=True), dict(type='ResizeMultiViewImages', size=img_size, # H, W change_intrinsics=True, ), dict(type='Normalize3D', **img_norm_cfg), dict(type='PadMultiViewImages', size_divisor=32), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['img'], meta_keys=( 'token', 'ego2img', 'sample_idx', 'ego2global_translation', 
'ego2global_rotation', 'img_shape', 'scene_name')) ] # configs for evaluation code # DO NOT CHANGE eval_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=True, normalize=False, roi_size=roi_size ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors',], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) match_config = dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=[ dict( type='VectorizeMap', coords_dim=coords_dim, simplify=False, normalize=True, roi_size=roi_size, sample_num=num_points, ), dict( type='RasterizeMap', roi_size=roi_size, coords_dim=coords_dim, canvas_size=canvas_size, thickness=thickness, ), dict(type='FormatBundleMap'), dict(type='Collect3D', keys=['vectors', 'semantic_mask'], meta_keys=['token', 'ego2img', 'sample_idx', 'ego2global_translation', 'ego2global_rotation', 'img_shape', 'scene_name']) ], interval=1, ) # dataset configs data = dict( samples_per_gpu=batch_size, workers_per_gpu=8, train=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_train.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=train_pipeline, seq_split_num=-2, matching=True, multi_frame=5, sampling_span=10, ), val=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), test=dict( type='NuscDataset', data_root='./datasets/nuscenes', ann_file='./datasets/nuscenes/nuscenes_map_infos_val.pkl', meta=meta, roi_size=roi_size, cat2id=cat2id, pipeline=test_pipeline, eval_config=eval_config, test_mode=True, seq_split_num=1, ), shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler') ) # optimizer optimizer = dict( type='AdamW', lr=5e-4, paramwise_cfg=dict( custom_keys={ 'backbone.img_backbone': dict(lr_mult=0.1), 'backbone.img_neck': dict(lr_mult=0.5), 'backbone.transformer': dict(lr_mult=0.5), 'backbone.positional_encoding': dict(lr_mult=0.5), 'seg_decoder': dict(lr_mult=0.5), }), weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # learning policy & schedule lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=3e-3) evaluation = dict(interval=num_epochs_interval*num_iters_per_epoch) #evaluation = dict(interval=1) # for debugging use.. 
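# NOTE: here num_iters_per_epoch = 27968 // (8 * 2) = 1748, so max_iters is
# 48 * 1748 = 83904 and eval/checkpoints fire every 6 * 1748 = 10488 iterations.
# The paramwise_cfg above lowers the step size of the pretrained BEV modules
# during joint fine-tuning, e.g. 'backbone.img_backbone' trains at
# 0.1 * 5e-4 = 5e-5, while modules not listed keep the full base lr of 5e-4.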
find_unused_parameters = True #### when use checkpoint, find_unused_parameters must be False checkpoint_config = dict(interval=num_epochs_interval*num_iters_per_epoch) runner = dict( type='MyRunnerWrapper', max_iters=num_epochs * num_iters_per_epoch) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) SyncBN = True load_from = "work_dirs/maptracker_nusc_oldsplit_5frame_span10_stage2_warmup/latest.pth" ================================================ FILE: plugin/core/apis/__init__.py ================================================ from .train import custom_train_model from .mmdet_train import custom_train_detector # from .test import custom_multi_gpu_test ================================================ FILE: plugin/core/apis/mmdet_train.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import random import warnings import numpy as np import torch import torch.distributed as dist from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, IterBasedRunner, RUNNERS, Fp16OptimizerHook, OptimizerHook, build_optimizer, build_runner, get_dist_info) from mmcv.utils import build_from_cfg from mmdet.core import EvalHook from mmdet.datasets import (build_dataset, replace_ImageToTensor) from mmdet.utils import get_root_logger import time import os.path as osp from ...datasets.builder import build_dataloader from ..evaluation.eval_hooks import CustomDistEvalHook @RUNNERS.register_module() class MyRunnerWrapper(IterBasedRunner): def train(self, data_loader, **kwargs): self.model.module.num_iter = self._iter self.model.train() self.mode = 'train' self.data_loader = data_loader self._epoch = data_loader.epoch self.model.module.num_epoch = self._epoch data_batch = next(data_loader) self.call_hook('before_train_iter') outputs = self.model.train_step(data_batch, self.optimizer, **kwargs) if not isinstance(outputs, dict): raise TypeError('model.train_step() must return a dict') if 'log_vars' in outputs: self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) self.outputs = outputs self.call_hook('after_train_iter') self._inner_iter += 1 self._iter += 1 def custom_train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, eval_model=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] #assert len(dataset)==1s if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. 
' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), runner_type=cfg.runner, ) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) if eval_model is not None: eval_model = MMDistributedDataParallel( eval_model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) if eval_model is not None: eval_model = MMDataParallel( eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) if 'runner' not in cfg: cfg.runner = { 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs } warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) else: if 'total_epochs' in cfg: assert cfg.total_epochs == cfg.runner.max_epochs if eval_model is not None: runner = build_runner( cfg.runner, default_args=dict( model=model, eval_model=eval_model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) else: runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) # register profiler hook #trace_config = dict(type='tb_trace', dir_name='work_dir') #profiler_config = dict(on_trace_ready=trace_config) #runner.register_profiler_hook(profiler_config) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: assert False # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( 
val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), ) eval_cfg = cfg.get('evaluation', {}) #eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_cfg['by_epoch'] = not isinstance(runner, IterBasedRunner) eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) eval_hook = CustomDistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority='LOW') # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) ================================================ FILE: plugin/core/apis/test.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import os.path as osp import pickle import shutil import tempfile import time import mmcv import torch import torch.distributed as dist from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info from mmdet.core import encode_mask_results import mmcv import numpy as np import pycocotools.mask as mask_util def custom_encode_mask_results(mask_results): """Encode bitmap mask to RLE code. Semantic Masks only Args: mask_results (list | tuple[list]): bitmap mask results. In mask scoring rcnn, mask_results is a tuple of (segm_results, segm_cls_score). Returns: list | tuple: RLE encoded mask. """ cls_segms = mask_results num_classes = len(cls_segms) encoded_mask_results = [] for i in range(len(cls_segms)): encoded_mask_results.append( mask_util.encode( np.array( cls_segms[i][:, :, np.newaxis], order='F', dtype='uint8'))[0]) # encoded with RLE return [encoded_mask_results] def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: list: The prediction results. 
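    Example (sketch; assumes ``model`` is already wrapped in
    ``MMDistributedDataParallel`` and ``val_loader`` is a built dataloader, as
    done by ``CustomDistEvalHook._do_evaluate``):
        results = custom_multi_gpu_test(model, val_loader,
                                        tmpdir='./work_dir/.eval_hook',
                                        gpu_collect=False)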
""" model.eval() bbox_results = [] mask_results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) time.sleep(2) # This line can prevent deadlock problem in some cases. have_mask = False for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) # encode mask results if isinstance(result, dict): if 'bbox_results' in result.keys(): bbox_result = result['bbox_results'] batch_size = len(result['bbox_results']) bbox_results.extend(bbox_result) if 'mask_results' in result.keys() and result['mask_results'] is not None: mask_result = custom_encode_mask_results(result['mask_results']) mask_results.extend(mask_result) have_mask = True else: batch_size = len(result) bbox_results.extend(result) if rank == 0: for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks if gpu_collect: bbox_results = collect_results_gpu(bbox_results, len(dataset)) if have_mask: mask_results = collect_results_gpu(mask_results, len(dataset)) else: mask_results = None else: bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) tmpdir = tmpdir+'_mask' if tmpdir is not None else None if have_mask: mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) else: mask_results = None if mask_results is None: return bbox_results return {'bbox_results': bbox_results, 'mask_results': mask_results} def collect_results_cpu(result_part, size, tmpdir=None): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: mmcv.mkdir_or_exist('.dist_test') tmpdir = tempfile.mkdtemp(dir='.dist_test') tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, f'part_{i}.pkl') part_list.append(mmcv.load(part_file)) # sort the results ordered_results = [] ''' bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, ''' #for res in zip(*part_list): for res in part_list: ordered_results.extend(list(res)) # the dataloader may pad some samples print(f'\ntruncate {size} samples from {len(ordered_results)}') ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def collect_results_gpu(result_part, size): collect_results_cpu(result_part, size) ================================================ FILE: plugin/core/apis/train.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- from .mmdet_train import custom_train_detector from mmseg.apis import train_segmentor from mmdet.apis import train_detector def custom_train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, eval_model=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. """ if cfg.model.type in ['EncoderDecoder3D']: assert False else: custom_train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, eval_model=eval_model, meta=meta) def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. """ if cfg.model.type in ['EncoderDecoder3D']: train_segmentor( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) else: train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) ================================================ FILE: plugin/core/evaluation/__init__.py ================================================ from .eval_hooks import CustomDistEvalHook ================================================ FILE: plugin/core/evaluation/eval_hooks.py ================================================ # Note: Considering that MMCV's EvalHook updated its interface in V1.3.16, # in order to avoid strong version dependency, we did not directly # inherit EvalHook but BaseDistEvalHook. 
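# The hook below keeps BaseDistEvalHook's behaviour but additionally accepts a
# `dynamic_intervals` list of (milestone, interval) tuples, so the evaluation
# frequency can change as training progresses (see _calc_dynamic_intervals and
# _decide_interval below).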
import bisect import os.path as osp import mmcv import torch.distributed as dist from mmcv.runner import DistEvalHook as BaseDistEvalHook from mmcv.runner import EvalHook as BaseEvalHook from torch.nn.modules.batchnorm import _BatchNorm from mmdet.core.evaluation.eval_hooks import DistEvalHook def _calc_dynamic_intervals(start_interval, dynamic_interval_list): assert mmcv.is_list_of(dynamic_interval_list, tuple) dynamic_milestones = [0] dynamic_milestones.extend( [dynamic_interval[0] for dynamic_interval in dynamic_interval_list]) dynamic_intervals = [start_interval] dynamic_intervals.extend( [dynamic_interval[1] for dynamic_interval in dynamic_interval_list]) return dynamic_milestones, dynamic_intervals class CustomDistEvalHook(BaseDistEvalHook): def __init__(self, *args, dynamic_intervals=None, **kwargs): super(CustomDistEvalHook, self).__init__(*args, **kwargs) self.use_dynamic_intervals = dynamic_intervals is not None if self.use_dynamic_intervals: self.dynamic_milestones, self.dynamic_intervals = \ _calc_dynamic_intervals(self.interval, dynamic_intervals) def _decide_interval(self, runner): if self.use_dynamic_intervals: progress = runner.epoch if self.by_epoch else runner.iter step = bisect.bisect(self.dynamic_milestones, (progress + 1)) # Dynamically modify the evaluation interval self.interval = self.dynamic_intervals[step - 1] def before_train_epoch(self, runner): """Evaluate the model only at the start of training by epoch.""" self._decide_interval(runner) super().before_train_epoch(runner) def before_train_iter(self, runner): self._decide_interval(runner) super().before_train_iter(runner) def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" # Synchronization of BatchNorm's buffer (running_mean # and running_var) is not supported in the DDP of pytorch, # which may cause the inconsistent performance of models in # different ranks, so we broadcast BatchNorm's buffers # of rank 0 to other ranks to avoid this. if self.broadcast_bn_buffer: model = runner.model for name, module in model.named_modules(): if isinstance(module, _BatchNorm) and module.track_running_stats: dist.broadcast(module.running_var, 0) dist.broadcast(module.running_mean, 0) if not self._should_evaluate(runner): return tmpdir = self.tmpdir if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') from ..apis.test import custom_multi_gpu_test # to solve circlur import results = custom_multi_gpu_test( runner.model, self.dataloader, tmpdir=tmpdir, gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) ================================================ FILE: plugin/datasets/__init__.py ================================================ from .pipelines import * from .argo_dataset import AV2Dataset from .nusc_dataset import NuscDataset ================================================ FILE: plugin/datasets/argo_dataset.py ================================================ from .base_dataset import BaseMapDataset from .map_utils.av2map_extractor import AV2MapExtractor from mmdet.datasets import DATASETS import numpy as np from .visualize.renderer import Renderer from time import time import mmcv from pyquaternion import Quaternion import pickle import os @DATASETS.register_module() class AV2Dataset(BaseMapDataset): """Argoverse2 map dataset class. 
Args: ann_file (str): annotation file path cat2id (dict): category to class id roi_size (tuple): bev range eval_config (Config): evaluation config meta (dict): meta information pipeline (Config): data processing pipeline config, interval (int): annotation load interval work_dir (str): path to work dir test_mode (bool): whether in test mode """ def __init__(self, **kwargs,): super().__init__(**kwargs) self.map_extractor = AV2MapExtractor(self.roi_size, self.id2map) self.renderer = Renderer(self.cat2id, self.roi_size, 'av2') def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations. """ start_time = time() ann = mmcv.load(ann_file) self.id2map = ann['id2map'] samples = ann['samples'] if 'newsplit' not in ann_file: if 'val' in ann_file: # For the old split testing, we make sure that the test set matches exactly with the MapTR codebase # NOTE: simply sort&sampling will produce slightly different results compared to MapTR's samples # so we have to directly use the saved meta information from MapTR codebase to get the samples maptr_meta_path = os.path.join(os.path.dirname(ann_file), 'maptrv2_val_samples_info.pkl') with open(maptr_meta_path, 'rb') as f: maptr_meta = pickle.load(f) maptr_unique_tokens = [x['token'] for x in maptr_meta['samples_meta']] unique_token2samples = {} for sample in samples: unique_token2samples[f'{sample["log_id"]}_{sample["token"]}'] = sample samples = [unique_token2samples[x] for x in maptr_unique_tokens] else: # For the old split training, we follow MapTR's data loading, which # sorts the samples based on the token, then do sub-sampling samples = list(sorted(samples, key=lambda e: e['token'])) samples = samples[::self.interval] else: # For the new split, we simply follow StreamMapNet, do not sort based on the token # In this way, the intervals between consecutive frames are uniform... samples = samples[::self.interval] # Since the sorted order copied from MapTR does not strictly enforce that # samples of the same scene are consecutive, need to re-arrange scene_name2idx = {} for idx, sample in enumerate(samples): scene = sample['log_id'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2idx[scene].append(idx) samples_rearrange = [] for scene_name in scene_name2idx: scene_sample_ids = scene_name2idx[scene_name] for sample_id in scene_sample_ids: samples_rearrange.append(samples[sample_id]) samples = samples_rearrange print(f'collected {len(samples)} samples in {(time() - start_time):.2f}s') self.samples = samples def load_matching(self, matching_file): with open(matching_file, 'rb') as pf: data = pickle.load(pf) total_samples = 0 for scene_name, info in data.items(): total_samples += len(info['sample_ids']) assert total_samples == len(self.samples), 'Matching info not matched with data samples' self.matching_meta = data print(f'loaded matching meta for {len(data)} scenes') def get_sample(self, idx): """Get data sample. For each sample, map extractor will be applied to extract map elements. 
Args: idx (int): data index Returns: result (dict): dict of input """ sample = self.samples[idx] log_id = sample['log_id'] map_geoms = self.map_extractor.get_map_geom(log_id, sample['e2g_translation'], sample['e2g_rotation']) map_label2geom = {} for k, v in map_geoms.items(): if k in self.cat2id.keys(): map_label2geom[self.cat2id[k]] = v ego2img_rts = [] for c in sample['cams'].values(): extrinsic, intrinsic = np.array( c['extrinsics']), np.array(c['intrinsics']) ego2cam_rt = extrinsic viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic ego2cam_rt = (viewpad @ ego2cam_rt) ego2img_rts.append(ego2cam_rt) # pdb.set_trace() input_dict = { 'token': sample['token'], 'img_filenames': [c['img_fpath'] for c in sample['cams'].values()], # intrinsics are 3x3 Ks 'cam_intrinsics': [c['intrinsics'] for c in sample['cams'].values()], # extrinsics are 4x4 tranform matrix, NOTE: **ego2cam** 'cam_extrinsics': [c['extrinsics'] for c in sample['cams'].values()], 'ego2img': ego2img_rts, 'map_geoms': map_label2geom, # {0: List[ped_crossing(LineString)], 1: ...} 'ego2global_translation': sample['e2g_translation'], 'ego2global_rotation': sample['e2g_rotation'].tolist(), 'sample_idx': sample['modified_sample_idx'], 'scene_name': sample['scene_name'], 'lidar_path': sample['lidar_fpath'] } return input_dict ================================================ FILE: plugin/datasets/base_dataset.py ================================================ import numpy as np import os import os.path as osp import mmcv from .evaluation.raster_eval import RasterEvaluate from .evaluation.vector_eval import VectorEvaluate from mmdet3d.datasets.pipelines import Compose from mmdet.datasets import DATASETS from torch.utils.data import Dataset from mmcv.parallel import DataContainer as DC import warnings import pickle warnings.filterwarnings("ignore") @DATASETS.register_module() class BaseMapDataset(Dataset): """Map dataset base class. 
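    Besides the arguments listed below, the constructor also accepts
    seq_split_num (int), multi_frame (int | bool), sampling_span (int),
    matching (bool) and eval_semantic (bool), which control sequence grouping,
    multi-frame sampling and ground-truth track matching (see __init__).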
Args: ann_file (str): annotation file path cat2id (dict): category to class id roi_size (tuple): bev range eval_config (Config): evaluation config meta (dict): meta information pipeline (Config): data processing pipeline config, interval (int): annotation load interval work_dir (str): path to work dir test_mode (bool): whether in test mode """ def __init__(self, ann_file, cat2id, roi_size, meta, pipeline, interval=1, seq_split_num=1, work_dir=None, eval_config=None, test_mode=False, multi_frame=False, sampling_span=10, matching=False, eval_semantic=False, ): super().__init__() self.ann_file = ann_file self.multi_frame = multi_frame self.sampling_span = sampling_span self.matching = matching self.meta = meta self.classes = list(cat2id.keys()) self.num_classes = len(self.classes) self.cat2id = cat2id self.interval = interval self.seq_split_num = seq_split_num self.eval_semantic = eval_semantic self.load_annotations(self.ann_file) if matching: assert self.multi_frame, 'The matching info has to loaded under the multi-frame setting' self.matching_file = ann_file[:-4] + '_gt_tracks.pkl' assert os.path.isfile(self.matching_file) self.load_matching(self.matching_file) self.idx2token = {} for i, s in enumerate(self.samples): self.idx2token[i] = s['token'] self.token2idx = {v: k for k, v in self.idx2token.items()} if pipeline is not None: self.pipeline = Compose(pipeline) else: self.pipeline = None # dummy flags to fit with mmdet dataset self.flag = np.zeros(len(self), dtype=np.uint8) self.roi_size = roi_size self.work_dir = work_dir self.eval_config = eval_config if self.eval_config is not None: assert test_mode, "eval_config is valid only in test_mode" # record the sequence information, prepare for two-frame data loading self._set_sequence_info() self._set_sequence_group_flag() def _set_sequence_info(self): """Compute and record the sequence id and local index of each sample """ scene_name2idx = {} for idx, sample in enumerate(self.samples): self.samples[idx]['modified_sample_idx'] = idx scene = sample['scene_name'] if scene not in scene_name2idx: scene_name2idx[scene] = [] self.samples[idx]['prev'] = -1 scene_name2idx[scene].append(idx) self.scene_name2idx = scene_name2idx print('Prepare sequence information for {}'.format(self.ann_file)) idx2scene = {} for scene_name, scene_info in scene_name2idx.items(): for local_idx, global_idx in enumerate(scene_info): idx2scene[global_idx] = (scene_name, local_idx, len(scene_info)) self.idx2scene = idx2scene def _set_sequence_group_flag(self): """ Set each sequence to be a different group """ if self.seq_split_num == -1: self.flag = np.arange(len(self.samples)) return elif self.seq_split_num == -2: return res = [] curr_sequence = -1 for idx in range(len(self.samples)): if self.samples[idx]['prev'] == -1: # new sequence curr_sequence += 1 res.append(curr_sequence) self.flag = np.array(res, dtype=np.int64) if self.seq_split_num != 1: bin_counts = np.bincount(self.flag) new_flags = [] curr_new_flag = 0 for curr_flag in range(len(bin_counts)): seq_length = int(round(bin_counts[curr_flag] / self.seq_split_num)) curr_sequence_length = list(range(0, bin_counts[curr_flag], seq_length)) + [bin_counts[curr_flag]] # if left one sample, put it into the last sequence if curr_sequence_length[-1] - curr_sequence_length[-2] <= 1: curr_sequence_length = curr_sequence_length[:-2] + [curr_sequence_length[-1]] curr_sequence_length = np.array(curr_sequence_length) for sub_seq_idx in (curr_sequence_length[1:] - curr_sequence_length[:-1]): for _ in range(sub_seq_idx): 
new_flags.append(curr_new_flag) curr_new_flag += 1 assert len(new_flags) == len(self.flag) # assert len(np.bincount(new_flags)) == len(np.bincount(self.flag)) * self.seq_split_num self.flag = np.array(new_flags, dtype=np.int64) def load_annotations(self, ann_file): raise NotImplementedError def load_matching(self, matching_file): raise NotImplementedError def get_sample(self, idx): raise NotImplementedError def format_results(self, results, denormalize=True, prefix=None, save_semantic=False): '''Format prediction result to submission format. Args: results (list[Tensor]): List of prediction results. denormalize (bool): whether to denormalize prediction from (0, 1) \ to bev range. Default: True prefix (str): work dir prefix to save submission file. Returns: dict: Evaluation results ''' meta = self.meta output_format = meta['output_format'] submissions = { 'meta': meta, 'results': {}, } if output_format == 'raster': for pred in results: single_case = {} token = pred['token'] pred_map = pred['semantic_mask'] pred_bool = pred_map > 0 single_case['semantic_mask'] = pred_bool.bool() submissions['results'][token] = single_case # Use pickle format to minimize submission file size. out_path = osp.join(prefix, 'submission_raster.pkl') print(f'saving submissions results to {out_path}') os.makedirs(os.path.dirname(out_path), exist_ok=True) mmcv.dump(submissions, out_path) return out_path elif output_format == 'vector': all_pos_results = [] for pred in results: ''' For each case, the result should be formatted as Dict{'vectors': [], 'scores': [], 'labels': []} 'vectors': List of vector, each vector is a array([[x1, y1], [x2, y2] ...]), contain all vectors predicted in this sample. 'scores: List of score(float), contain scores of all instances in this sample. 'labels': List of label(int), contain labels of all instances in this sample. 
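            For instance, a single entry could look like (illustrative values):
                {'vectors': [array([[x1, y1], [x2, y2], ...]), ...],
                 'scores': [0.87, ...], 'labels': [2, ...]}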
''' if pred is None: # empty prediction continue single_case = {'vectors': [], 'scores': [], 'labels': [], 'props': [], 'track_vectors': [], 'track_scores': [], 'track_labels': []} token = pred['token'] roi_size = np.array(self.roi_size) origin = -np.array([self.roi_size[0]/2, self.roi_size[1]/2]) # save the extra semantic info if save_semantic: single_case['semantic_mask'] = pred['semantic_mask'].tolist() if 'scores' in pred: for i in range(len(pred['scores'])): score = pred['scores'][i] label = pred['labels'][i] vector = pred['vectors'][i] prop = pred['props'][i] # A line should have >=2 points if len(vector) < 2: continue if denormalize: eps = 1e-5 vector = vector * (roi_size + eps) + origin single_case['vectors'].append(vector) single_case['scores'].append(score) single_case['labels'].append(label) single_case['props'].append(prop) if 'track_scores' in pred: # also save the tracking information for analyzing for i in range(len(pred['track_scores'])): score = pred['track_scores'][i] label = pred['track_labels'][i] vector = pred['track_vectors'][i] if denormalize: eps = 1e-5 vector = vector * (roi_size + eps) + origin single_case['track_vectors'].append(vector) single_case['track_scores'].append(score) single_case['track_labels'].append(label) submissions['results'][token] = single_case if not self.eval_semantic: pos_results = pred['pos_results'] pos_vectors = pos_results['vectors'] if denormalize and len(pos_vectors) > 0: pos_vectors = pos_vectors.reshape(pos_vectors.shape[0], -1, 2) pos_vectors = (pos_vectors * roi_size + origin).reshape(pos_vectors.shape[0], -1) save_pos_results = { 'vectors': pos_vectors, 'labels': pos_results['labels'], 'scores': pos_results['scores'], 'scene_name': pos_results['scene_name'], 'local_idx': pos_results['local_idx'], 'global_ids': pos_results['global_ids'], 'meta': pred['meta'] } all_pos_results.append(save_pos_results) out_path = osp.join(prefix, 'submission_vector.json') print(f'saving submissions results to {out_path}') os.makedirs(os.path.dirname(out_path), exist_ok=True) mmcv.dump(submissions, out_path) if not self.eval_semantic: out_path_pos = osp.join(prefix, 'pos_predictions.pkl') with open(out_path_pos, 'wb') as f: pickle.dump(all_pos_results, f, protocol=pickle.HIGHEST_PROTOCOL) return out_path else: raise ValueError("output format must be either \'raster\' or \'vector\'") def evaluate(self, results, logger=None, **kwargs): '''Evaluate prediction result based on `output_format` specified by dataset. Args: results (list[Tensor]): List of prediction results. logger (logger): logger to print evaluation results. Returns: dict: Evaluation results. ''' print('len of the results', len(results)) eval_semantic = True if (hasattr(self, 'eval_semantic') and self.eval_semantic) else False save_semantic = True if 'save_semantic' in kwargs and kwargs['save_semantic'] or eval_semantic \ else False result_path = self.format_results(results, denormalize=True, prefix=self.work_dir, save_semantic=save_semantic) return self._evaluate(result_path, logger=logger, eval_semantic=eval_semantic) def _evaluate(self, result_path, logger=None, eval_semantic=False): if not eval_semantic: self.evaluator = VectorEvaluate(self.eval_config) else: self.evaluator = RasterEvaluate(self.eval_config) result_dict = self.evaluator.evaluate(result_path, logger=logger) return result_dict def show_gt(self, idx, out_dir='demo/'): '''Visualize ground-truth. Args: idx (int): index of sample. out_dir (str): output directory. 
''' from mmcv.parallel import DataContainer from copy import deepcopy sample = self.get_sample(idx) sample = deepcopy(sample) data = self.pipeline(sample) #imgs = [mmcv.imread(i) for i in sample['img_filenames']] #cam_extrinsics = sample['cam_extrinsics'] #cam_intrinsics = sample['cam_intrinsics'] if 'vectors' in data: vectors = data['vectors'] if isinstance(vectors, DataContainer): vectors = vectors.data self.renderer.render_bev_from_vectors(vectors, out_dir) #self.renderer.render_camera_views_from_vectors(vectors, imgs, # cam_extrinsics, cam_intrinsics, 2, out_dir) if 'semantic_mask' in data: semantic_mask = data['semantic_mask'] if isinstance(semantic_mask, DataContainer): semantic_mask = semantic_mask.data self.renderer.render_bev_from_mask(semantic_mask, out_dir, flip=True) def show_result(self, submission, idx, score_thr=0, draw_score=False, show_semantic=False, out_dir='demo/'): '''Visualize prediction result. Args: idx (int): index of sample. submission (dict): prediction results. score_thr (float): threshold to filter prediction results. out_dir (str): output directory. ''' meta = submission['meta'] output_format = meta['output_format'] token = self.idx2token[idx] results = submission['results'][token] sample = self.get_sample(idx) if 'semantic_mask' in results and show_semantic: semantic_mask = np.array(results['semantic_mask']) self.renderer.render_bev_from_mask(semantic_mask, out_dir, flip=False) if output_format == 'vector' and 'scores' in results: vectors = {label: [] for label in self.cat2id.values()} for i in range(len(results['labels'])): score = results['scores'][i] label = results['labels'][i] prop = results['props'][i] v = results['vectors'][i] if score > score_thr: if draw_score: vectors[label].append((v, score, prop)) else: vectors[label].append(v) self.renderer.render_bev_from_vectors(vectors, out_dir, draw_scores=draw_score) # For projecting and visualizing results on perspective images #imgs = [mmcv.imread(i) for i in sample['img_filenames']] #cam_extrinsics = sample['cam_extrinsics'] #cam_intrinsics = sample['cam_intrinsics'] # self.renderer.render_camera_views_from_vectors(vectors, imgs, # cam_extrinsics, cam_intrinsics, 2, out_dir) def show_track(self, submission, idx, out_dir='demo/'): '''Visualize prediction result. Args: idx (int): index of sample. submission (dict): prediction results. score_thr (float): threshold to filter prediction results. out_dir (str): output directory. ''' meta = submission['meta'] token = self.idx2token[idx] results = submission['results'][token] vectors = {label: [] for label in self.cat2id.values()} for i in range(len(results['track_labels'])): score = results['track_scores'][i] label = results['track_labels'][i] v = results['track_vectors'][i] vectors[label].append((v, score, 1)) self.renderer.render_bev_from_vectors(vectors, out_dir, draw_scores=True) def __len__(self): """Return the length of data infos. Returns: int: Length of data infos. """ return len(self.samples) def _rand_another(self, idx): """Randomly get another item. Returns: int: Another index of item. """ return np.random.choice(self.__len__) def __getitem__(self, idx): """Get item from infos according to the given index. Returns: dict: Data dictionary of the corresponding index. 
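        The dict always includes a 'seq_info' DataContainer holding
        (scene_name, local_idx, scene_length). When multi_frame is enabled it
        additionally contains 'all_prev_data' (pipeline outputs of the sampled
        previous frames) and 'all_local2global_info' (per-frame instance-id
        mappings, populated only when matching is on).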
""" input_dict = self.get_sample(idx) data = self.pipeline(input_dict) # prepare the local sequence index info seq_info = self.idx2scene[idx] data['seq_info'] = DC(seq_info, cpu_only=True) if self.multi_frame: # used when sampling multi-frame training data scene_name = input_dict['scene_name'] scene_seq_info = self.scene_name2idx[scene_name] local_idx_curr = input_dict['sample_idx'] - scene_seq_info[0] span = max(self.sampling_span, self.multi_frame) min_idx = local_idx_curr - span sampled_indices = np.random.choice(span, self.multi_frame-1, replace=False).tolist() sampled_indices = sorted(sampled_indices) local_indices_prev = [min_idx + x for x in sampled_indices] local_indices_prev = [x if x>=0 else 0 for x in local_indices_prev] data['img_metas'].data['local_idx'] = local_idx_curr global_indices_prev = [local_idx + scene_seq_info[0] for local_idx in local_indices_prev] all_prev_data = [] for idx, global_idx_prev in enumerate(global_indices_prev): input_dict_prev = self.get_sample(global_idx_prev) data_prev = self.pipeline(input_dict_prev) local_idx_prev = local_indices_prev[idx] data_prev['img_metas'].data['local_idx'] = local_idx_prev all_prev_data.append(data_prev) all_local2global_info = [] if self.matching: scene_matching_info = self.matching_meta[scene_name] for local_idx_prev in local_indices_prev: prev_local2global = DC(scene_matching_info['instance_ids'][local_idx_prev], cpu_only=True) all_local2global_info.append(prev_local2global) curr_local2global = DC(scene_matching_info['instance_ids'][local_idx_curr], cpu_only=True) all_local2global_info.append(curr_local2global) data['all_prev_data'] = all_prev_data data['all_local2global_info'] = all_local2global_info return data ================================================ FILE: plugin/datasets/builder.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import copy import platform import random from functools import partial import numpy as np from mmcv.parallel import collate from mmcv.runner import get_dist_info from mmcv.utils import Registry, build_from_cfg from torch.utils.data import DataLoader from mmdet.datasets.samplers import GroupSampler from .samplers.group_sampler import DistributedGroupSampler from .samplers.distributed_sampler import DistributedSampler from .samplers.group_sampler import InfiniteGroupEachSampleInBatchSampler from .samplers.sampler import build_sampler def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, seed=None, shuffler_sampler=None, nonshuffler_sampler=None, runner_type=dict(type='EpochBasedRunner'), **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. 
""" rank, world_size = get_dist_info() if dist: # DistributedGroupSampler will definitely shuffle the data to satisfy # that images on each GPU are in the same group if shuffle: sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'), dict( dataset=dataset, samples_per_gpu=samples_per_gpu, num_replicas=world_size, rank=rank, seed=seed) ) else: sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'), dict( dataset=dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed) ) batch_size = samples_per_gpu num_workers = workers_per_gpu batch_sampler = None else: # assert False, 'not support in bevformer' # print('WARNING!!!!, Only can be used for obtain inference speed!!!!') sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu batch_sampler = None # True entry here!!! if runner_type['type'] == 'IterBasedRunner' and shuffler_sampler['type'] =='InfiniteGroupEachSampleInBatchSampler': # TODO: original has more options, but I'm not using them # https://github.com/open-mmlab/mmdetection/blob/3b72b12fe9b14de906d1363982b9fba05e7d47c1/mmdet/datasets/builder.py#L145-L157 batch_sampler = build_sampler(shuffler_sampler, dict( dataset=dataset, samples_per_gpu=samples_per_gpu, num_replicas=world_size, rank=rank, seed=seed) ) batch_size = 1 # Since we have batch sampler, the batch_size must = 1 sampler = None init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None data_loader = DataLoader( dataset, batch_size=batch_size, sampler=sampler, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), pin_memory=False, worker_init_fn=init_fn, **kwargs) return data_loader def worker_init_fn(worker_id, num_workers, rank, seed): # The seed of each worker equals to # num_worker * rank + worker_id + user_seed worker_seed = num_workers * rank + worker_id + seed np.random.seed(worker_seed) random.seed(worker_seed) # Copyright (c) OpenMMLab. All rights reserved. 
# import platform
# from mmcv.utils import Registry, build_from_cfg
# from mmdet.datasets import DATASETS
# from mmdet.datasets.builder import _concat_dataset

# if platform.system() != 'Windows':
#     # https://github.com/pytorch/pytorch/issues/973
#     import resource
#     rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
#     base_soft_limit = rlimit[0]
#     hard_limit = rlimit[1]
#     soft_limit = min(max(4096, base_soft_limit), hard_limit)
#     resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))

# OBJECTSAMPLERS = Registry('Object sampler')

# def custom_build_dataset(cfg, default_args=None):
#     from mmdet3d.datasets.dataset_wrappers import CBGSDataset
#     from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
#                                                  ConcatDataset, RepeatDataset)

#     if isinstance(cfg, (list, tuple)):
#         dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg])
#     elif cfg['type'] == 'ConcatDataset':
#         dataset = ConcatDataset(
#             [custom_build_dataset(c, default_args) for c in cfg['datasets']],
#             cfg.get('separate_eval', True))
#     elif cfg['type'] == 'RepeatDataset':
#         dataset = RepeatDataset(
#             custom_build_dataset(cfg['dataset'], default_args), cfg['times'])
#     elif cfg['type'] == 'ClassBalancedDataset':
#         dataset = ClassBalancedDataset(
#             custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
#     elif cfg['type'] == 'CBGSDataset':
#         dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args))
#     elif isinstance(cfg.get('ann_file'), (list, tuple)):
#         dataset = _concat_dataset(cfg, default_args)
#     else:
#         dataset = build_from_cfg(cfg, DATASETS, default_args)

#     return dataset



================================================
FILE: plugin/datasets/evaluation/AP.py
================================================
import numpy as np
from .distance import chamfer_distance, frechet_distance, chamfer_distance_batch
from typing import List, Tuple, Union
from numpy.typing import NDArray
import torch


def average_precision(recalls, precisions, mode='area'):
    """Calculate average precision.

    Args:
        recalls (ndarray): shape (num_dets, )
        precisions (ndarray): shape (num_dets, )
        mode (str): 'area' or '11points', 'area' means calculating the area
            under precision-recall curve, '11points' means calculating the
            average precision of recalls at [0, 0.1, ..., 1]

    Returns:
        float: calculated average precision
    """
    recalls = recalls[np.newaxis, :]
    precisions = precisions[np.newaxis, :]
    assert recalls.shape == precisions.shape and recalls.ndim == 2

    num_scales = recalls.shape[0]
    ap = 0.
    if mode == 'area':
        zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
        ones = np.ones((num_scales, 1), dtype=recalls.dtype)
        mrec = np.hstack((zeros, recalls, ones))
        mpre = np.hstack((zeros, precisions, zeros))
        for i in range(mpre.shape[1] - 1, 0, -1):
            mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
        ind = np.where(mrec[0, 1:] != mrec[0, :-1])[0]
        ap = np.sum(
            (mrec[0, ind + 1] - mrec[0, ind]) * mpre[0, ind + 1])
    elif mode == '11points':
        for thr in np.arange(0, 1 + 1e-3, 0.1):
            # recalls/precisions are reshaped to a single row above, so index row 0
            precs = precisions[0, recalls[0, :] >= thr]
            prec = precs.max() if precs.size > 0 else 0
            ap += prec
        ap /= 11
    else:
        raise ValueError(
            'Unrecognized mode, only "area" and "11points" are supported')
    return ap
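# A small worked example of `average_precision` in 'area' mode; the recall and
# precision values are made-up numbers (4 detections against 5 GT instances),
# purely for illustration:
#
#   recalls    = np.array([0.2, 0.4, 0.4, 0.6])
#   precisions = np.array([1.0, 1.0, 0.67, 0.75])
#   average_precision(recalls, precisions, mode='area')
#
# The right-to-left maximum turns the precisions into the envelope
# [1.0, 1.0, 0.75, 0.75]; integrating it over the recall steps gives
# 0.2*1.0 + 0.2*1.0 + 0.2*0.75 + 0.4*0.0 = 0.55
# (the last 0.4 of recall is padded with precision 0).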
def instance_match(pred_lines: NDArray,
                   scores: NDArray,
                   gt_lines: NDArray,
                   thresholds: Union[Tuple, List],
                   metric: str = 'chamfer') -> List:
    """Compute whether detected lines are true positive or false positive.

    Args:
        pred_lines (array): Detected lines of a sample, of shape
            (M, INTERP_NUM, 2 or 3).
        scores (array): Confidence score of each line, of shape (M, ).
        gt_lines (array): GT lines of a sample, of shape
            (N, INTERP_NUM, 2 or 3).
        thresholds (list of tuple): List of thresholds.
        metric (str): Distance function for lines matching. Default: 'chamfer'.

    Returns:
        list_of_tp_fp (list): tp-fp matching result at all thresholds
    """
    if metric == 'chamfer':
        distance_fn = chamfer_distance
    elif metric == 'frechet':
        distance_fn = frechet_distance
    else:
        raise ValueError(f'unknown distance function {metric}')

    num_preds = pred_lines.shape[0]
    num_gts = gt_lines.shape[0]

    # tp and fp
    tp_fp_list = []
    tp = np.zeros((num_preds), dtype=np.float32)
    fp = np.zeros((num_preds), dtype=np.float32)

    # if there is no gt lines in this sample, then all pred lines are false positives
    if num_gts == 0:
        fp[...] = 1
        for thr in thresholds:
            tp_fp_list.append((tp.copy(), fp.copy()))
        return tp_fp_list

    if num_preds == 0:
        for thr in thresholds:
            tp_fp_list.append((tp.copy(), fp.copy()))
        return tp_fp_list

    assert pred_lines.shape[1] == gt_lines.shape[1], \
        "sample points num should be the same"

    # distance matrix: M x N
    matrix = np.zeros((num_preds, num_gts))
    # for i in range(num_preds):
    #     for j in range(num_gts):
    #         matrix[i, j] = distance_fn(pred_lines[i], gt_lines[j])
    matrix = chamfer_distance_batch(pred_lines, gt_lines)

    # for each det, the min distance with all gts
    matrix_min = matrix.min(axis=1)
    # for each det, which gt is the closest to it
    matrix_argmin = matrix.argmin(axis=1)
    # sort all dets in descending order by scores
    sort_inds = np.argsort(-scores)

    # match under different thresholds
    for thr in thresholds:
        tp = np.zeros((num_preds), dtype=np.float32)
        fp = np.zeros((num_preds), dtype=np.float32)

        gt_covered = np.zeros(num_gts, dtype=bool)
        for i in sort_inds:
            if matrix_min[i] <= thr:
                matched_gt = matrix_argmin[i]
                if not gt_covered[matched_gt]:
                    gt_covered[matched_gt] = True
                    tp[i] = 1
                else:
                    fp[i] = 1
            else:
                fp[i] = 1

        tp_fp_list.append((tp, fp))

    return tp_fp_list



================================================
FILE: plugin/datasets/evaluation/__init__.py
================================================



================================================
FILE: plugin/datasets/evaluation/distance.py
================================================
from scipy.spatial import distance
from numpy.typing import NDArray
import torch


def chamfer_distance(line1: NDArray, line2: NDArray) -> float:
    ''' Calculate chamfer distance between two lines. Make sure the lines are interpolated.

    Args:
        line1 (array): coordinates of line1
        line2 (array): coordinates of line2

    Returns:
        distance (float): chamfer distance
    '''
    dist_matrix = distance.cdist(line1, line2, 'euclidean')
    dist12 = dist_matrix.min(-1).sum() / len(line1)
    dist21 = dist_matrix.min(-2).sum() / len(line2)

    return (dist12 + dist21) / 2


def frechet_distance(line1: NDArray, line2: NDArray) -> float:
    ''' Calculate frechet distance between two lines. Make sure the lines are interpolated.

    Args:
        line1 (array): coordinates of line1
        line2 (array): coordinates of line2

    Returns:
        distance (float): frechet distance
    '''
    raise NotImplementedError


def chamfer_distance_batch(pred_lines, gt_lines):
    ''' Calculate chamfer distance between two group of lines.
    Make sure the lines are interpolated.
Args: pred_lines (array or tensor): shape (m, num_pts, 2 or 3) gt_lines (array or tensor): shape (n, num_pts, 2 or 3) Returns: distance (array): chamfer distance ''' _, num_pts, coord_dims = pred_lines.shape if not isinstance(pred_lines, torch.Tensor): pred_lines = torch.tensor(pred_lines) if not isinstance(gt_lines, torch.Tensor): gt_lines = torch.tensor(gt_lines) dist_mat = torch.cdist(pred_lines.view(-1, coord_dims), gt_lines.view(-1, coord_dims), p=2) # (num_query*num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts)) # (num_query, num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=-1)) # (num_gt, num_q, num_pts, num_pts) dist1 = dist_mat.min(-1)[0].sum(-1) dist2 = dist_mat.min(-2)[0].sum(-1) dist_matrix = (dist1 + dist2).transpose(0, 1) / (2 * num_pts) return dist_matrix.numpy() ================================================ FILE: plugin/datasets/evaluation/raster_eval.py ================================================ import torch from mmdet3d.datasets import build_dataset, build_dataloader import mmcv from functools import cached_property import prettytable from numpy.typing import NDArray from typing import Dict, Optional from logging import Logger from mmcv import Config from copy import deepcopy N_WORKERS = 16 class RasterEvaluate(object): """Evaluator for rasterized map. Args: dataset_cfg (Config): dataset cfg for gt n_workers (int): num workers to parallel """ def __init__(self, dataset_cfg: Config, n_workers: int=N_WORKERS): self.dataset = build_dataset(dataset_cfg) self.dataloader = build_dataloader( self.dataset, samples_per_gpu=1, workers_per_gpu=n_workers, shuffle=False, dist=False) self.cat2id = self.dataset.cat2id self.id2cat = {v: k for k, v in self.cat2id.items()} self.n_workers = n_workers @cached_property def gts(self) -> Dict[str, NDArray]: print('collecting gts...') gts = {} for data in mmcv.track_iter_progress(self.dataloader): token = deepcopy(data['img_metas'].data[0][0]['token']) gt = deepcopy(data['semantic_mask'].data[0][0]) gts[token] = gt del data # avoid dataloader memory crash return gts def evaluate(self, result_path: str, logger: Optional[Logger]=None) -> Dict[str, float]: ''' Do evaluation for a submission file and print evalution results to `logger` if specified. The submission will be aligned by tokens before evaluation. Args: result_path (str): path to submission file logger (Logger): logger to print evaluation result, Default: None Returns: result_dict (Dict): evaluation results. IoU by categories. 
''' results = mmcv.load(result_path) meta = results['meta'] results = results['results'] result_dict = {} gts = [] preds = [] for token, gt in self.gts.items(): gts.append(gt) pred = torch.zeros((len(self.cat2id), gt.shape[1], gt.shape[2])).bool() if token in results: semantic_mask = torch.tensor(results[token]['semantic_mask']) for label_i in range(gt.shape[0]): pred[label_i] = (semantic_mask == label_i+1) preds.append(pred) preds = torch.stack(preds).bool() gts = torch.stack(gts).bool() # TODO: flip the gt gts = torch.flip(gts, [2,]) # for every label total = 0 for i in range(gts.shape[1]): category = self.id2cat[i] pred = preds[:, i] gt = gts[:, i] intersect = (pred & gt).sum().float().item() union = (pred | gt).sum().float().item() result_dict[category] = intersect / (union + 1e-7) total += result_dict[category] mIoU = total / gts.shape[1] result_dict['mIoU'] = mIoU categories = list(self.cat2id.keys()) table = prettytable.PrettyTable([' ', *categories, 'mean']) table.add_row(['IoU', *[round(result_dict[cat], 4) for cat in categories], round(mIoU, 4)]) if logger: from mmcv.utils import print_log print_log('\n'+str(table), logger=logger) print_log(f'mIoU = {mIoU:.4f}\n', logger=logger) return result_dict ================================================ FILE: plugin/datasets/evaluation/vector_eval.py ================================================ from functools import partial import numpy as np from multiprocessing import Pool from mmdet3d.datasets import build_dataset, build_dataloader import mmcv from .AP import instance_match, average_precision import prettytable from time import time from functools import cached_property from shapely.geometry import LineString from numpy.typing import NDArray from typing import Dict, List, Optional from logging import Logger from mmcv import Config from copy import deepcopy import os INTERP_NUM = 200 # number of points to interpolate during evaluation THRESHOLDS = [0.5, 1.0, 1.5] # AP thresholds N_WORKERS = 16 # num workers to parallel SAMPLE_DIST = 0.15 class VectorEvaluate(object): """Evaluator for vectorized map. 
Args: dataset_cfg (Config): dataset cfg for gt n_workers (int): num workers to parallel """ def __init__(self, dataset_cfg: Config, n_workers: int=N_WORKERS) -> None: self.dataset = build_dataset(dataset_cfg) self.cat2id = self.dataset.cat2id self.id2cat = {v: k for k, v in self.cat2id.items()} self.n_workers = n_workers self.new_split = 'newsplit' in self.dataset.ann_file self.roi_size = self.dataset.roi_size if self.roi_size == (60, 30): self.thresholds = [0.5, 1.0, 1.5] elif self.roi_size == (100, 50): self.thresholds = [1.0, 1.5, 2.0] @cached_property def gts(self) -> Dict[str, Dict[int, List[NDArray]]]: roi_size = self.dataset.roi_size if 'av2' in self.dataset.ann_file: dataset = 'av2' else: dataset = 'nusc' if self.new_split: tmp_file = f'./tmp_gts_{dataset}_{roi_size[0]}x{roi_size[1]}_newsplit.pkl' else: tmp_file = f'./tmp_gts_{dataset}_{roi_size[0]}x{roi_size[1]}.pkl' if os.path.exists(tmp_file): print(f'loading cached gts from {tmp_file}') gts = mmcv.load(tmp_file) return gts print('collecting gts...') gts = {} self.dataloader = build_dataloader( self.dataset, samples_per_gpu=1, workers_per_gpu=self.n_workers, shuffle=False, dist=False) pbar = mmcv.ProgressBar(len(self.dataloader)) for data in self.dataloader: token = deepcopy(data['img_metas'].data[0][0]['token']) gt = deepcopy(data['vectors'].data[0][0]) gts[token] = gt pbar.update() del data # avoid dataloader memory crash if not os.path.exists(tmp_file): print(f"saving gt to {tmp_file}") mmcv.dump(gts, tmp_file) return gts def interp_fixed_num(self, vector: NDArray, num_pts: int) -> NDArray: ''' Interpolate a polyline. Args: vector (array): line coordinates, shape (M, 2) num_pts (int): Returns: sampled_points (array): interpolated coordinates ''' line = LineString(vector) distances = np.linspace(0, line.length, num_pts) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def interp_fixed_dist(self, vector: NDArray, sample_dist: float) -> NDArray: ''' Interpolate a line at fixed interval. Args: vector (LineString): vector sample_dist (float): sample interval Returns: points (array): interpolated points, shape (N, 2) ''' line = LineString(vector) distances = list(np.arange(sample_dist, line.length, sample_dist)) # make sure to sample at least two points when sample_dist > line.length distances = [0,] + distances + [line.length,] sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def _evaluate_single(self, pred_vectors: List, scores: List, groundtruth: List, thresholds: List, metric: str='metric') -> Dict[int, NDArray]: ''' Do single-frame matching for one class. Args: pred_vectors (List): List[vector(ndarray) (different length)], scores (List): List[score(float)] groundtruth (List): List of vectors thresholds (List): List of thresholds Returns: tp_fp_score_by_thr (Dict): matching results at different thresholds e.g. 
{0.5: (M, 2), 1.0: (M, 2), 1.5: (M, 2)} ''' pred_lines = [] # interpolate predictions for vector in pred_vectors: vector = np.array(vector) vector_interp = self.interp_fixed_num(vector, INTERP_NUM) pred_lines.append(vector_interp) if pred_lines: pred_lines = np.stack(pred_lines) else: pred_lines = np.zeros((0, INTERP_NUM, 2)) # interpolate groundtruth gt_lines = [] for vector in groundtruth: vector_interp = self.interp_fixed_num(vector, INTERP_NUM) gt_lines.append(vector_interp) if gt_lines: gt_lines = np.stack(gt_lines) else: gt_lines = np.zeros((0, INTERP_NUM, 2)) scores = np.array(scores) tp_fp_list = instance_match(pred_lines, scores, gt_lines, thresholds, metric) # (M, 2) tp_fp_score_by_thr = {} for i, thr in enumerate(thresholds): tp, fp = tp_fp_list[i] tp_fp_score = np.hstack([tp[:, None], fp[:, None], scores[:, None]]) tp_fp_score_by_thr[thr] = tp_fp_score return tp_fp_score_by_thr # {0.5: (M, 2), 1.0: (M, 2), 1.5: (M, 2)} def evaluate(self, result_path: str, metric: str='chamfer', logger: Optional[Logger]=None) -> Dict[str, float]: ''' Do evaluation for a submission file and print evalution results to `logger` if specified. The submission will be aligned by tokens before evaluation. We use multi-worker to speed up. Args: result_path (str): path to submission file metric (str): distance metric. Default: 'chamfer' logger (Logger): logger to print evaluation result, Default: None Returns: new_result_dict (Dict): evaluation results. AP by categories. ''' results = mmcv.load(result_path) results = results['results'] # re-group samples and gt by label samples_by_cls = {label: [] for label in self.id2cat.keys()} num_gts = {label: 0 for label in self.id2cat.keys()} num_preds = {label: 0 for label in self.id2cat.keys()} # align by token for token, gt in self.gts.items(): if token in results.keys(): pred = results[token] else: pred = {'vectors': [], 'scores': [], 'labels': []} # for every sample vectors_by_cls = {label: [] for label in self.id2cat.keys()} scores_by_cls = {label: [] for label in self.id2cat.keys()} for i in range(len(pred['labels'])): # i-th pred line in sample label = pred['labels'][i] vector = pred['vectors'][i] score = pred['scores'][i] vectors_by_cls[label].append(vector) scores_by_cls[label].append(score) for label in self.id2cat.keys(): new_sample = (vectors_by_cls[label], scores_by_cls[label], gt[label]) num_gts[label] += len(gt[label]) num_preds[label] += len(scores_by_cls[label]) samples_by_cls[label].append(new_sample) result_dict = {} print(f'\nevaluating {len(self.id2cat)} categories...') start = time() if self.n_workers > 0: pool = Pool(self.n_workers) sum_mAP = 0 pbar = mmcv.ProgressBar(len(self.id2cat)) for label in self.id2cat.keys(): samples = samples_by_cls[label] # List[(pred_lines, scores, gts)] result_dict[self.id2cat[label]] = { 'num_gts': num_gts[label], 'num_preds': num_preds[label] } sum_AP = 0 fn = partial(self._evaluate_single, thresholds=self.thresholds, metric=metric) if self.n_workers > 0: tpfp_score_list = pool.starmap(fn, samples) else: tpfp_score_list = [] for sample in samples: tpfp_score_list.append(fn(*sample)) for thr in self.thresholds: tp_fp_score = [i[thr] for i in tpfp_score_list] tp_fp_score = np.vstack(tp_fp_score) # (num_dets, 3) sort_inds = np.argsort(-tp_fp_score[:, -1]) tp = tp_fp_score[sort_inds, 0] # (num_dets,) fp = tp_fp_score[sort_inds, 1] # (num_dets,) tp = np.cumsum(tp, axis=0) fp = np.cumsum(fp, axis=0) eps = np.finfo(np.float32).eps recalls = tp / np.maximum(num_gts[label], eps) precisions = tp / np.maximum((tp + fp), 
eps) AP = average_precision(recalls, precisions, 'area') sum_AP += AP result_dict[self.id2cat[label]].update({f'AP@{thr}': AP}) pbar.update() AP = sum_AP / len(self.thresholds) sum_mAP += AP result_dict[self.id2cat[label]].update({f'AP': AP}) if self.n_workers > 0: pool.close() mAP = sum_mAP / len(self.id2cat.keys()) result_dict.update({'mAP': mAP}) print(f"finished in {time() - start:.2f}s") # print results table = prettytable.PrettyTable(['category', 'num_preds', 'num_gts'] + [f'AP@{thr}' for thr in self.thresholds] + ['AP']) for label in self.id2cat.keys(): table.add_row([ self.id2cat[label], result_dict[self.id2cat[label]]['num_preds'], result_dict[self.id2cat[label]]['num_gts'], *[round(result_dict[self.id2cat[label]][f'AP@{thr}'], 4) for thr in self.thresholds], round(result_dict[self.id2cat[label]]['AP'], 4), ]) from mmcv.utils import print_log print_log('\n'+str(table), logger=logger) mAP_normal = 0 for label in self.id2cat.keys(): for thr in self.thresholds: mAP_normal += result_dict[self.id2cat[label]][f'AP@{thr}'] mAP_normal = mAP_normal / 9 print_log(f'mAP_normal = {mAP_normal:.4f}\n', logger=logger) # print_log(f'mAP_hard = {mAP_easy:.4f}\n', logger=logger) new_result_dict = {} for name in self.cat2id: new_result_dict[name] = result_dict[name]['AP'] return new_result_dict ================================================ FILE: plugin/datasets/map_utils/av2map_extractor.py ================================================ from av2.map.map_api import ArgoverseStaticMap from pathlib import Path from shapely.geometry import LineString, box, Polygon from shapely import ops import numpy as np from .utils import split_collections, get_drivable_area_contour, \ get_ped_crossing_contour, remove_repeated_lines, transform_from, \ connect_lines, remove_boundary_dividers, remove_repeated_lanesegment, reassign_graph_attribute from numpy.typing import NDArray from typing import Dict, List, Tuple, Union from av2.geometry.se3 import SE3 from nuscenes.map_expansion.map_api import NuScenesMapExplorer import networkx as nx from nuscenes.eval.common.utils import quaternion_yaw, Quaternion from shapely.geometry import Polygon, LineString, box, MultiPolygon, MultiLineString from shapely.strtree import STRtree from shapely.geometry import CAP_STYLE, JOIN_STYLE class AV2MapExtractor(object): """Argoverse 2 map ground-truth extractor. 
Args: roi_size (tuple or list): bev range id2map (dict): log id to map json path """ def __init__(self, roi_size: Union[Tuple, List], id2map: Dict) -> None: self.roi_size = roi_size self.id2map = {} for log_id, path in id2map.items(): self.id2map[log_id] = ArgoverseStaticMap.from_json(Path(path)) def generate_nearby_dividers(self,avm, e2g_translation, e2g_rotation,patch): def get_path(ls_dict): pts_G = nx.DiGraph() junction_pts_list = [] tmp=ls_dict for key, value in tmp.items(): centerline_geom = LineString(value['polyline'].xyz) centerline_pts = np.array(centerline_geom.coords).round(3) start_pt = centerline_pts[0] end_pt = centerline_pts[-1] for idx, pts in enumerate(centerline_pts[:-1]): pts_G.add_edge(tuple(centerline_pts[idx]),tuple(centerline_pts[idx+1])) valid_incoming_num = 0 for idx, pred in enumerate(value['predecessors']): if pred in tmp.keys(): valid_incoming_num += 1 pred_geom = LineString(tmp[pred]['polyline'].xyz) pred_pt = np.array(pred_geom.coords).round(3)[-1] if pred_pt[0] == start_pt[0] and pred_pt[1] == start_pt[1] and pred_pt[2] == start_pt[2]: pass else: pts_G.add_edge(tuple(pred_pt), tuple(start_pt)) if valid_incoming_num > 1: junction_pts_list.append(tuple(start_pt)) valid_outgoing_num = 0 for idx, succ in enumerate(value['successors']): if succ in tmp.keys(): valid_outgoing_num += 1 succ_geom = LineString(tmp[succ]['polyline'].xyz) succ_pt = np.array(succ_geom.coords).round(3)[0] if end_pt[0] == succ_pt[0] and end_pt[1] == succ_pt[1] and end_pt[2] == succ_pt[2]: pass else: pts_G.add_edge(tuple(end_pt), tuple(succ_pt)) if valid_outgoing_num > 1: junction_pts_list.append(tuple(end_pt)) roots = (v for v, d in pts_G.in_degree() if d == 0) roots_list = [v for v, d in pts_G.in_degree() if d == 0] notroot_list = [v for v in pts_G.nodes if v not in roots_list] leaves = [v for v,d in pts_G.out_degree() if d==0] ### find path from each root to leaves all_paths = [] for root in roots: for leave in leaves: paths = nx.all_simple_paths(pts_G, root, leave) all_paths.extend(paths) for single_path in all_paths: for single_node in single_path: if single_node in notroot_list: notroot_list.remove(single_node) final_centerline_paths = [] for path in all_paths: merged_line = LineString(path) # pdb.set_trace() merged_line = merged_line.simplify(0.2, preserve_topology=True) final_centerline_paths.append(merged_line) local_centerline_paths = final_centerline_paths return local_centerline_paths left_lane_dict = {} right_lane_dict = {} scene_ls_list = avm.get_scenario_lane_segments() scene_ls_dict = dict() for ls in scene_ls_list: scene_ls_dict[ls.id] = dict( ls=ls, polygon = Polygon(ls.polygon_boundary), predecessors=ls.predecessors, successors=ls.successors ) nearby_ls_dict = dict() for key, value in scene_ls_dict.items(): polygon = value['polygon'] if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: nearby_ls_dict[key] = value['ls'] ls_dict = nearby_ls_dict divider_ls_dict = dict() for key, value in ls_dict.items(): if not value.is_intersection: divider_ls_dict[key] = value left_lane_dict = {} right_lane_dict = {} for key,value in divider_ls_dict.items(): if value.left_neighbor_id is not None: left_lane_dict[key] = dict( polyline=value.left_lane_boundary, predecessors = value.predecessors, successors = value.successors, left_neighbor_id = value.left_neighbor_id, ) if value.right_neighbor_id is not None: right_lane_dict[key] = dict( polyline = value.right_lane_boundary, predecessors = value.predecessors, successors = value.successors, 
right_neighbor_id = value.right_neighbor_id, ) for key, value in left_lane_dict.items(): if value['left_neighbor_id'] in right_lane_dict.keys(): del right_lane_dict[value['left_neighbor_id']] for key, value in right_lane_dict.items(): if value['right_neighbor_id'] in left_lane_dict.keys(): del left_lane_dict[value['right_neighbor_id']] left_lane_dict = remove_repeated_lanesegment(left_lane_dict) right_lane_dict = remove_repeated_lanesegment(right_lane_dict) left_lane_dict = reassign_graph_attribute(left_lane_dict) right_lane_dict = reassign_graph_attribute(right_lane_dict) left_paths = get_path(left_lane_dict) right_paths = get_path(right_lane_dict) local_dividers = left_paths + right_paths return local_dividers def proc_polygon(self,polygon, ego_SE3_city): interiors = [] exterior_cityframe = np.array(list(polygon.exterior.coords)) exterior_egoframe = ego_SE3_city.transform_point_cloud(exterior_cityframe) for inter in polygon.interiors: inter_cityframe = np.array(list(inter.coords)) inter_egoframe = ego_SE3_city.transform_point_cloud(inter_cityframe) interiors.append(inter_egoframe[:,:3]) new_polygon = Polygon(exterior_egoframe[:,:3], interiors) return new_polygon def proc_line(self,line,ego_SE3_city): new_line_pts_cityframe = np.array(list(line.coords)) new_line_pts_egoframe = ego_SE3_city.transform_point_cloud(new_line_pts_cityframe) line = LineString(new_line_pts_egoframe[:,:3]) #TODO return line def extract_local_divider(self,nearby_dividers, ego_SE3_city, patch_box, patch_angle,patch_size): patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) # pdb.set_trace() # final_pgeom = remove_repeated_lines(nearby_dividers) line_list = [] # pdb.set_trace() for line in nearby_dividers: if line.is_empty: # Skip lines without nodes. continue new_line = line.intersection(patch) if not new_line.is_empty: if new_line.geom_type == 'MultiLineString': for single_line in new_line.geoms: if single_line.is_empty: continue single_line = self.proc_line(single_line,ego_SE3_city) line_list.append(single_line) else: new_line = self.proc_line(new_line, ego_SE3_city) line_list.append(new_line) centerlines = line_list poly_centerlines = [line.buffer(0.1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) for line in centerlines] index_by_id = dict((id(pt), i) for i, pt in enumerate(poly_centerlines)) tree = STRtree(poly_centerlines) final_pgeom = [] remain_idx = [i for i in range(len(centerlines))] for i, pline in enumerate(poly_centerlines): if i not in remain_idx: continue remain_idx.pop(remain_idx.index(i)) final_pgeom.append(centerlines[i]) for o in tree.query(pline): o_idx = index_by_id[id(o)] if o_idx not in remain_idx: continue inter = o.intersection(pline).area union = o.union(pline).area iou = inter / union if iou >= 0.90: remain_idx.pop(remain_idx.index(o_idx)) # return [np.array(line.coords) for line in final_pgeom] final_pgeom = connect_lines(final_pgeom) return final_pgeom def extract_local_boundary(self,avm, ego_SE3_city, patch_box, patch_angle,patch_size): boundary_list = [] patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) for da in avm.get_scenario_vector_drivable_areas(): boundary_list.append(da.xyz) polygon_list = [] for da in boundary_list: exterior_coords = da interiors = [] # polygon = Polygon(exterior_coords, interiors) polygon = Polygon(exterior_coords, interiors) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: if new_polygon.geom_type is 'Polygon': if not new_polygon.is_valid: continue new_polygon = 
self.proc_polygon(new_polygon,ego_SE3_city) if not new_polygon.is_valid: continue elif new_polygon.geom_type is 'MultiPolygon': polygons = [] for single_polygon in new_polygon.geoms: if not single_polygon.is_valid or single_polygon.is_empty: continue new_single_polygon = self.proc_polygon(single_polygon,ego_SE3_city) if not new_single_polygon.is_valid: continue polygons.append(new_single_polygon) if len(polygons) == 0: continue new_polygon = MultiPolygon(polygons) if not new_polygon.is_valid: continue else: raise ValueError('{} is not valid'.format(new_polygon.geom_type)) if new_polygon.geom_type is 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) union_segments = ops.unary_union(polygon_list) max_x = patch_size[1] / 2 max_y = patch_size[0] / 2 local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = list(inter.coords)[::-1] lines = inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) boundary_lines = [] for line in results: if not line.is_empty: if line.geom_type == 'MultiLineString': for single_line in line.geoms: boundary_lines.append(single_line) elif line.geom_type == 'LineString': boundary_lines.append(line) else: raise NotImplementedError return boundary_lines def get_scene_dividers(self,avm,patch_box,patch_angle): patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) scene_ls_list = avm.get_scenario_lane_segments() # pdb.set_trace() scene_ls_dict = dict() for ls in scene_ls_list: scene_ls_dict[ls.id] = dict( ls=ls, polygon = Polygon(ls.polygon_boundary), predecessors=ls.predecessors, successors=ls.successors ) nearby_ls_dict = dict() for key, value in scene_ls_dict.items(): polygon = value['polygon'] if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: nearby_ls_dict[key] = value['ls'] ls_dict = nearby_ls_dict divider_ls_dict = dict() for key, value in ls_dict.items(): if not value.is_intersection: divider_ls_dict[key] = value return divider_ls_dict def get_scene_ped_crossings(self,avm,e2g_translation,e2g_rotation,polygon_ped=True): g2e_translation = e2g_rotation.T.dot(-e2g_translation) g2e_rotation = e2g_rotation.T roi_x, roi_y = self.roi_size[:2] local_patch = box(-roi_x / 2, -roi_y / 2, roi_x / 2, roi_y / 2) ped_crossings = [] for _, pc in avm.vector_pedestrian_crossings.items(): edge1_xyz = pc.edge1.xyz edge2_xyz = pc.edge2.xyz ego1_xyz = transform_from(edge1_xyz, g2e_translation, g2e_rotation) ego2_xyz = transform_from(edge2_xyz, g2e_translation, g2e_rotation) # if True, organize each ped crossing as closed polylines. if polygon_ped: vertices = np.concatenate([ego1_xyz, ego2_xyz[::-1, :]]) p = Polygon(vertices) line = get_ped_crossing_contour(p, local_patch) if line is not None: if len(line.coords) < 3 or Polygon(line).area < 1: continue ped_crossings.append(line) # Otherwise organize each ped crossing as two parallel polylines. 
else: line1 = LineString(ego1_xyz) line2 = LineString(ego2_xyz) line1_local = line1.intersection(local_patch) line2_local = line2.intersection(local_patch) # take the whole ped cross if all two edges are in roi range if not line1_local.is_empty and not line2_local.is_empty: ped_crossings.append(line1_local) ped_crossings.append(line2_local) return ped_crossings def get_map_geom(self, log_id: str, e2g_translation: NDArray, e2g_rotation: NDArray, polygon_ped=True) -> Dict[str, List[Union[LineString, Polygon]]]: ''' Extract geometries given `log_id` and ego pose. Args: log_id (str): log id e2g_translation (array): ego2global translation, shape (3,) e2g_rotation (array): ego2global rotation matrix, shape (3, 3) polygon_ped: if True, organize each ped crossing as closed polylines. \ Otherwise organize each ped crossing as two parallel polylines. \ Default: True Returns: geometries (Dict): extracted geometries by category. ''' avm = self.id2map[log_id] patch_h = self.roi_size[1] patch_w = self.roi_size[0] patch_size = (patch_h, patch_w) map_pose = e2g_translation[:2] rotation = Quaternion._from_matrix(e2g_rotation) patch_box = (map_pose[0], map_pose[1], patch_size[0], patch_size[1]) patch_angle = quaternion_yaw(rotation) / np.pi * 180 city_SE2_ego = SE3(e2g_rotation, e2g_translation) ego_SE3_city = city_SE2_ego.inverse() patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) nearby_dividers = self.generate_nearby_dividers(avm, e2g_translation,e2g_rotation,patch) # pdb.set_trace() map_anno=dict( divider=[], ped_crossing=[], boundary=[], drivable_area=[], ) map_anno['ped_crossing'] = self.get_scene_ped_crossings(avm,e2g_translation,e2g_rotation,polygon_ped=polygon_ped) map_anno['boundary'] = self.extract_local_boundary(avm, ego_SE3_city, patch_box, patch_angle,patch_size) # map_anno['centerline'] = extract_local_centerline(nearby_centerlines, ego_SE3_city, patch_box, patch_angle,patch_size) all_dividers = self.extract_local_divider(nearby_dividers, ego_SE3_city, patch_box, patch_angle,patch_size) map_anno['divider'] = remove_boundary_dividers(all_dividers,map_anno['boundary']) ######## return map_anno ================================================ FILE: plugin/datasets/map_utils/nuscmap_extractor.py ================================================ from shapely.geometry import LineString, box, Polygon from shapely import ops, strtree import numpy as np from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer from nuscenes.eval.common.utils import quaternion_yaw from pyquaternion import Quaternion from .utils import split_collections, get_drivable_area_contour, get_ped_crossing_contour from numpy.typing import NDArray from typing import Dict, List, Tuple, Union from shapely.geometry import Polygon, MultiPolygon, LineString, Point, box, MultiLineString from shapely import affinity, ops import networkx as nx class NuscMapExtractor(object): """NuScenes map ground-truth extractor. 
Args: data_root (str): path to nuScenes dataset roi_size (tuple or list): bev range """ def __init__(self, data_root: str, roi_size: Union[List, Tuple]) -> None: self.roi_size = roi_size self.MAPS = ['boston-seaport', 'singapore-hollandvillage', 'singapore-onenorth', 'singapore-queenstown'] self.nusc_maps = {} self.map_explorer = {} for loc in self.MAPS: self.nusc_maps[loc] = NuScenesMap( dataroot=data_root, map_name=loc) self.map_explorer[loc] = CNuScenesMapExplorer(self.nusc_maps[loc]) def get_map_geom(self, location: str, e2g_translation: Union[List, NDArray], e2g_rotation: Union[List, NDArray]) -> Dict[str, List[Union[LineString, Polygon]]]: # Borrowed from MapTR's codebase to make sure data are the same # (center_x, center_y, len_y, len_x) in nuscenes format patch_size_ego_coord = (self.roi_size[1], self.roi_size[0]) patch_size_lidar_coord = (self.roi_size[0], self.roi_size[1]) vector_map_maptr = VectorizedLocalMap(self.nusc_maps[location], self.map_explorer[location], patch_size_lidar_coord, patch_size_ego_coord, map_classes=['divider','ped_crossing','boundary']) map_annos = vector_map_maptr.gen_vectorized_samples(e2g_translation, e2g_rotation) return dict( divider=map_annos['divider'], # List[LineString] ped_crossing=map_annos['ped_crossing'], # List[LineString] boundary=map_annos['boundary'], # List[LineString] drivable_area=[], # List[Polygon], ) class VectorizedLocalMap(object): CLASS2LABEL = { 'road_divider': 0, 'lane_divider': 0, 'ped_crossing': 1, 'contours': 2, 'others': -1 } def __init__(self, nusc_map, map_explorer, patch_size, roi_size, map_classes=['divider','ped_crossing','boundary','centerline'], line_classes=['road_divider', 'lane_divider'], ped_crossing_classes=['ped_crossing'], contour_classes=['road_segment', 'lane'], centerline_classes=['lane_connector','lane'], use_simplify=True, ): super().__init__() self.nusc_map = nusc_map self.map_explorer = map_explorer self.vec_classes = map_classes self.line_classes = line_classes self.ped_crossing_classes = ped_crossing_classes self.polygon_classes = contour_classes self.centerline_classes = centerline_classes self.patch_size = patch_size self.roi_size = roi_size self.local_patch = box(-self.roi_size[0] / 2, -self.roi_size[1] / 2, self.roi_size[0] / 2, self.roi_size[1] / 2) def gen_vectorized_samples(self, lidar2global_translation, lidar2global_rotation): ''' use lidar2global to get gt map layers ''' map_pose = lidar2global_translation[:2] rotation = Quaternion(lidar2global_rotation) # import ipdb;ipdb.set_trace() patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1]) patch_angle = quaternion_yaw(rotation) / np.pi * 180 map_dict = {'divider':[],'ped_crossing':[],'boundary':[],'centerline':[]} vectors = [] for vec_class in self.vec_classes: if vec_class == 'divider': line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes) line_instances_dict = self.line_geoms_to_instances(line_geom) for line_type, instances in line_instances_dict.items(): for instance in instances: instance = affinity.rotate(instance, -90, origin=(0, 0), use_radians=False) map_dict[vec_class].append(instance) # vectors.append((instance, self.CLASS2LABEL.get(line_type, -1))) elif vec_class == 'ped_crossing': ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes) ped_instance_list = ped_geom['ped_crossing'] #ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom) for instance in ped_instance_list: # vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1))) instance = 
affinity.rotate(instance, -90, origin=(0, 0), use_radians=False) map_dict[vec_class].append(instance) elif vec_class == 'boundary': polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes) poly_bound_list = self.poly_geoms_to_instances(polygon_geom) for instance in poly_bound_list: # import ipdb;ipdb.set_trace() instance = affinity.rotate(instance, -90, origin=(0, 0), use_radians=False) map_dict[vec_class].append(instance) # vectors.append((contour, self.CLASS2LABEL.get('contours', -1))) elif vec_class =='centerline': centerline_geom = self.get_centerline_geom(patch_box, patch_angle, self.centerline_classes) centerline_list = self.centerline_geoms_to_instances(centerline_geom) for instance in centerline_list: instance = affinity.rotate(instance, -90, origin=(0, 0), use_radians=False) map_dict[vec_class].append(instance) else: raise ValueError(f'WRONG vec_class: {vec_class}') return map_dict def get_centerline_geom(self, patch_box, patch_angle, layer_names): map_geom = {} for layer_name in layer_names: if layer_name in self.centerline_classes: return_token = False layer_centerline_dict = self.map_explorer._get_centerline( patch_box, patch_angle, layer_name, return_token=return_token) if len(layer_centerline_dict.keys()) == 0: continue # import ipdb;ipdb.set_trace() map_geom.update(layer_centerline_dict) return map_geom def get_map_geom(self, patch_box, patch_angle, layer_names): map_geom = {} for layer_name in layer_names: if layer_name in self.line_classes: geoms = self.get_divider_line(patch_box, patch_angle, layer_name) # map_geom.append((layer_name, geoms)) map_geom[layer_name] = geoms elif layer_name in self.polygon_classes: geoms = self.get_contour_line(patch_box, patch_angle, layer_name) # map_geom.append((layer_name, geoms)) map_geom[layer_name] = geoms elif layer_name in self.ped_crossing_classes: geoms = self.get_ped_crossing_line_stmmapnet(patch_box, patch_angle) # map_geom.append((layer_name, geoms)) map_geom[layer_name] = geoms return map_geom def get_divider_line(self,patch_box,patch_angle,layer_name): if layer_name not in self.map_explorer.map_api.non_geometric_line_layers: raise ValueError("{} is not a line layer".format(layer_name)) if layer_name == 'traffic_light': return None patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer.get_patch_coord(patch_box, patch_angle) line_list = [] records = getattr(self.map_explorer.map_api, layer_name) for record in records: line = self.map_explorer.map_api.extract_line(record['line_token']) if line.is_empty: # Skip lines without nodes. 
continue new_line = line.intersection(patch) if not new_line.is_empty: new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_line = affinity.affine_transform(new_line, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) line_list.append(new_line) return line_list def get_contour_line(self,patch_box,patch_angle,layer_name): if layer_name not in self.map_explorer.map_api.non_geometric_polygon_layers: raise ValueError('{} is not a polygonal layer'.format(layer_name)) patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer.get_patch_coord(patch_box, patch_angle) records = getattr(self.map_explorer.map_api, layer_name) polygon_list = [] if layer_name == 'drivable_area': for record in records: polygons = [self.map_explorer.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] for polygon in polygons: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) else: for record in records: polygon = self.map_explorer.map_api.extract_polygon(record['polygon_token']) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) return polygon_list def get_ped_crossing_line(self, patch_box, patch_angle): patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer.get_patch_coord(patch_box, patch_angle) polygon_list = [] records = getattr(self.map_explorer.map_api, 'ped_crossing') # records = getattr(self.nusc_maps[location], 'ped_crossing') for record in records: polygon = self.map_explorer.map_api.extract_polygon(record['polygon_token']) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) return polygon_list def _union_ped_stmmapnet(self, ped_geoms: List[Polygon]) -> List[Polygon]: ''' merge close ped crossings. 
Args: ped_geoms (list): list of Polygon Returns: union_ped_geoms (Dict): merged ped crossings ''' ped_geoms = sorted(ped_geoms, key=lambda x:x.area, reverse=True) def get_rec_direction(geom): rect = geom.minimum_rotated_rectangle rect_v_p = np.array(rect.exterior.coords)[:3] rect_v = rect_v_p[1:]-rect_v_p[:-1] v_len = np.linalg.norm(rect_v, axis=-1) longest_v_i = v_len.argmax() return rect_v[longest_v_i], v_len[longest_v_i] tree = strtree.STRtree(ped_geoms) index_by_id = dict((id(pt), i) for i, pt in enumerate(ped_geoms)) final_pgeom = [] remain_idx = [i for i in range(len(ped_geoms))] for i, pgeom in enumerate(ped_geoms): if i not in remain_idx: continue # update remain_idx.pop(remain_idx.index(i)) pgeom_v, pgeom_v_norm = get_rec_direction(pgeom) final_pgeom.append(pgeom) intersect_pgeom = tree.query(pgeom) intersect_pgeom = sorted(intersect_pgeom, key=lambda x:x.area, reverse=True) for o in intersect_pgeom: o_idx = index_by_id[id(o)] if o_idx not in remain_idx: continue o_v, o_v_norm = get_rec_direction(o) cos = pgeom_v.dot(o_v)/(pgeom_v_norm*o_v_norm) o_pgeom_union = o.union(pgeom) ch_union = o_pgeom_union.convex_hull ch_area_ratio = o_pgeom_union.area / ch_union.area # add an extra criterion for merging here to handle patch-boundary-case if 1 - np.abs(cos) < 0.01 and ch_area_ratio > 0.8: # theta < 8 degrees. final_pgeom[-1] =\ final_pgeom[-1].union(o) # update remain_idx.pop(remain_idx.index(o_idx)) final_pgeom = self._handle_small_peds(final_pgeom) results = [] for p in final_pgeom: results.extend(split_collections(p)) return results def _handle_small_peds(self, ped_geoms): def get_two_rec_directions(geom): rect = geom.minimum_rotated_rectangle rect_v_p = np.array(rect.exterior.coords)[:3] rect_v = rect_v_p[1:]-rect_v_p[:-1] v_len = np.linalg.norm(rect_v, axis=-1) return rect_v, v_len tree = strtree.STRtree(ped_geoms) index_by_id = dict((id(pt), i) for i, pt in enumerate(ped_geoms)) final_pgeom = [] remain_idx = [i for i in range(len(ped_geoms))] for i, pgeom in enumerate(ped_geoms): if i not in remain_idx: continue # update remain_idx.pop(remain_idx.index(i)) final_pgeom.append(pgeom) pgeom_v, pgeom_v_norm = get_two_rec_directions(pgeom) intersect_pgeom = tree.query(pgeom) intersect_pgeom = sorted(intersect_pgeom, key=lambda x:x.area, reverse=True) for o in intersect_pgeom: o_idx = index_by_id[id(o)] if o_idx not in remain_idx: continue if o.area >= pgeom.area: continue o_pgeom_union = o.union(pgeom) o_v, o_v_norm = get_two_rec_directions(o_pgeom_union) ch_union = o_pgeom_union.convex_hull ch_area_ratio = o_pgeom_union.area / ch_union.area #mrr_union = o_pgeom_union.minimum_rotated_rectangle #mrr_area_ratio = o_pgeom_union.area / mrr_union.area cos_00 = pgeom_v[0].dot(o_v[0])/(pgeom_v_norm[0]*o_v_norm[0]) cos_01 = pgeom_v[0].dot(o_v[1])/(pgeom_v_norm[0]*o_v_norm[1]) cos_10 = pgeom_v[1].dot(o_v[0])/(pgeom_v_norm[1]*o_v_norm[0]) cos_11 = pgeom_v[1].dot(o_v[1])/(pgeom_v_norm[1]*o_v_norm[1]) cos_checks = np.array([(1 - np.abs(cos) < 0.001) for cos in [cos_00, cos_01, cos_10, cos_11]]) # add an extra criterion for merging here to handle patch-boundary-case if cos_checks.sum() == 2 and ch_area_ratio > 0.8: final_pgeom[-1] =\ final_pgeom[-1].union(o) # update remain_idx.pop(remain_idx.index(o_idx)) return final_pgeom def get_ped_crossing_line_stmmapnet(self, patch_box, patch_angle): # get ped crossings ped_crossings = [] ped = self.map_explorer._get_layer_polygon( patch_box, patch_angle, 'ped_crossing') for p in ped: ped_crossings += split_collections(p) # some ped crossings are split into 
several small parts # we need to merge them ped_crossings = self._union_ped_stmmapnet(ped_crossings) # NOTE: clean-up noisy ped-crossing instances (for our cleaned training data only, maybe need to still # use the original version when evaluation...) # 1). filter too small ped_crossing merging results #areas = [p.area for p in ped_crossings] #print('Ped areas\n', areas) updated_ped_crossings = [] for p_idx, p in enumerate(ped_crossings): area = p.area if area < 1: continue elif area < 20: covered = False for other_idx, p_other in enumerate(ped_crossings): if other_idx != p_idx and p.covered_by(p_other): covered = True break if not covered: updated_ped_crossings.append(p) else: updated_ped_crossings.append(p) ped_crossing_lines = [] for p in updated_ped_crossings: # extract exteriors to get a closed polyline line = get_ped_crossing_contour(p, self.local_patch) if line is not None: ped_crossing_lines.append(line) return ped_crossing_lines def line_geoms_to_instances(self, line_geom): line_instances_dict = dict() for line_type, a_type_of_lines in line_geom.items(): one_type_instances = self._one_type_line_geom_to_instances(a_type_of_lines) line_instances_dict[line_type] = one_type_instances return line_instances_dict def _one_type_line_geom_to_instances(self, line_geom): line_instances = [] for line in line_geom: if not line.is_empty: if line.geom_type == 'MultiLineString': for single_line in line.geoms: line_instances.append(single_line) elif line.geom_type == 'LineString': line_instances.append(line) else: raise NotImplementedError return line_instances def ped_poly_geoms_to_instances(self, ped_geom): # ped = ped_geom[0][1] # import ipdb;ipdb.set_trace() ped = ped_geom['ped_crossing'] union_segments = ops.unary_union(ped) max_x = self.patch_size[1] / 2 max_y = self.patch_size[0] / 2 local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = list(inter.coords)[::-1] lines = inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) return self._one_type_line_geom_to_instances(results) def poly_geoms_to_instances(self, polygon_geom): roads = polygon_geom['road_segment'] lanes = polygon_geom['lane'] # import ipdb;ipdb.set_trace() union_roads = ops.unary_union(roads) union_lanes = ops.unary_union(lanes) union_segments = ops.unary_union([union_roads, union_lanes]) max_x = self.patch_size[1] / 2 max_y = self.patch_size[0] / 2 local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = 
list(inter.coords)[::-1] lines = inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) return self._one_type_line_geom_to_instances(results) def centerline_geoms_to_instances(self,geoms_dict): centerline_geoms_list,pts_G = self.union_centerline(geoms_dict) # vectors_dict = self.centerline_geoms2vec(centerline_geoms_list) # import ipdb;ipdb.set_trace() return self._one_type_line_geom_to_instances(centerline_geoms_list) def centerline_geoms2vec(self, centerline_geoms_list): vector_dict = {} # import ipdb;ipdb.set_trace() # centerline_geoms_list = [line.simplify(0.2, preserve_topology=True) \ # for line in centerline_geoms_list] vectors = self._geom_to_vectors( centerline_geoms_list) vector_dict.update({'centerline': ('centerline', vectors)}) return vector_dict def union_centerline(self, centerline_geoms): # import ipdb;ipdb.set_trace() pts_G = nx.DiGraph() junction_pts_list = [] for key, value in centerline_geoms.items(): centerline_geom = value['centerline'] if centerline_geom.geom_type == 'MultiLineString': start_pt = np.array(centerline_geom.geoms[0].coords).round(3)[0] end_pt = np.array(centerline_geom.geoms[-1].coords).round(3)[-1] for single_geom in centerline_geom.geoms: single_geom_pts = np.array(single_geom.coords).round(3) for idx, pt in enumerate(single_geom_pts[:-1]): pts_G.add_edge(tuple(single_geom_pts[idx]),tuple(single_geom_pts[idx+1])) elif centerline_geom.geom_type == 'LineString': centerline_pts = np.array(centerline_geom.coords).round(3) start_pt = centerline_pts[0] end_pt = centerline_pts[-1] for idx, pts in enumerate(centerline_pts[:-1]): pts_G.add_edge(tuple(centerline_pts[idx]),tuple(centerline_pts[idx+1])) else: raise NotImplementedError valid_incoming_num = 0 for idx, pred in enumerate(value['incoming_tokens']): if pred in centerline_geoms.keys(): valid_incoming_num += 1 pred_geom = centerline_geoms[pred]['centerline'] if pred_geom.geom_type == 'MultiLineString': pred_pt = np.array(pred_geom.geoms[-1].coords).round(3)[-1] # if pred_pt != centerline_pts[0]: pts_G.add_edge(tuple(pred_pt), tuple(start_pt)) else: pred_pt = np.array(pred_geom.coords).round(3)[-1] pts_G.add_edge(tuple(pred_pt), tuple(start_pt)) if valid_incoming_num > 1: junction_pts_list.append(tuple(start_pt)) valid_outgoing_num = 0 for idx, succ in enumerate(value['outgoing_tokens']): if succ in centerline_geoms.keys(): valid_outgoing_num += 1 succ_geom = centerline_geoms[succ]['centerline'] if succ_geom.geom_type == 'MultiLineString': succ_pt = np.array(succ_geom.geoms[0].coords).round(3)[0] # if pred_pt != centerline_pts[0]: pts_G.add_edge(tuple(end_pt), tuple(succ_pt)) else: succ_pt = np.array(succ_geom.coords).round(3)[0] pts_G.add_edge(tuple(end_pt), tuple(succ_pt)) if valid_outgoing_num > 1: junction_pts_list.append(tuple(end_pt)) roots = (v for v, d in pts_G.in_degree() if d == 0) leaves = [v for v, d in pts_G.out_degree() if d == 0] all_paths = [] for root in roots: paths = nx.all_simple_paths(pts_G, root, leaves) all_paths.extend(paths) final_centerline_paths = [] for path in all_paths: merged_line = LineString(path) merged_line = merged_line.simplify(0.2, preserve_topology=True) final_centerline_paths.append(merged_line) return final_centerline_paths, pts_G class CNuScenesMapExplorer(NuScenesMapExplorer): def __ini__(self, *args, **kwargs): super(self, CNuScenesMapExplorer).__init__(*args, **kwargs) def _get_centerline(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_name: str, return_token: bool 
= False) -> dict: """ Retrieve the centerline of a particular layer within the specified patch. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. :param layer_name: name of map layer to be extracted. :return: dict(token:record_dict, token:record_dict,...) """ if layer_name not in ['lane','lane_connector']: raise ValueError('{} is not a centerline layer'.format(layer_name)) patch_x = patch_box[0] patch_y = patch_box[1] patch = self.get_patch_coord(patch_box, patch_angle) records = getattr(self.map_api, layer_name) centerline_dict = dict() for record in records: if record['polygon_token'] is None: # import ipdb # ipdb.set_trace() continue polygon = self.map_api.extract_polygon(record['polygon_token']) # if polygon.intersects(patch) or polygon.within(patch): # if not polygon.is_valid: # print('within: {}, intersect: {}'.format(polygon.within(patch), polygon.intersects(patch))) # print('polygon token {} is_valid: {}'.format(record['polygon_token'], polygon.is_valid)) # polygon = polygon.buffer(0) if polygon.is_valid: # if within or intersect : new_polygon = polygon.intersection(patch) # new_polygon = polygon if not new_polygon.is_empty: centerline = self.map_api.discretize_lanes( record, 0.5) centerline = list(self.map_api.discretize_lanes([record['token']], 0.5).values())[0] centerline = LineString(np.array(centerline)[:,:2].round(3)) if centerline.is_empty: continue centerline = centerline.intersection(patch) if not centerline.is_empty: centerline = \ to_patch_coord(centerline, patch_angle, patch_x, patch_y) # centerline.coords = np.array(centerline.coords).round(3) # if centerline.geom_type != 'LineString': # import ipdb;ipdb.set_trace() record_dict = dict( centerline=centerline, token=record['token'], incoming_tokens=self.map_api.get_incoming_lane_ids(record['token']), outgoing_tokens=self.map_api.get_outgoing_lane_ids(record['token']), ) centerline_dict.update({record['token']: record_dict}) return centerline_dict def to_patch_coord(new_polygon, patch_angle, patch_x, patch_y): new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) return new_polygon ================================================ FILE: plugin/datasets/map_utils/utils.py ================================================ from shapely.geometry import LineString, box, Polygon, LinearRing from shapely.geometry.base import BaseGeometry from shapely import ops import numpy as np from scipy.spatial import distance from typing import List, Optional, Tuple from numpy.typing import NDArray def split_collections(geom: BaseGeometry) -> List[Optional[BaseGeometry]]: ''' Split Multi-geoms to list and check is valid or is empty. Args: geom (BaseGeometry): geoms to be split or validate. Returns: geometries (List): list of geometries. ''' assert geom.geom_type in ['MultiLineString', 'LineString', 'MultiPolygon', 'Polygon', 'GeometryCollection'], f"got geom type {geom.geom_type}" if 'Multi' in geom.geom_type: outs = [] for g in geom.geoms: if g.is_valid and not g.is_empty: outs.append(g) return outs else: if geom.is_valid and not geom.is_empty: return [geom,] else: return [] def get_drivable_area_contour(drivable_areas: List[Polygon], roi_size: Tuple) -> List[LineString]: ''' Extract drivable area contours to get list of boundaries. Args: drivable_areas (list): list of drivable areas. 
roi_size (tuple): bev range size Returns: boundaries (List): list of boundaries. ''' max_x = roi_size[0] / 2 max_y = roi_size[1] / 2 # a bit smaller than roi to avoid unexpected boundaries on edges local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) exteriors = [] interiors = [] for poly in drivable_areas: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: # NOTE: we make sure all exteriors are clock-wise # such that each boundary's right-hand-side is drivable area # and left-hand-side is walk way if ext.is_ccw: ext = LinearRing(list(ext.coords)[::-1]) lines = ext.intersection(local_patch) if lines.geom_type == 'GeometryCollection' and len(lines) == 0: continue if lines.geom_type == 'MultiLineString': lines = ops.linemerge(lines) assert lines.geom_type in ['MultiLineString', 'LineString'] results.extend(split_collections(lines)) for inter in interiors: # NOTE: we make sure all interiors are counter-clock-wise if not inter.is_ccw: inter = LinearRing(list(inter.coords)[::-1]) lines = inter.intersection(local_patch) if lines.geom_type == 'GeometryCollection' and len(lines) == 0: continue if lines.geom_type == 'MultiLineString': lines = ops.linemerge(lines) assert lines.geom_type in ['MultiLineString', 'LineString'] results.extend(split_collections(lines)) return results def get_ped_crossing_contour(polygon: Polygon, local_patch: box) -> Optional[LineString]: ''' Extract ped crossing contours to get a closed polyline. Different from `get_drivable_area_contour`, this function ensures a closed polyline. Args: polygon (Polygon): ped crossing polygon to be extracted. local_patch (tuple): local patch params Returns: line (LineString): a closed line ''' ext = polygon.exterior if not ext.is_ccw: ext = LinearRing(list(ext.coords)[::-1]) lines = ext.intersection(local_patch) if lines.type != 'LineString': # remove points in intersection results lines = [l for l in lines.geoms if l.geom_type != 'Point'] lines = ops.linemerge(lines) # same instance but not connected. if lines.type != 'LineString': ls = [] for l in lines.geoms: ls.append(np.array(l.coords)) lines = np.concatenate(ls, axis=0) lines = LineString(lines) if not lines.is_empty: start = list(lines.coords[0]) end = list(lines.coords[-1]) if not np.allclose(start, end, atol=1e-3): new_line = list(lines.coords) new_line.append(start) lines = LineString(new_line) # make ped cross closed return lines return None def remove_repeated_lines(lines: List[LineString]) -> List[LineString]: ''' Remove repeated dividers since each divider in argoverse2 is mentioned twice by both left lane and right lane. Args: lines (List): list of dividers Returns: lines (List): list of left dividers ''' new_lines = [] for line in lines: repeated = False for l in new_lines: length = min(line.length, l.length) # hand-crafted rule to check overlap # if line.buffer(0.01).intersection(l.buffer(0.01)).area \ # > 0.2 * length: # repeated = True # break area1 = line.buffer(0.1) area2 = l.buffer(0.1) inter = area1.intersection(area2).area union = area1.union(area2).area iou = inter / union if iou >= 0.90: repeated = True break if not repeated: new_lines.append(line) return new_lines def remove_repeated_lanesegment(lane_dict): ''' Remove repeated dividers since each divider in argoverse2 is mentioned twice by both left lane and right lane. 
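# --- Illustrative aside (not part of this file): a minimal, self-contained
# sketch of the buffered-IoU overlap test that `remove_repeated_lines` above
# (and `remove_repeated_lanesegment` below) rely on. Each polyline is dilated
# into a thin polygon with .buffer(), and the area IoU of the two polygons is
# thresholded; the radius/threshold values mirror the 0.1 / 0.90 used above.
from shapely.geometry import LineString

def _lines_look_duplicated(a, b, radius=0.1, thresh=0.9):
    pa, pb = a.buffer(radius), b.buffer(radius)
    inter = pa.intersection(pb).area
    union = pa.union(pb).area
    return union > 0 and inter / union >= thresh

# e.g. the same divider reported twice by the left and right lane:
# line = LineString([(0, 0), (10, 0)])
# _lines_look_duplicated(line, LineString(line.coords))  # identical geometry -> True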
Args: lines (List): list of dividers Returns: lines (List): list of left dividers ''' new_lane_dict = {} # for line in lines: for key, value in lane_dict.items(): repeated = False # for l in new_lines: for new_key, new_value in new_lane_dict.items(): # length = min(line.length, l.length) line = LineString(value['polyline'].xyz) l = LineString(new_value['polyline'].xyz) area1 = line.buffer(0.01) area2 = l.buffer(0.01) inter = area1.intersection(area2).area union = area1.union(area2).area iou = inter / union if iou >= 0.90: repeated = True break if not repeated: new_lane_dict[key] = value return new_lane_dict def reassign_graph_attribute(lane_dict): for key, value in lane_dict.items(): if len(value['predecessors']) > 0: if value['predecessors'][0] not in lane_dict.keys() or value['predecessors'][0]==key: value['predecessors'] = [] else: lane_dict[value['predecessors'][0]]['successors'] = [key] for key, value in lane_dict.items(): if len(value['successors']) > 0: if value['successors'][0] not in lane_dict.keys() or value['successors'][0]==key: value['successors'] = [] else: lane_dict[value['successors'][0]]['predecessors'] = [key] return lane_dict def remove_boundary_dividers(dividers: List[LineString], boundaries: List[LineString]) -> List[LineString]: ''' Some dividers overlaps with boundaries in argoverse2 dataset so we need to remove these dividers. Args: dividers (list): list of dividers boundaries (list): list of boundaries Returns: left_dividers (list): list of left dividers ''' for idx in range(len(dividers))[::-1]: divider = dividers[idx] for bound in boundaries: length = min(divider.length, bound.length) # hand-crafted rule to check overlap if divider.buffer(0.3).intersection(bound.buffer(0.3)).area \ > 0.2 * length: # the divider overlaps boundary dividers.pop(idx) break return dividers def connect_lines(lines: List[LineString]) -> List[LineString]: ''' Some dividers are split into multiple small parts so we need to connect these lines. Args: dividers (list): list of dividers boundaries (list): list of boundaries Returns: left_dividers (list): list of left dividers ''' new_lines = [] eps = 0.1 # threshold to identify continuous lines while len(lines) > 1: line1 = lines[0] merged_flag = False for i, line2 in enumerate(lines[1:]): # hand-crafted rule begin1 = list(line1.coords)[0] end1 = list(line1.coords)[-1] begin2 = list(line2.coords)[0] end2 = list(line2.coords)[-1] dist_matrix = distance.cdist([begin1, end1], [begin2, end2]) if dist_matrix[0, 0] < eps: coords = list(line2.coords)[::-1] + list(line1.coords) elif dist_matrix[0, 1] < eps: coords = list(line2.coords) + list(line1.coords) elif dist_matrix[1, 0] < eps: coords = list(line1.coords) + list(line2.coords) elif dist_matrix[1, 1] < eps: coords = list(line1.coords) + list(line2.coords)[::-1] else: continue new_line = LineString(coords) lines.pop(i + 1) lines[0] = new_line merged_flag = True break if merged_flag: continue new_lines.append(line1) lines.pop(0) if len(lines) == 1: new_lines.append(lines[0]) return new_lines def transform_from(xyz: NDArray, translation: NDArray, rotation: NDArray) -> NDArray: ''' Transform points between different coordinate system. 
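# --- Illustrative aside (hypothetical values, not part of this file): the
# `transform_from` helper defined here treats points as row vectors, so a
# rotation matrix R and translation t are applied as `xyz @ R.T + t`.
import numpy as np

R = np.array([[0., -1., 0.],
              [1.,  0., 0.],
              [0.,  0., 1.]])            # 90-degree rotation about the z-axis
t = np.array([1.0, 2.0, 0.0])
xyz = np.array([[1.0, 0.0, 0.0]])
new_xyz = xyz @ R.T + t                  # -> [[1., 3., 0.]]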
Args: xyz (array): original point coordinates translation (array): translation rotation (array): rotation matrix Returns: left_dividers (list): list of left dividers ''' new_xyz = xyz @ rotation.T + translation return new_xyz ================================================ FILE: plugin/datasets/nusc_dataset.py ================================================ from.base_dataset import BaseMapDataset from .map_utils.nuscmap_extractor import NuscMapExtractor from mmdet.datasets import DATASETS import numpy as np from .visualize.renderer import Renderer import mmcv from time import time from pyquaternion import Quaternion import pickle @DATASETS.register_module() class NuscDataset(BaseMapDataset): """NuScenes map dataset class. Args: ann_file (str): annotation file path cat2id (dict): category to class id roi_size (tuple): bev range eval_config (Config): evaluation config meta (dict): meta information pipeline (Config): data processing pipeline config interval (int): annotation load interval work_dir (str): path to work dir test_mode (bool): whether in test mode """ def __init__(self, data_root, **kwargs): super().__init__(**kwargs) self.map_extractor = NuscMapExtractor(data_root, self.roi_size) self.renderer = Renderer(self.cat2id, self.roi_size, 'nusc') def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations. """ start_time = time() ann = mmcv.load(ann_file) samples = ann[::self.interval] print(f'collected {len(samples)} samples in {(time() - start_time):.2f}s') self.samples = samples def load_matching(self, matching_file): with open(matching_file, 'rb') as pf: data = pickle.load(pf) total_samples = 0 for scene_name, info in data.items(): total_samples += len(info['sample_ids']) assert total_samples == len(self.samples), 'Matching info not matched with data samples' self.matching_meta = data print(f'loaded matching meta for {len(data)} scenes') def get_sample(self, idx): """Get data sample. For each sample, map extractor will be applied to extract map elements. 
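# --- Illustrative aside (made-up pose values, not part of this file): the
# transform chaining used in `get_sample` below packs each quaternion +
# translation pair into a 4x4 homogeneous matrix and composes them, so that
# lidar2global = ego2global @ lidar2ego.
import numpy as np
from pyquaternion import Quaternion

def _to_homogeneous(rotation_wxyz, translation):
    mat = np.eye(4)
    mat[:3, :3] = Quaternion(rotation_wxyz).rotation_matrix
    mat[:3, 3] = translation
    return mat

lidar2ego = _to_homogeneous([1.0, 0.0, 0.0, 0.0], [0.94, 0.0, 1.84])          # hypothetical calibration
ego2global = _to_homogeneous([0.7071068, 0.0, 0.0, 0.7071068], [600.0, 1600.0, 0.0])
lidar2global = ego2global @ lidar2ego
lidar2global_translation = lidar2global[:3, 3]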
Args: idx (int): data index Returns: result (dict): dict of input """ sample = self.samples[idx] location = sample['location'] lidar2ego = np.eye(4) lidar2ego[:3,:3] = Quaternion(sample['lidar2ego_rotation']).rotation_matrix lidar2ego[:3, 3] = sample['lidar2ego_translation'] ego2global = np.eye(4) ego2global[:3,:3] = Quaternion(sample['e2g_rotation']).rotation_matrix ego2global[:3, 3] = sample['e2g_translation'] # NOTE: The original StreamMapNet uses the ego location to query the map, # to align with the lidar-centered setting in MapTR, we made some modifiactions # here to switch to the lidar-center setting lidar2global = ego2global @ lidar2ego lidar2global_translation = list(lidar2global[:3, 3]) lidar2global_translation = [float(x) for x in lidar2global_translation] lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) map_geoms = self.map_extractor.get_map_geom(location, lidar2global_translation, lidar2global_rotation) lidar_shifted_e2g_translation = np.array(sample['e2g_translation']) lidar_shifted_e2g_translation[0] = lidar2global_translation[0] lidar_shifted_e2g_translation[1] = lidar2global_translation[1] lidar_shifted_e2g_translation = lidar_shifted_e2g_translation.tolist() e2g_rotation = sample['e2g_rotation'] lidar2global = np.eye(4) lidar2global[:3,:3] = Quaternion(e2g_rotation).rotation_matrix lidar2global[:3, 3] = lidar_shifted_e2g_translation global2lidar = np.linalg.inv(lidar2global) ego2lidar = global2lidar @ ego2global map_label2geom = {} for k, v in map_geoms.items(): if k in self.cat2id.keys(): map_label2geom[self.cat2id[k]] = v ego2img_rts = [] ego2cam_rts = [] for c in sample['cams'].values(): extrinsic, intrinsic = np.array( c['extrinsics']), np.array(c['intrinsics']) # ego coord to cam coord #ego2cam_rt = extrinsic cam2ego_rt = np.linalg.inv(extrinsic) cam2lidar_rt = ego2lidar @ cam2ego_rt lidar2cam_rt = np.linalg.inv(cam2lidar_rt) ego2cam_rt = lidar2cam_rt viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic ego2img_rt = (viewpad @ ego2cam_rt) ego2cam_rts.append(ego2cam_rt) ego2img_rts.append(ego2img_rt) input_dict = { 'location': location, 'token': sample['token'], 'img_filenames': [c['img_fpath'] for c in sample['cams'].values()], # intrinsics are 3x3 Ks 'cam_intrinsics': [c['intrinsics'] for c in sample['cams'].values()], # extrinsics are 4x4 tranform matrix, **ego2cam** 'cam_extrinsics': [c['extrinsics'] for c in sample['cams'].values()], 'ego2img': ego2img_rts, 'ego2cam': ego2cam_rts, 'map_geoms': map_label2geom, # {0: List[ped_crossing(LineString)], 1: ...} #'ego2global_translation': sample['e2g_translation'], #'ego2global_rotation': Quaternion(sample['e2g_rotation']).rotation_matrix.tolist(), 'ego2global_translation': lidar_shifted_e2g_translation, 'ego2global_rotation': Quaternion(e2g_rotation).rotation_matrix.tolist(), 'sample_idx': sample['sample_idx'], 'scene_name': sample['scene_name'], 'lidar2ego_translation': sample['lidar2ego_translation'], 'lidar2ego_rotation': sample['lidar2ego_rotation'], } return input_dict ================================================ FILE: plugin/datasets/pipelines/__init__.py ================================================ from .loading import LoadMultiViewImagesFromFiles from .formating import FormatBundleMap from .transform import ResizeMultiViewImages, PadMultiViewImages, Normalize3D, PhotoMetricDistortionMultiViewImage from .rasterize import RasterizeMap, PV_Map from .vectorize import VectorizeMap __all__ = [ 'LoadMultiViewImagesFromFiles', 'FormatBundleMap', 'Normalize3D', 
'ResizeMultiViewImages', 'PadMultiViewImages', 'RasterizeMap', 'PV_Map', 'VectorizeMap', 'PhotoMetricDistortionMultiViewImage' ] ================================================ FILE: plugin/datasets/pipelines/formating.py ================================================ import numpy as np from mmcv.parallel import DataContainer as DC from mmdet3d.core.points import BasePoints from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import to_tensor @PIPELINES.register_module() class FormatBundleMap(object): """Format data for map tasks and then collect data for model input. These fields are formatted as follows. - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True) - semantic_mask (if exists): (1) to tensor, (2) to DataContainer (stack=True) - vectors (if exists): (1) to DataContainer (cpu_only=True) - img_metas: (1) to DataContainer (cpu_only=True) """ def __init__(self, process_img=True, keys=['img', 'semantic_mask', 'vectors'], meta_keys=['intrinsics', 'extrinsics']): self.process_img = process_img self.keys = keys self.meta_keys = meta_keys def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ # Format 3D data if 'points' in results: assert isinstance(results['points'], BasePoints) results['points'] = DC(results['points'].tensor) for key in ['voxels', 'coors', 'voxel_centers', 'num_points']: if key not in results: continue results[key] = DC(to_tensor(results[key]), stack=False) if 'img' in results and self.process_img: if isinstance(results['img'], list): # process multiple imgs in single frame imgs = [img.transpose(2, 0, 1) for img in results['img']] imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) results['img'] = DC(to_tensor(imgs), stack=True) else: img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) results['img'] = DC(to_tensor(img), stack=True) if 'semantic_mask' in results: #results['semantic_mask'] = DC(to_tensor(results['semantic_mask']), stack=True) if isinstance(results['semantic_mask'], np.ndarray): results['semantic_mask'] = DC(to_tensor(results['semantic_mask']), stack=True, pad_dims=None) else: assert isinstance(results['semantic_mask'], list) results['semantic_mask'] = DC(results['semantic_mask'], stack=False) if 'vectors' in results: # vectors may have different sizes vectors = results['vectors'] results['vectors'] = DC(vectors, stack=False, cpu_only=True) if 'polys' in results: results['polys'] = DC(results['polys'], stack=False, cpu_only=True) return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(process_img={self.process_img}, ' return repr_str ================================================ FILE: plugin/datasets/pipelines/loading.py ================================================ import mmcv import numpy as np from mmdet.datasets.builder import PIPELINES @PIPELINES.register_module(force=True) class LoadMultiViewImagesFromFiles(object): """Load multi channel images from a list of separate channel files. Expects results['img_filename'] to be a list of filenames. Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. 
""" def __init__(self, to_float32=False, color_type='unchanged'): self.to_float32 = to_float32 self.color_type = color_type def __call__(self, results): """Call function to load multi-view image from files. Args: results (dict): Result dict containing multi-view image filenames. Returns: dict: The result dict containing the multi-view image data. \ Added keys and values are described below. - filename (str): Multi-view image filenames. - img (np.ndarray): Multi-view image arrays. - img_shape (tuple[int]): Shape of multi-view image arrays. - ori_shape (tuple[int]): Shape of original image arrays. - pad_shape (tuple[int]): Shape of padded image arrays. - scale_factor (float): Scale factor. - img_norm_cfg (dict): Normalization configuration of images. """ filename = results['img_filenames'] img = [mmcv.imread(name, self.color_type) for name in filename] if self.to_float32: img = [i.astype(np.float32) for i in img] results['img'] = img results['img_shape'] = [i.shape for i in img] results['ori_shape'] = [i.shape for i in img] # Set initial values for default meta_keys results['pad_shape'] = [i.shape for i in img] # results['scale_factor'] = 1.0 num_channels = 1 if len(img[0].shape) < 3 else img[0].shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) results['img_fields'] = ['img'] return results def __repr__(self): """str: Return a string that describes the module.""" return f'{self.__class__.__name__} (to_float32={self.to_float32}, '\ f"color_type='{self.color_type}')" ================================================ FILE: plugin/datasets/pipelines/rasterize.py ================================================ import numpy as np from mmdet.datasets.builder import PIPELINES from shapely.geometry import LineString, Polygon from shapely import affinity import cv2 from PIL import Image, ImageDraw from numpy.typing import NDArray from typing import List, Tuple, Union, Dict import torch import pdb @PIPELINES.register_module(force=True) class RasterizeMap(object): """Generate rasterized semantic map and put into `semantic_mask` key. Args: roi_size (tuple or list): bev range canvas_size (tuple or list): bev feature size thickness (int): thickness of rasterized lines coords_dim (int): dimension of point coordinates """ def __init__(self, roi_size: Union[Tuple, List], canvas_size: Union[Tuple, List], thickness: int, coords_dim: int, semantic_mask=False, ): self.roi_size = roi_size self.canvas_size = canvas_size self.scale_x = self.canvas_size[0] / self.roi_size[0] self.scale_y = self.canvas_size[1] / self.roi_size[1] self.thickness = thickness self.coords_dim = coords_dim self.semantic_mask = semantic_mask def line_ego_to_mask(self, line_ego: LineString, mask: NDArray, color: int=1, thickness: int=3, fill_poly=False ) -> None: # """Rasterize a single line to mask. 
# Args: # line_ego (LineString): line # mask (array): semantic mask to paint on # color (int): positive label, default: 1 # thickness (int): thickness of rasterized lines, default: 3 # """ trans_x = self.canvas_size[0] / 2 trans_y = self.canvas_size[1] / 2 line_ego = affinity.scale(line_ego, self.scale_x, self.scale_y, origin=(0, 0)) line_ego = affinity.affine_transform(line_ego, [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) coords = np.array(list(line_ego.coords), dtype=np.int32)[:, :2] coords = coords.reshape((-1, 2)) assert len(coords) >= 2 if fill_poly: cv2.fillPoly(mask, np.int32([coords]), color=color) else: cv2.polylines(mask, np.int32([coords]), False, color=color, thickness=thickness) def polygons_ego_to_mask(self, polygons: List[Polygon], color: int=1) -> NDArray: # ''' Rasterize a polygon to mask. # Args: # polygons (list): list of polygons # color (int): positive label, default: 1 # Returns: # mask (array): mask with rasterize polygons # ''' #mask = Image.new("L", size=(self.canvas_size[0], self.canvas_size[1]), color=0) # Image lib api expect size as (w, h) trans_x = self.canvas_size[0] / 2 trans_y = self.canvas_size[1] / 2 masks = [] for polygon in polygons: mask = Image.new("L", size=(self.canvas_size[0], self.canvas_size[1]), color=0) polygon = affinity.scale(polygon, self.scale_x, self.scale_y, origin=(0, 0)) polygon = affinity.affine_transform(polygon, [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) ext = np.array(polygon.exterior.coords)[:, :2] vert_list = [(x, y) for x, y in ext] ImageDraw.Draw(mask).polygon(vert_list, outline=1, fill=color) masks.append(mask) #return np.array(mask, np.uint8) return masks def get_semantic_mask(self, map_geoms: Dict) -> NDArray: # ''' Rasterize all map geometries to semantic mask. # Args: # map_geoms (dict): map geoms by class # Returns: # semantic_mask (array): semantic mask # ''' num_classes = len(map_geoms) if self.semantic_mask: semantic_mask = np.zeros((num_classes, self.canvas_size[1], self.canvas_size[0]), dtype=np.uint8) else: instance_masks = [] for label, geom_list in map_geoms.items(): if len(geom_list) == 0: continue if geom_list[0].geom_type == 'LineString': for line in geom_list: if self.semantic_mask: fill_poly = True if label == 0 else False self.line_ego_to_mask(line, semantic_mask[label], color=1, thickness=self.thickness, fill_poly=fill_poly) else: canvas = np.zeros((self.canvas_size[1], self.canvas_size[0]), dtype=np.uint8) self.line_ego_to_mask(line, canvas, color=1, thickness=self.thickness, fill_poly=False) instance_masks.append([canvas, label]) elif geom_list[0].geom_type == 'Polygon': # drivable area polygons = [] for polygon in geom_list: polygons.append(polygon) if self.semantic_mask: semantic_mask[label] = self.polygons_ego_to_mask(polygons, color=1) else: polygon_masks = self.polygons_ego_to_mask(polygons, color=1) for mask in polygon_masks: instance_masks.append([mask, label]) else: raise ValueError('map geoms must be either LineString or Polygon!') if self.semantic_mask: semantic_mask = np.ascontiguousarray(semantic_mask) return semantic_mask else: return instance_masks def __call__(self, input_dict: Dict) -> Dict: map_geoms = input_dict['map_geoms'] # {0: List[ped_crossing: LineString], 1: ...} semantic_mask = self.get_semantic_mask(map_geoms) input_dict['semantic_mask'] = semantic_mask # (num_class, canvas_size[1], canvas_size[0]) return input_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(roi_size={self.roi_size}, ' repr_str += f'canvas_size={self.canvas_size}), ' repr_str += 
f'thickness={self.thickness}), ' repr_str += f'coords_dim={self.coords_dim})' return repr_str @PIPELINES.register_module(force=True) class PV_Map(object): """Generate rasterized semantic map and put into `semantic_mask` key. Args: roi_size (tuple or list): bev range canvas_size (tuple or list): bev feature size thickness (int): thickness of rasterized lines coords_dim (int): dimension of point coordinates """ def __init__(self, img_shape: Union[Tuple, List], feat_down_sample: int, thickness: int, coords_dim: int, pv_mask=False, num_cams=6, num_coords=2 ): self.num_cams = num_cams self.num_coords = num_coords self.img_shape = img_shape self.feat_down_sample = feat_down_sample self.pv_scale_x = self.img_shape[0] // feat_down_sample self.pv_scale_y = self.img_shape[1] // feat_down_sample self.thickness = thickness self.coords_dim = coords_dim self.pv_mask = pv_mask def perspective(self,cam_coords, proj_mat): pix_coords = proj_mat @ cam_coords valid_idx = pix_coords[2, :] > 0 pix_coords = pix_coords[:, valid_idx] pix_coords = pix_coords[:2, :] / (pix_coords[2, :] + 1e-7) pix_coords = pix_coords.transpose(1, 0) return pix_coords @staticmethod def get_valid_pix_coords(pix_coords): valid_idx = pix_coords[:, 2] > 0 pix_coords = pix_coords[valid_idx, :] pix_coords = pix_coords[:, :2] / (pix_coords[:, 2:3] + 1e-7) return pix_coords def line_ego_to_pvmask(self, line_ego, mask, lidar2feat, color=1, thickness=1): distances = np.linspace(0, line_ego.length, 200) coords = np.array([np.array(line_ego.interpolate(distance).coords) for distance in distances]).reshape(-1, self.num_coords) if coords.shape[1] == 2: coords = np.concatenate((coords,np.zeros((coords.shape[0],1))),axis=1) pts_num = coords.shape[0] ones = np.ones((pts_num,1)) lidar_coords = np.concatenate([coords,ones], axis=1).transpose(1,0) pix_coords = self.perspective(lidar_coords, lidar2feat) // self.feat_down_sample cv2.polylines(mask, np.int32([pix_coords]), False, color=color, thickness=thickness) def lines_ego_to_pv(self, lines_ego, pv_mask, ego2imgs, color=1, thickness=1): lines_coord = [] for line_ego in lines_ego: distances = np.linspace(0, line_ego.length, 100) coords = np.array([np.array(line_ego.interpolate(distance).coords) for distance in distances]).reshape(-1, self.num_coords) if coords.shape[1] == 2: coords = np.concatenate((coords,np.zeros((coords.shape[0],1))),axis=1) pts_num = coords.shape[0] ones = np.ones((pts_num,1)) lidar_coords = np.concatenate([coords,ones], axis=1) lines_coord.append(lidar_coords) lines_coord = torch.tensor(np.stack(lines_coord, axis=0)) for cam_idx in range(len(ego2imgs)): ego2img_i = torch.tensor(ego2imgs[cam_idx]) pers_lines_coord = torch.einsum('lk,ijk->ijl', ego2img_i, lines_coord) valid_lines_coord = [self.get_valid_pix_coords(pers_coord) for pers_coord in pers_lines_coord] valid_lines_coord = [x // self.feat_down_sample for x in valid_lines_coord if len(x) > 0] lines_to_draw = [x.numpy().astype(np.int32) for x in valid_lines_coord] cv2.polylines(pv_mask[cam_idx], lines_to_draw, False, color=color, thickness=thickness) def get_pvmask_old(self,map_geoms: Dict,ego2img: List, img_filenames: List) -> NDArray: # ''' Rasterize all map geometries to semantic mask. 
# Args: # map_geoms (dict): map geoms by class # Returns: # semantic_mask (array): semantic mask # ''' num_classes = len(map_geoms) if self.pv_mask: gt_pv_mask = np.zeros((self.num_cams, num_classes, self.pv_scale_x, self.pv_scale_y), dtype=np.uint8) else: instance_masks = [] for label, geom_list in map_geoms.items(): if len(geom_list) == 0: continue if geom_list[0].geom_type == 'LineString': for line in geom_list: for cam_index in range(self.num_cams): self.line_ego_to_pvmask(line,gt_pv_mask[cam_index][label],ego2img[cam_index],color=1,thickness=self.thickness) if self.pv_mask: gt_pv_mask = np.ascontiguousarray(gt_pv_mask) ## Visualize to double-check the pv seg is correct #self.visualize_all_pv_masks(gt_pv_mask, img_filenames) #import pdb; pdb.set_trace() return gt_pv_mask else: return instance_masks def get_pvmask(self, map_geoms: Dict,ego2img: List, img_filenames: List) -> NDArray: # ''' Rasterize all map geometries to semantic mask. # Args: # map_geoms (dict): map geoms by class # Returns: # semantic_mask (array): semantic mask # ''' num_classes = len(map_geoms) if self.pv_mask: gt_pv_mask = np.zeros((num_classes, self.num_cams, self.pv_scale_x, self.pv_scale_y), dtype=np.uint8) else: instance_masks = [] for label, geom_list in map_geoms.items(): if len(geom_list) == 0: continue self.lines_ego_to_pv(geom_list, gt_pv_mask[label], ego2img, color=1, thickness=self.thickness) gt_pv_mask = gt_pv_mask.transpose(1, 0, 2, 3) if self.pv_mask: gt_pv_mask = np.ascontiguousarray(gt_pv_mask) ## Visualize to double-check the pv seg is correct #self.visualize_all_pv_masks(gt_pv_mask, img_filenames) #import pdb; pdb.set_trace() return gt_pv_mask else: return instance_masks def __call__(self, input_dict: Dict) -> Dict: map_geoms = input_dict['map_geoms'] # {0: List[ped_crossing: LineString], 1: ...} pv_mask = self.get_pvmask(map_geoms, input_dict['ego2img'], input_dict['img_filenames']) input_dict['pv_mask'] = pv_mask # (num_class, canvas_size[1], canvas_size[0]) return input_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(roi_size={self.roi_size}, ' repr_str += f'canvas_size={self.canvas_size}), ' repr_str += f'thickness={self.thickness}), ' repr_str += f'coords_dim={self.coords_dim})' return repr_str def visualize_all_pv_masks(self, gt_pv_mask, img_filenames): for cam_id in range(gt_pv_mask.shape[0]): viz_img = self._visualize_pv_mask(gt_pv_mask[cam_id]) viz_img = viz_img.transpose(1, 2, 0) out_path = './check_pv_seg/viz_{}.png'.format(cam_id) out_raw_path = './check_pv_seg/viz_raw_{}.png'.format(cam_id) filepath = img_filenames[cam_id] pv_img = cv2.imread(filepath) #pv_img = cv2.resize(pv_img, (800, 480)) #viz_mask = cv2.resize(viz_img, (800, 480)) pv_img = cv2.resize(pv_img, (608, 608)) viz_mask = cv2.resize(viz_img, (608, 608)) mask = (viz_mask == 255).all(-1)[..., None] viz_img = pv_img * mask + viz_mask * (1-mask) cv2.imwrite(out_path, viz_img) cv2.imwrite(out_raw_path, pv_img) def _visualize_pv_mask(self, pv_mask): COLOR_MAPS_BGR = { # bgr colors 1: (0, 0, 255), 2: (0, 255, 0), 0: (255, 0, 0), } num_classes, h, w = pv_mask.shape viz_img = np.ones((num_classes, h, w), dtype=np.uint8) * 255 for label in range(num_classes): valid = (pv_mask[label] == 1) viz_img[:, valid] = np.array(COLOR_MAPS_BGR[label]).reshape(3, 1) return viz_img ================================================ FILE: plugin/datasets/pipelines/transform.py ================================================ import numpy as np import mmcv from mmdet.datasets.builder import PIPELINES from numpy import 
random @PIPELINES.register_module(force=True) class Normalize3D(object): """Normalize the image. Added key is "img_norm_cfg". Args: mean (sequence): Mean values of 3 channels. std (sequence): Std values of 3 channels. to_rgb (bool): Whether to convert the image from BGR to RGB, default is true. """ def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): """Call function to normalize images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Normalized results, 'img_norm_cfg' key is added into result dict. """ for key in results.get('img_fields', ['img']): results[key] = [mmcv.imnormalize( img, self.mean, self.std, self.to_rgb) for img in results[key]] results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' return repr_str @PIPELINES.register_module(force=True) class PadMultiViewImages(object): """Pad multi-view images and change intrinsics There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", If set `change_intrinsics=True`, key 'cam_intrinsics' and 'ego2img' will be changed. Args: size (tuple, optional): Fixed padding size, (h, w). size_divisor (int, optional): The divisor of padded size. pad_val (float, optional): Padding value, 0 by default. change_intrinsics (bool): whether to update intrinsics. """ def __init__(self, size=None, size_divisor=None, pad_val=0, change_intrinsics=False): self.size = size self.size_divisor = size_divisor self.pad_val = pad_val # only one of size and size_divisor should be valid assert size is not None or size_divisor is not None assert size is None or size_divisor is None self.change_intrinsics = change_intrinsics def _pad_img(self, results): """Pad images according to ``self.size``.""" original_shape = [img.shape for img in results['img']] for key in results.get('img_fields', ['img']): if self.size is not None: padded_img = [mmcv.impad( img, shape=self.size, pad_val=self.pad_val) for img in results[key]] elif self.size_divisor is not None: padded_img = [mmcv.impad_to_multiple( img, self.size_divisor, pad_val=self.pad_val) for img in results[key]] results[key] = padded_img if self.change_intrinsics: post_intrinsics, post_ego2imgs = [], [] for img, oshape, cam_intrinsic, ego2img in zip(results['img'], \ original_shape, results['cam_intrinsics'], results['ego2img']): scaleW = img.shape[1] / oshape[1] scaleH = img.shape[0] / oshape[0] rot_resize_matrix = np.array([ [scaleW, 0, 0, 0], [0, scaleH, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) post_intrinsic = rot_resize_matrix[:3, :3] @ cam_intrinsic post_ego2img = rot_resize_matrix @ ego2img post_intrinsics.append(post_intrinsic) post_ego2imgs.append(post_ego2img) results.update({ 'cam_intrinsics': post_intrinsics, 'ego2img': post_ego2imgs, }) results['img_shape'] = [img.shape for img in padded_img] results['img_fixed_size'] = self.size results['img_size_divisor'] = self.size_divisor def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. 
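# --- Illustrative aside (hypothetical numbers, not part of this file): why
# `change_intrinsics` above rescales the camera matrix. If padding/resizing
# scales image widths by sW and heights by sH, the pinhole intrinsics K must
# be scaled the same way for projection to remain consistent.
import numpy as np

K = np.array([[1266.4, 0.0, 816.3],
              [0.0, 1266.4, 491.5],
              [0.0, 0.0, 1.0]])          # nuScenes-like intrinsics (illustrative)
sW, sH = 0.5, 0.5                        # e.g. 1600x900 -> 800x450
K_rescaled = np.diag([sW, sH, 1.0]) @ K  # focal lengths and principal point halve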
""" self._pad_img(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, ' repr_str += f'size_divisor={self.size_divisor}, ' repr_str += f'pad_val={self.pad_val})' repr_str += f'change_intrinsics={self.change_intrinsics})' return repr_str @PIPELINES.register_module(force=True) class ResizeMultiViewImages(object): """Resize mulit-view images and change intrinsics If set `change_intrinsics=True`, key 'cam_intrinsics' and 'ego2img' will be changed Args: size (tuple, optional): resize target size, (h, w). change_intrinsics (bool): whether to update intrinsics. """ def __init__(self, size=None, scale=None, change_intrinsics=True): self.size = size self.scale = scale assert size is None or scale is None self.change_intrinsics = change_intrinsics def __call__(self, results:dict): new_imgs, post_intrinsics, post_ego2imgs = [], [], [] for img, cam_intrinsic, ego2img in zip(results['img'], \ results['cam_intrinsics'], results['ego2img']): if self.scale is not None: h, w = img.shape[:2] target_h = int(h * self.scale) target_w = int(w * self.scale) else: target_h = self.size[0] target_w = self.size[1] tmp, scaleW, scaleH = mmcv.imresize(img, # NOTE: mmcv.imresize expect (w, h) shape (target_w, target_h), return_scale=True) new_imgs.append(tmp) rot_resize_matrix = np.array([ [scaleW, 0, 0, 0], [0, scaleH, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) post_intrinsic = rot_resize_matrix[:3, :3] @ cam_intrinsic post_ego2img = rot_resize_matrix @ ego2img post_intrinsics.append(post_intrinsic) post_ego2imgs.append(post_ego2img) results['img'] = new_imgs results['img_shape'] = [img.shape for img in new_imgs] if self.change_intrinsics: results.update({ 'cam_intrinsics': post_intrinsics, 'ego2img': post_ego2imgs, }) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, ' repr_str += f'change_intrinsics={self.change_intrinsics})' return repr_str @PIPELINES.register_module() class PhotoMetricDistortionMultiViewImage: """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def __call__(self, results): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. 
""" imgs = results['img'] new_imgs = [] for img in imgs: assert img.dtype == np.float32, \ 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ ' please set "to_float32=True" in "LoadImageFromFile" pipeline' # random brightness if random.randint(2): delta = random.uniform(-self.brightness_delta, self.brightness_delta) img += delta # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = random.randint(2) if mode == 1: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # convert color from BGR to HSV img = mmcv.bgr2hsv(img) # random saturation if random.randint(2): img[..., 1] *= random.uniform(self.saturation_lower, self.saturation_upper) # random hue if random.randint(2): img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = mmcv.hsv2bgr(img) # random contrast if mode == 0: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # randomly swap channels # if random.randint(2): # img = img[..., random.permutation(3)] new_imgs.append(img) results['img'] = new_imgs return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' repr_str += 'contrast_range=' repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' repr_str += 'saturation_range=' repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' repr_str += f'hue_delta={self.hue_delta})' return repr_str ================================================ FILE: plugin/datasets/pipelines/vectorize.py ================================================ import numpy as np from mmdet.datasets.builder import PIPELINES from shapely.geometry import LineString from numpy.typing import NDArray from typing import List, Tuple, Union, Dict @PIPELINES.register_module(force=True) class VectorizeMap(object): """Generate vectoized map and put into `semantic_mask` key. Concretely, shapely geometry objects are converted into sample points (ndarray). We use args `sample_num`, `sample_dist`, `simplify` to specify sampling method. Args: roi_size (tuple or list): bev range . normalize (bool): whether to normalize points to range (0, 1). coords_dim (int): dimension of point coordinates. simplify (bool): whether to use simpily function. If true, `sample_num` \ and `sample_dist` will be ignored. sample_num (int): number of points to interpolate from a polyline. Set to -1 to ignore. sample_dist (float): interpolate distance. Set to -1 to ignore. """ def __init__(self, roi_size: Union[Tuple, List], normalize: bool, coords_dim: int, simplify: bool=False, sample_num: int=-1, sample_dist: float=-1, permute: bool=False ): self.coords_dim = coords_dim self.sample_num = sample_num self.sample_dist = sample_dist self.roi_size = np.array(roi_size) self.normalize = normalize self.simplify = simplify self.permute = permute if sample_dist > 0: assert sample_num < 0 and not simplify self.sample_fn = self.interp_fixed_dist elif sample_num > 0: assert sample_dist < 0 and not simplify self.sample_fn = self.interp_fixed_num else: assert simplify def interp_fixed_num(self, line: LineString) -> NDArray: ''' Interpolate a line to fixed number of points. 
Args: line (LineString): line Returns: points (array): interpolated points, shape (N, 2) ''' distances = np.linspace(0, line.length, self.sample_num) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def interp_fixed_dist(self, line: LineString) -> NDArray: ''' Interpolate a line at fixed interval. Args: line (LineString): line Returns: points (array): interpolated points, shape (N, 2) ''' distances = list(np.arange(self.sample_dist, line.length, self.sample_dist)) # make sure to sample at least two points when sample_dist > line.length distances = [0,] + distances + [line.length,] sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def get_vectorized_lines(self, map_geoms: Dict) -> Dict: ''' Vectorize map elements. Iterate over the input dict and apply the specified sample funcion. Args: line (LineString): line Returns: vectors (array): dict of vectorized map elements. ''' vectors = {} for label, geom_list in map_geoms.items(): vectors[label] = [] for geom in geom_list: if geom.geom_type == 'LineString': if self.simplify: line = geom.simplify(0.2, preserve_topology=True) line = np.array(line.coords) else: line = self.sample_fn(geom) line = line[:, :self.coords_dim] if self.normalize: line = self.normalize_line(line) if self.permute: line = self.permute_line(line) vectors[label].append(line) elif geom.geom_type == 'Polygon': # polygon objects will not be vectorized continue else: raise ValueError('map geoms must be either LineString or Polygon!') return vectors def normalize_line(self, line: NDArray) -> NDArray: ''' Convert points to range (0, 1). Args: line (LineString): line Returns: normalized (array): normalized points. 
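# --- Illustrative aside (not part of the pipeline): the fixed-number sampling
# used by `interp_fixed_num` above resamples a polyline into `sample_num`
# points evenly spaced along its arc length via shapely's interpolate().
import numpy as np
from shapely.geometry import LineString

line = LineString([(0, 0), (4, 0), (4, 3)])    # total length 7
distances = np.linspace(0, line.length, 8)
points = np.array([list(line.interpolate(d).coords)[0] for d in distances])
# points.shape == (8, 2); first point is (0, 0), last point is (4, 3)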
''' origin = -np.array([self.roi_size[0]/2, self.roi_size[1]/2]) line[:, :2] = line[:, :2] - origin # transform from range [0, 1] to (0, 1) eps = 1e-5 line[:, :2] = line[:, :2] / (self.roi_size + eps) return line def permute_line(self, line: np.ndarray, padding=1e5): ''' (num_pts, 2) -> (num_permute, num_pts, 2) where num_permute = 2 * (num_pts - 1) ''' is_closed = np.allclose(line[0], line[-1], atol=1e-3) num_points = len(line) permute_num = num_points - 1 permute_lines_list = [] if is_closed: pts_to_permute = line[:-1, :] # throw away replicate start end pts for shift_i in range(permute_num): permute_lines_list.append(np.roll(pts_to_permute, shift_i, axis=0)) flip_pts_to_permute = np.flip(pts_to_permute, axis=0) for shift_i in range(permute_num): permute_lines_list.append(np.roll(flip_pts_to_permute, shift_i, axis=0)) else: permute_lines_list.append(line) permute_lines_list.append(np.flip(line, axis=0)) permute_lines_array = np.stack(permute_lines_list, axis=0) if is_closed: tmp = np.zeros((permute_num * 2, num_points, self.coords_dim)) tmp[:, :-1, :] = permute_lines_array tmp[:, -1, :] = permute_lines_array[:, 0, :] # add replicate start end pts permute_lines_array = tmp else: # padding padding = np.full([permute_num * 2 - 2, num_points, self.coords_dim], padding) permute_lines_array = np.concatenate((permute_lines_array, padding), axis=0) return permute_lines_array def __call__(self, input_dict): map_geoms = input_dict['map_geoms'] input_dict['vectors'] = self.get_vectorized_lines(map_geoms) return input_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(simplify={self.simplify}, ' repr_str += f'sample_num={self.sample_num}), ' repr_str += f'sample_dist={self.sample_dist}), ' repr_str += f'roi_size={self.roi_size})' repr_str += f'normalize={self.normalize})' repr_str += f'coords_dim={self.coords_dim})' return repr_str ================================================ FILE: plugin/datasets/samplers/__init__.py ================================================ from .group_sampler import DistributedGroupSampler, InfiniteGroupEachSampleInBatchSampler from .distributed_sampler import DistributedSampler from .sampler import SAMPLER, build_sampler ================================================ FILE: plugin/datasets/samplers/distributed_sampler.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import math import torch from torch.utils.data import DistributedSampler as _DistributedSampler from .sampler import SAMPLER import numpy as np @SAMPLER.register_module() class DistributedSampler(_DistributedSampler): def __init__(self, dataset=None, num_replicas=None, rank=None, shuffle=True, seed=0): super().__init__( dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) # for the compatibility from PyTorch 1.3+ self.seed = seed if seed is not None else 0 self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.groups_num = len(self.group_sizes) self.groups = list(set(self.flag)) assert self.groups == list(range(self.groups_num)) # Now, for efficiency, make a dict {group_idx: List[dataset sample_idxs]} self.group_idx_to_sample_idxs = { group_idx: np.where(self.flag == group_idx)[0].tolist() for group_idx in range(self.groups_num)} num_groups_per_gpu = math.ceil(len(self.groups) / self.num_replicas) # assign groups (continuous videos) to each gpu rank # self.sample_group_idx = self.groups[self.rank*num_groups_per_gpu: min(len(self.groups), (self.rank+1)*num_groups_per_gpu)] self.sample_group_idx = self.groups[self.rank::self.num_replicas] self.sample_idxs = [] for i in self.sample_group_idx: self.sample_idxs.extend(self.group_idx_to_sample_idxs[i]) #print('Rank', rank, 'Num samples', len(self.sample_idxs), 'Samples', self.sample_idxs) self.num_samples = len(self.sample_idxs) self.total_size = len(self.dataset) def __iter__(self): # only used for validation/testing # only support batchsize = 1 if self.shuffle: assert False # else: # indices = torch.arange(len(self.dataset)).tolist() # # add extra samples to make it evenly divisible # # in case that indices is shorter than half of total_size # indices = (indices * # math.ceil(self.total_size / len(indices)))[:self.total_size] # assert len(indices) == self.total_size # # subsample # per_replicas = self.total_size//self.num_replicas # # indices = indices[self.rank:self.total_size:self.num_replicas] # indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas] # assert len(indices) == self.num_samples return iter(self.sample_idxs) ================================================ FILE: plugin/datasets/samplers/group_sampler.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import math import itertools import copy import torch.distributed as dist import numpy as np import torch from mmcv.runner import get_dist_info from torch.utils.data import Sampler from .sampler import SAMPLER import random class GroupSampler(Sampler): def __init__(self, dataset, samples_per_gpu=1): assert hasattr(dataset, 'flag') self.dataset = dataset self.samples_per_gpu = samples_per_gpu self.flag = dataset.flag.astype(np.int64) self.group_sizes = np.bincount(self.flag) self.num_samples = 0 for i, size in enumerate(self.group_sizes): self.num_samples += int(np.ceil( size / self.samples_per_gpu)) * self.samples_per_gpu print('Warning!!! 
Only used for testing!') def __iter__(self): for i, size in enumerate(self.group_sizes): if size == 0: continue indice = np.where(self.flag == i)[0] assert len(indice) == size yield from indice def __len__(self): return self.num_samples @SAMPLER.register_module() class DistributedGroupSampler(Sampler): """Sampler that restricts data loading to a subset of the dataset. It is especially useful in conjunction with :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each process can pass a DistributedSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Arguments: dataset: Dataset used for sampling. num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. seed (int, optional): random seed used to shuffle the sampler if ``shuffle=True``. This number should be identical across all processes in the distributed group. Default: 0. """ def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None, seed=0): _rank, _num_replicas = get_dist_info() if num_replicas is None: num_replicas = _num_replicas if rank is None: rank = _rank self.dataset = dataset self.samples_per_gpu = samples_per_gpu self.num_replicas = num_replicas self.rank = rank self.epoch = 0 self.seed = seed if seed is not None else 0 assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.num_samples = 0 for i, j in enumerate(self.group_sizes): self.num_samples += int( math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / self.num_replicas)) * self.samples_per_gpu self.total_size = self.num_samples * self.num_replicas def __iter__(self): # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch + self.seed) indices = [] for i, size in enumerate(self.group_sizes): if size > 0: indice = np.where(self.flag == i)[0] assert len(indice) == size # add .numpy() to avoid bug when selecting indice in parrots. # TODO: check whether torch.randperm() can be replaced by # numpy.random.permutation(). indice = indice[list( torch.randperm(int(size), generator=g).numpy())].tolist() extra = int( math.ceil( size * 1.0 / self.samples_per_gpu / self.num_replicas) ) * self.samples_per_gpu * self.num_replicas - len(indice) # pad indice tmp = indice.copy() for _ in range(extra // size): indice.extend(tmp) indice.extend(tmp[:extra % size]) indices.extend(indice) assert len(indices) == self.total_size indices = [ indices[j] for i in list( torch.randperm( len(indices) // self.samples_per_gpu, generator=g)) for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu) ] # subsample offset = self.num_samples * self.rank indices = indices[offset:offset + self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def set_epoch(self, epoch): self.epoch = epoch def sync_random_seed(seed=None, device='cuda'): """Make sure different ranks share the same seed. All workers must call this function, otherwise it will deadlock. This method is generally used in `DistributedSampler`, because the seed should be identical across all processes in the distributed group. In distributed sampling, different ranks should sample non-overlapped data in the dataset. Therefore, this function is used to make sure that each rank shuffles the data indices in the same order based on the same seed. 
Then different ranks could use different indices to select non-overlapped data from the same data list. Args: seed (int, Optional): The seed. Default to None. device (str): The device where the seed will be put on. Default to 'cuda'. Returns: int: Seed to be used. """ if seed is None: seed = np.random.randint(2**31) assert isinstance(seed, int) rank, num_replicas = get_dist_info() if num_replicas == 1: return seed if rank == 0: random_num = torch.tensor(seed, dtype=torch.int32, device=device) else: random_num = torch.tensor(0, dtype=torch.int32, device=device) dist.broadcast(random_num, src=0) return random_num.item() @SAMPLER.register_module() class InfiniteGroupEachSampleInBatchSampler(Sampler): """ Pardon this horrendous name. Basically, we want every sample to be from its own group. If batch size is 4 and # of GPUs is 8, each sample of these 32 should be operating on its own group. Shuffling is only done for group order, not done within groups. Arguments: dataset: Dataset used for sampling. min_len: Minimum sequence sampling length max_len: Maximum sequence sampling length num_iters_to_seq: After `num_iters_to_seq` iterations, start sequential sampling. Default: 0 samples_per_gpu (optional): Per gpu batchsize. Default: 1 num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. seed (int, optional): random seed used to shuffle the sampler if ``shuffle=True``. This number should be identical across all processes in the distributed group. Default: 0. """ def __init__(self, dataset, seq_split_num=-1, num_iters_to_seq=0, random_drop=0, samples_per_gpu=1, num_replicas=None, rank=None, seed=0): _rank, _num_replicas = get_dist_info() if num_replicas is None: num_replicas = _num_replicas if rank is None: rank = _rank self.dataset = dataset self.batch_size = samples_per_gpu self.num_replicas = num_replicas self.rank = rank self.seq_split_num = seq_split_num self.sub_seq_generator = torch.Generator() self.sub_seq_generator.manual_seed(self.rank + seed) self.seed = sync_random_seed(seed) self.random_drop = random_drop self.size = len(self.dataset) self._iters = 0 self.num_iters_to_seq = num_iters_to_seq assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.groups_num = len(self.group_sizes) self.global_batch_size = samples_per_gpu * num_replicas assert self.groups_num >= self.global_batch_size # Now, for efficiency, make a dict {group_idx: List[dataset sample_idxs]} self.group_idx_to_sample_idxs = { group_idx: np.where(self.flag == group_idx)[0].tolist() for group_idx in range(self.groups_num)} self.group_idx_to_sample_idxs_generator = { group_idx: self._sample_sub_sequence(group_idx) for group_idx in range(self.groups_num) } # Get a generator per sample idx. 
Considering samples over all # GPUs, each sample position has its own generator self.group_indices_per_global_sample_idx = [ self._group_indices_per_global_sample_idx(self.rank * self.batch_size + local_sample_idx) for local_sample_idx in range(self.batch_size)] # Keep track of a buffer of dataset sample idxs for each local sample idx self.buffer_per_local_sample = [[] for _ in range(self.batch_size)] def _infinite_group_indices(self): g = torch.Generator() g.manual_seed(self.seed) while True: yield from torch.randperm(self.groups_num, generator=g).tolist() def _group_indices_per_global_sample_idx(self, global_sample_idx): yield from itertools.islice(self._infinite_group_indices(), global_sample_idx, None, self.global_batch_size) def _sample_sub_sequence(self, group_idx): '''randomly split sub-sequences in a whole sequence''' sample_ids = self.group_idx_to_sample_idxs[group_idx] while True: if self._iters < self.num_iters_to_seq or self.seq_split_num == -1: shuffled = torch.randperm(len(sample_ids), generator=self.sub_seq_generator).tolist() yield from [[sample_ids[i]] for i in shuffled] else: # split the sequence into parts idx = torch.randperm(len(sample_ids), generator=self.sub_seq_generator).tolist() idx.remove(0) idx = sorted(idx[:self.seq_split_num - 1]) # choose n-1 split position split_idx = [0] + idx + [len(sample_ids)] sub_seq_idx = [sample_ids[split_idx[i]: split_idx[i + 1]] for i in range(len(split_idx) - 1)] # [[1,2,3], [4,5], ...] shuffled = torch.randperm(len(sub_seq_idx), generator=self.sub_seq_generator).tolist() for i in shuffled: sub_seq = sub_seq_idx[i] length = len(sub_seq) drop_num = math.floor(length * self.random_drop) drop_idxs = torch.randperm(length, generator=self.sub_seq_generator).tolist()[:drop_num] new_sub_seq = [sub_seq[j] for j in range(length) if j not in drop_idxs] yield new_sub_seq # yield from [sub_seq_idx[i] for i in shuffled] def __iter__(self): last_group_idx_batch = [-1 for i in range(self.batch_size)] while True: curr_batch = [] for local_sample_idx in range(self.batch_size): if len(self.buffer_per_local_sample[local_sample_idx]) == 0: # Finished current group, refill with next group new_group_idx = next(self.group_indices_per_global_sample_idx[local_sample_idx]) # make sure the same sequence is not picked twice in a row # without this check, two consecutive segments of the same sequence may be drawn when the epoch wraps around if new_group_idx == last_group_idx_batch[local_sample_idx]: new_group_idx = next(self.group_indices_per_global_sample_idx[local_sample_idx]) last_group_idx_batch[local_sample_idx] = new_group_idx self.buffer_per_local_sample[local_sample_idx] = \ copy.deepcopy(next(self.group_idx_to_sample_idxs_generator[new_group_idx])) curr_batch.append(self.buffer_per_local_sample[local_sample_idx].pop(0)) self._iters += 1 yield curr_batch def __len__(self): """Length of base dataset.""" return self.size def set_epoch(self, epoch): self.epoch = epoch ================================================ FILE: plugin/datasets/samplers/sampler.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved.
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- from mmcv.utils.registry import Registry, build_from_cfg SAMPLER = Registry('sampler') def build_sampler(cfg, default_args): return build_from_cfg(cfg, SAMPLER, default_args) ================================================ FILE: plugin/datasets/visualize/renderer.py ================================================ import os.path as osp import os import av2.geometry.interpolate as interp_utils import numpy as np import copy import cv2 import matplotlib import matplotlib.pyplot as plt from PIL import Image matplotlib.use('agg') # prevent memory leak for drawing figures in a loop def remove_nan_values(uv): is_u_valid = np.logical_not(np.isnan(uv[:, 0])) is_v_valid = np.logical_not(np.isnan(uv[:, 1])) is_uv_valid = np.logical_and(is_u_valid, is_v_valid) uv_valid = uv[is_uv_valid] return uv_valid def points_ego2img(pts_ego, extrinsics, intrinsics): pts_ego_4d = np.concatenate([pts_ego, np.ones([len(pts_ego), 1])], axis=-1) pts_cam_4d = extrinsics @ pts_ego_4d.T uv = (intrinsics @ pts_cam_4d[:3, :]).T uv = remove_nan_values(uv) depth = uv[:, 2] uv = uv[:, :2] / uv[:, 2].reshape(-1, 1) return uv, depth def draw_polyline_ego_on_img(polyline_ego, img_bgr, extrinsics, intrinsics, color_bgr, thickness): if polyline_ego.shape[1] == 2: zeros = np.zeros((polyline_ego.shape[0], 1)) polyline_ego = np.concatenate([polyline_ego, zeros], axis=1) polyline_ego = interp_utils.interp_arc(t=500, points=polyline_ego) uv, depth = points_ego2img(polyline_ego, extrinsics, intrinsics) h, w, c = img_bgr.shape is_valid_x = np.logical_and(0 <= uv[:, 0], uv[:, 0] < w - 1) is_valid_y = np.logical_and(0 <= uv[:, 1], uv[:, 1] < h - 1) is_valid_z = depth > 0 is_valid_points = np.logical_and.reduce([is_valid_x, is_valid_y, is_valid_z]) if is_valid_points.sum() == 0: return uv = np.round(uv[is_valid_points]).astype(np.int32) draw_visible_polyline_cv2( copy.deepcopy(uv), valid_pts_bool=np.ones((len(uv), 1), dtype=bool), image=img_bgr, color=color_bgr, thickness_px=thickness, ) def draw_visible_polyline_cv2(line, valid_pts_bool, image, color, thickness_px): """Draw a polyline onto an image using given line segments. Args: line: Array of shape (K, 2) representing the coordinates of line. valid_pts_bool: Array of shape (K,) representing which polyline coordinates are valid for rendering. For example, if the coordinate is occluded, a user might specify that it is invalid. Line segments touching an invalid vertex will not be rendered. image: Array of shape (H, W, 3), representing a 3-channel BGR image color: Tuple of shape (3,) with a BGR format color thickness_px: thickness (in pixels) to use when rendering the polyline. 
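# --- Illustrative aside (made-up camera, not part of this file): the
# projection performed by `points_ego2img` above. Ego points are lifted to
# homogeneous coordinates, mapped to the camera frame with a 4x4 extrinsic,
# multiplied by the 3x3 intrinsics, and divided by depth to get pixels.
import numpy as np

pts_ego = np.array([[1.0, 0.5, 10.0]])         # one point, 10 m along the optical axis
extrinsics = np.eye(4)                         # assume ego frame == camera frame
intrinsics = np.array([[1000.0, 0.0, 800.0],
                       [0.0, 1000.0, 450.0],
                       [0.0, 0.0, 1.0]])

pts_h = np.concatenate([pts_ego, np.ones((len(pts_ego), 1))], axis=-1)
pts_cam = (extrinsics @ pts_h.T)[:3, :]
uvw = (intrinsics @ pts_cam).T
depth = uvw[:, 2]                              # -> [10.]
uv = uvw[:, :2] / uvw[:, 2:3]                  # -> [[900., 500.]]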
""" line = np.round(line).astype(int) # type: ignore for i in range(len(line) - 1): if (not valid_pts_bool[i]) or (not valid_pts_bool[i + 1]): continue x1 = line[i][0] y1 = line[i][1] x2 = line[i + 1][0] y2 = line[i + 1][1] # Use anti-aliasing (AA) for curves image = cv2.line(image, pt1=(x1, y1), pt2=(x2, y2), color=color, thickness=thickness_px, lineType=cv2.LINE_AA) COLOR_MAPS_BGR = { # bgr colors 'divider': (0, 0, 255), 'boundary': (0, 255, 0), 'ped_crossing': (255, 0, 0), 'centerline': (51, 183, 255), 'drivable_area': (171, 255, 255) } COLOR_MAPS_PLT = { 'divider': 'r', 'boundary': 'g', 'ped_crossing': 'b', 'centerline': 'orange', 'drivable_area': 'y', } CAM_NAMES_AV2 = ['ring_front_center', 'ring_front_right', 'ring_front_left', 'ring_rear_right','ring_rear_left', 'ring_side_right', 'ring_side_left', ] CAM_NAMES_NUSC = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT',] class Renderer(object): """Render map elements on image views. Args: cat2id (dict): category to class id roi_size (tuple): bev range dataset (str): 'av2' or 'nusc' """ def __init__(self, cat2id, roi_size, dataset='av2'): self.roi_size = roi_size self.cat2id = cat2id self.id2cat = {v: k for k, v in cat2id.items()} if dataset == 'av2': self.cam_names = CAM_NAMES_AV2 else: self.cam_names = CAM_NAMES_NUSC def render_bev_from_vectors(self, vectors, out_dir, draw_scores=False, specified_path=None, id_info=None): '''Render bev segmentation using vectorized map elements. Args: vectors (dict): dict of vectorized map elements. out_dir (str): output directory ''' car_img = Image.open('resources/car.png') #car_img = Image.open('resources/car_lidar_coord.png') if specified_path: map_path = specified_path else: map_path = os.path.join(out_dir, 'map.jpg') fig = plt.figure(figsize=(self.roi_size[0], self.roi_size[1])) ax = fig.add_subplot(1, 1, 1) ax.set_xlim(-self.roi_size[0] / 2, self.roi_size[0] / 2) ax.set_ylim(-self.roi_size[1] / 2, self.roi_size[1] / 2) ax.axis('off') #ax.imshow(car_img, extent=[-2.0, 2.0, -2.5, 2.5]) ax.imshow(car_img, extent=[-2.5, 2.5, -2.0, 2.0]) for label, vector_list in vectors.items(): cat = self.id2cat[label] color = COLOR_MAPS_PLT[cat] for vec_i, vector in enumerate(vector_list): if draw_scores: vector, score, prop = vector if isinstance(vector, list): vector = np.array(vector) from shapely.geometry import LineString vector = np.array(LineString(vector).simplify(0.2).coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) # plt.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1], angles='xy', color=color, # scale_units='xy', scale=1) # for i in range(len(x)): ax.plot(x, y, 'o-', color=color, linewidth=20, markersize=50) if draw_scores: #print('Prop:', prop, 'Label:', label) if prop: p = 'p' else: p = '' score = round(score, 2) mid_idx = len(x) // 2 ax.text(x[mid_idx], y[mid_idx], str(score)+p, fontsize=100, color=color) if id_info: vec_id = id_info[label][vec_i] mid_idx = len(x) // 2 ax.text(x[mid_idx], y[mid_idx], f'{cat[:1].upper()}{vec_id}', fontsize=100, color=color) #plt.savefig(map_path, bbox_inches='tight', dpi=40) fig.savefig(map_path, bbox_inches='tight', dpi=20) plt.clf() # or cla() to simulate use case of plotting fresh figures def render_camera_views_from_vectors(self, vectors, imgs, extrinsics, intrinsics, ego2cams, thickness, out_dir): '''Project vectorized map elements to camera views. Args: vectors (dict): dict of vectorized map elements. imgs (tensor): images in bgr color. 
extrinsics (array): ego2img extrinsics, shape (4, 4) intrinsics (array): intrinsics, shape (3, 3) thickness (int): thickness of lines to draw on images. out_dir (str): output directory ''' for i in range(len(imgs)): img = imgs[i] extrinsic = extrinsics[i] intrinsic = intrinsics[i] ego2cam = ego2cams[i] img_bgr = copy.deepcopy(img) for label, vector_list in vectors.items(): cat = self.id2cat[label] color = COLOR_MAPS_BGR[cat] for vector in vector_list: img_bgr = np.ascontiguousarray(img_bgr) if isinstance(vector, list): vector = np.array(vector) draw_polyline_ego_on_img(vector, img_bgr, ego2cam, intrinsic, color, thickness) out_path = osp.join(out_dir, self.cam_names[i]) + '.jpg' cv2.imwrite(out_path, img_bgr) def render_bev_from_mask(self, semantic_mask, out_dir, flip=False): '''Render bev segmentation from semantic_mask. Args: semantic_mask (array): semantic mask. out_dir (str): output directory ''' if len(semantic_mask.shape) == 3: c, h, w = semantic_mask.shape else: h, w = semantic_mask.shape bev_img = np.ones((3, h, w), dtype=np.uint8) * 255 if 'drivable_area' in self.cat2id: drivable_area_mask = semantic_mask[self.cat2id['drivable_area']] bev_img[:, drivable_area_mask == 1] = \ np.array(COLOR_MAPS_BGR['drivable_area']).reshape(3, 1) for label in self.id2cat: cat = self.id2cat[label] if cat == 'drivable_area': continue if len(semantic_mask.shape) == 3: valid = (semantic_mask[label] == 1) else: valid = semantic_mask == (label + 1) bev_img[:, valid] = np.array(COLOR_MAPS_BGR[cat]).reshape(3, 1) #for label in range(c): # cat = self.id2cat[label] # if cat == 'drivable_area': # continue # mask = semantic_mask[label] # valid = mask == 1 # bev_img[:, valid] = np.array(COLOR_MAPS_BGR[cat]).reshape(3, 1) out_path = osp.join(out_dir, 'semantic_map.jpg') if flip: bev_img_flipud = np.array([np.flipud(i) for i in bev_img], dtype=np.uint8) cv2.imwrite(out_path, bev_img_flipud.transpose((1, 2, 0))) else: cv2.imwrite(out_path, bev_img.transpose((1, 2, 0))) ================================================ FILE: plugin/models/__init__.py ================================================ from .backbones import * from .heads import * from .necks import * from .losses import * from .mapers import * from .transformer_utils import * from .assigner import * from .utils import * ================================================ FILE: plugin/models/assigner/__init__.py ================================================ from .assigner import HungarianLinesAssigner from .match_cost import MapQueriesCost, BBoxLogitsCost, DynamicLinesCost, IoUCostC, BBoxCostC, LinesL1Cost, LinesFixNumChamferCost, ClsSigmoidCost ================================================ FILE: plugin/models/assigner/assigner.py ================================================ import torch from mmdet.core.bbox.builder import BBOX_ASSIGNERS from mmdet.core.bbox.assigners import AssignResult from mmdet.core.bbox.assigners import BaseAssigner from mmdet.core.bbox.match_costs import build_match_cost from scipy.optimize import linear_sum_assignment import numpy as np @BBOX_ASSIGNERS.register_module() class HungarianLinesAssigner(BaseAssigner): """ Computes one-to-one matching between predictions and ground truth. This class computes an assignment between the targets and the predictions based on the costs. The cost is a weighted sum of two components: the classification cost and the regression cost. The targets don't include the no_object, so generally there are more predictions than targets.
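    Concretely, with the default `MapQueriesCost`, the cost of a (prediction, ground-truth) pair is `cls_cost + reg_cost` (plus an optional `iou_cost` when configured), where each term already carries its configured weight; the un-weighted regression cost is additionally passed out so that temporal label assignment can filter matches by raw geometric distance.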
After the one-to-one matching, the un-matched are treated as backgrounds. Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt - positive integer: positive sample, index (1-based) of assigned gt Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. bbox_weight (int | float, optional): The scale factor for regression L1 cost. Default 1.0. """ def __init__(self, cost=dict( type='MapQueriesCost', cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='LinesCost', weight=1.0), ), **kwargs): self.cost = build_match_cost(cost) def assign(self, preds: dict, gts: dict, track_info=None, gt_bboxes_ignore=None, eps=1e-7): """ Computes one-to-one matching based on the weighted costs. This method assign each query prediction to a ground truth or background. The `assigned_gt_inds` with -1 means don't care, 0 means negative sample, and positive number is the index (1-based) of assigned gt. The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. Args: lines_pred (Tensor): predicted normalized lines: [num_query, num_points, 2] cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. lines_gt (Tensor): Ground truth lines [num_gt, num_points, 2]. labels_gt (Tensor): Label of `gt_bboxes`, shape (num_gt,). gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are labelled as `ignored`. Default None. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. """ assert gt_bboxes_ignore is None, \ 'Only case when gt_bboxes_ignore is None is supported.' num_gts, num_lines = gts['lines'].size(0), preds['lines'].size(0) # 1. assign -1 by default assigned_gt_inds = \ preds['lines'].new_full((num_lines,), -1, dtype=torch.long) assigned_labels = \ preds['lines'].new_full((num_lines,), -1, dtype=torch.long) if num_gts == 0 or num_lines == 0: # No ground truth or boxes, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels), None # 2. compute the weighted costs gt_permute_idx = None # (num_preds, num_gts) if self.cost.reg_cost.permute: cost, gt_permute_idx, reg_cost = self.cost(preds, gts) else: cost, reg_cost = self.cost(preds, gts) # Manipulate the cost matrix here using the two-frame matching info # for non-first-frame supervision if track_info is not None: prop_i = 0 # iterate through queries for j in range(cost.shape[0]): if j >= len(track_info['track_queries_fal_pos_mask']): # padding queries, loss will be filtered later cost[j] = np.inf continue if track_info['track_queries_fal_pos_mask'][j]: # false positive and palceholder track queries should not # be matched to any target cost[j] = np.inf # Tweak the cost matrix here to force the G.T. assignment of the track queries elif track_info['track_queries_mask'][j]: track_query_id = track_info['track_query_match_ids'][prop_i].long().item() prop_i += 1 cost[j] = np.inf cost[:, track_query_id] = np.inf cost[j, track_query_id] = -1 # 3. 
do Hungarian matching on CPU using linear_sum_assignment cost = cost.detach().cpu().numpy() if linear_sum_assignment is None: raise ImportError('Please run "pip install scipy" ' 'to install scipy first.') try: matched_row_inds, matched_col_inds = linear_sum_assignment(cost) except: print('cost max{}, min{}'.format(cost.max(), cost.min())) import pdb; pdb.set_trace() matched_row_inds = torch.from_numpy(matched_row_inds).to( preds['lines'].device) matched_col_inds = torch.from_numpy(matched_col_inds).to( preds['lines'].device) # Pass out the un-weighted reg cost for temporal propagation matched_reg_cost = reg_cost[matched_row_inds, matched_col_inds] # 4. assign backgrounds and foregrounds # assign all indices to backgrounds first assigned_gt_inds[:] = 0 # assign foregrounds based on matching results assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 assigned_labels[matched_row_inds] = gts['labels'][matched_col_inds] return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels), gt_permute_idx, matched_reg_cost ================================================ FILE: plugin/models/assigner/match_cost.py ================================================ import torch from mmdet.core.bbox.match_costs.builder import MATCH_COST from mmdet.core.bbox.match_costs import build_match_cost from torch.nn.functional import smooth_l1_loss from mmdet.core.bbox.iou_calculators import bbox_overlaps from mmdet.core.bbox.transforms import bbox_cxcywh_to_xyxy def chamfer_distance(line1, line2) -> float: ''' Calculate chamfer distance between two lines. Make sure the lines are interpolated. Args: line1 (tensor): shape (num_pts, 2) line2 (tensor): shape (num_pts, 2) Returns: distance (float): chamfer distance ''' dist_matrix = torch.cdist(line1, line2, p=2) dist12 = dist_matrix.min(-1)[0].sum() / len(line1) dist21 = dist_matrix.min(-2)[0].sum() / len(line2) return (dist12 + dist21) / 2 @MATCH_COST.register_module() class ClsSigmoidCost: """ClsSigmoidCost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, cls_pred, gt_labels): """ Args: cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: torch.Tensor: cls_cost value with weight """ # Following the official DETR repo, contrary to the loss in which # NLL is used, we approximate it in 1 - cls_score[gt_label]. # The 1 is a constant that doesn't change the matching, # so it can be omitted. cls_score = cls_pred.sigmoid() cls_cost = -cls_score[:, gt_labels] return cls_cost * self.weight @MATCH_COST.register_module() class LinesFixNumChamferCost(object): """LinesFixNumChamferCost.
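    Bidirectional Chamfer matching cost between fixed-length polylines. For a prediction P and a ground truth G, each with num_pts points, the cost is (sum_i min_j ||P_i - G_j|| + sum_j min_i ||P_i - G_j||) / (2 * num_pts). With `permute=True`, the ground truth carries several point orderings and the minimum cost over these permutations is kept, together with the index of the chosen permutation.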
Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.0, permute=False): self.weight = weight self.permute = permute def __call__(self, lines_pred, gt_lines): """ Args: lines_pred (Tensor): predicted normalized lines: [num_query, 2*num_points] gt_lines (Tensor): Ground truth lines [num_gt, 2*num_points] or [num_gt, num_permute, 2*num_points] Returns: torch.Tensor: reg_cost value with weight shape [num_pred, num_gt] """ if self.permute: assert len(gt_lines.shape) == 3 else: assert len(gt_lines.shape) == 2 num_gt, num_pred = len(gt_lines), len(lines_pred) if self.permute: gt_lines = gt_lines.flatten(0, 1) # (num_gt*num_permute, 2*num_pts) num_pts = lines_pred.shape[-1] // 2 lines_pred = lines_pred.view(-1, 2) # [num_query*num_points, 2] gt_lines = gt_lines.view(-1, 2) # [num_gt*num_points, 2] dist_mat = torch.cdist(lines_pred, gt_lines, p=2) # (num_query*num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=-1)) # (num_gt, num_query*num_points, num_pts) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=1)) # (num_q, num_gt, num_pts, num_pts) dist1 = dist_mat.min(-1)[0].sum(-1) dist2 = dist_mat.min(-2)[0].sum(-1) dist_mat = (dist1 + dist2) / (2 * num_pts) # (num_pred, num_gt) if self.permute: # dist_mat: (num_pred, num_gt*num_permute) dist_mat = dist_mat.view(num_pred, num_gt, -1) # (num_pred, num_gt, num_permute) dist_mat, gt_permute_index = dist_mat.min(-1) return dist_mat * self.weight, gt_permute_index return dist_mat * self.weight @MATCH_COST.register_module() class LinesL1Cost(object): """LinesL1Cost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.0, beta=0.0, permute=False): self.weight = weight self.permute = permute self.beta = beta def __call__(self, lines_pred, gt_lines, **kwargs): """ Args: lines_pred (Tensor): predicted normalized lines: [num_query, 2*num_points] gt_lines (Tensor): Ground truth lines [num_gt, 2*num_points] or [num_gt, num_permute, 2*num_points] Returns: torch.Tensor: reg_cost value with weight shape [num_pred, num_gt] """ if self.permute: assert len(gt_lines.shape) == 3 else: assert len(gt_lines.shape) == 2 num_pred, num_gt = len(lines_pred), len(gt_lines) if self.permute: # permute-invarint labels gt_lines = gt_lines.flatten(0, 1) # (num_gt*num_permute, 2*num_pts) num_pts = lines_pred.shape[-1]//2 if self.beta > 0: lines_pred = lines_pred.unsqueeze(1).repeat(1, len(gt_lines), 1) gt_lines = gt_lines.unsqueeze(0).repeat(num_pred, 1, 1) dist_mat = smooth_l1_loss(lines_pred, gt_lines, reduction='none', beta=self.beta).sum(-1) else: dist_mat = torch.cdist(lines_pred, gt_lines, p=1) dist_mat = dist_mat / num_pts if self.permute: # dist_mat: (num_pred, num_gt*num_permute) dist_mat = dist_mat.view(num_pred, num_gt, -1) # (num_pred, num_gt, num_permute) dist_mat, gt_permute_index = torch.min(dist_mat, 2) return dist_mat * self.weight, gt_permute_index return dist_mat * self.weight @MATCH_COST.register_module() class BBoxCostC: """BBoxL1Cost. 
Args: weight (int | float, optional): loss_weight box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN Examples: >>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost >>> import torch >>> self = BBoxL1Cost() >>> bbox_pred = torch.rand(1, 4) >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) >>> factor = torch.tensor([10, 8, 10, 8]) >>> self(bbox_pred, gt_bboxes, factor) tensor([[1.6172, 1.6422]]) """ def __init__(self, weight=1., box_format='xyxy'): self.weight = weight assert box_format in ['xyxy', 'xywh'] self.box_format = box_format def __call__(self, bbox_pred, gt_bboxes): """ Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. Shape [num_query, 4]. gt_bboxes (Tensor): Ground truth boxes with normalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. Returns: torch.Tensor: bbox_cost value with weight """ # if self.box_format == 'xywh': # gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes) # elif self.box_format == 'xyxy': # bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) return bbox_cost * self.weight @MATCH_COST.register_module() class IoUCostC: """IoUCost. Args: iou_mode (str, optional): iou mode such as 'iou' | 'giou' weight (int | float, optional): loss weight Examples: >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost >>> import torch >>> self = IoUCost() >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) >>> self(bboxes, gt_bboxes) tensor([[-0.1250, 0.1667], [ 0.1667, -0.5000]]) """ def __init__(self, iou_mode='giou', weight=1., box_format='xywh'): self.weight = weight self.iou_mode = iou_mode assert box_format in ['xyxy', 'xywh'] self.box_format = box_format def __call__(self, bboxes, gt_bboxes): """ Args: bboxes (Tensor): Predicted boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_query, 4]. gt_bboxes (Tensor): Ground truth boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. Returns: torch.Tensor: iou_cost value with weight """ if self.box_format == 'xywh': bboxes = bbox_cxcywh_to_xyxy(bboxes) gt_bboxes = bbox_cxcywh_to_xyxy(gt_bboxes) # overlaps: [num_bboxes, num_gt] overlaps = bbox_overlaps( bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False) # The 1 is a constant that doesn't change the matching, so omitted. iou_cost = -overlaps return iou_cost * self.weight @MATCH_COST.register_module() class DynamicLinesCost(object): """LinesL1Cost. 
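    Mask-aware ('dynamic') variant of the line matching cost: point-wise L2 distances between prediction and ground truth are averaged only over the points marked valid by the predicted and ground-truth masks (see `get_dynamic_line`).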
Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, lines_pred, lines_gt, masks_pred, masks_gt): """ Args: lines_pred (Tensor): predicted normalized lines: [nP, num_points, 2] lines_gt (Tensor): Ground truth lines [nG, num_points, 2] masks_pred: [nP, num_points] masks_gt: [nG, num_points] Returns: dist_mat: reg_cost value with weight shape [nP, nG] """ dist_mat = self.cal_dist(lines_pred, lines_gt) dist_mat = self.get_dynamic_line(dist_mat, masks_pred, masks_gt) dist_mat = dist_mat * self.weight return dist_mat def cal_dist(self, x1, x2): ''' Args: x1: B1,N,2 x2: B2,N,2 Return: dist_mat: B1,B2,N ''' x1 = x1.permute(1, 0, 2) x2 = x2.permute(1, 0, 2) dist_mat = torch.cdist(x1, x2, p=2) dist_mat = dist_mat.permute(1, 2, 0) return dist_mat def get_dynamic_line(self, mat, m1, m2): ''' get dynamic line with difference approach mat: N1xN2xnpts m1: N1xnpts m2: N2xnpts ''' # nPxnGxnum_points m1 = m1.unsqueeze(1).sigmoid() > 0.5 m2 = m2.unsqueeze(0) valid_points_mask = (m1 + m2)/2. average_factor_mask = valid_points_mask.sum(-1) > 0 average_factor = average_factor_mask.masked_fill( ~average_factor_mask, 1) # takes the average mat = mat * valid_points_mask mat = mat.sum(-1) / average_factor return mat @MATCH_COST.register_module() class BBoxLogitsCost(object): """BBoxLogits. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def calNLL(self, logits, value): ''' Args: logits: B1, 8, cls_dim value: B2, 8, Return: log_likelihood: B1,B2,8 ''' logits = logits[:, None] value = value[None] value = value.long().unsqueeze(-1) value, log_pmf = torch.broadcast_tensors(value, logits) value = value[..., :1] return log_pmf.gather(-1, value).squeeze(-1) def __call__(self, bbox_pred, bbox_gt, **kwargs): """ Args: bbox_pred: nproposal, 4*2, pos_dim bbox_gt: ngt, 4*2 Returns: cost: nproposal, ngt """ cost = self.calNLL(bbox_pred, bbox_gt).mean(-1) return cost * self.weight @MATCH_COST.register_module() class MapQueriesCost(object): def __init__(self, cls_cost, reg_cost, iou_cost=None): self.cls_cost = build_match_cost(cls_cost) self.reg_cost = build_match_cost(reg_cost) self.iou_cost = None if iou_cost is not None: self.iou_cost = build_match_cost(iou_cost) def __call__(self, preds: dict, gts: dict): # classification and bboxcost. cls_cost = self.cls_cost(preds['scores'], gts['labels']) # regression cost regkwargs = {} if 'masks' in preds and 'masks' in gts: assert isinstance(self.reg_cost, DynamicLinesCost), ' Issues!!' regkwargs = { 'masks_pred': preds['masks'], 'masks_gt': gts['masks'], } reg_cost = self.reg_cost(preds['lines'], gts['lines'], **regkwargs) if self.reg_cost.permute: reg_cost, gt_permute_idx = reg_cost # weighted sum of above three costs cost = cls_cost + reg_cost # Need to pass the reg cost out, and use this to filter deviated # instances for temporal label assignment... 
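        # Dividing by the configured weight recovers the un-weighted regression distance, so downstream thresholds do not depend on the cost weight.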
raw_reg_cost = reg_cost / self.reg_cost.weight # Iou if self.iou_cost is not None: iou_cost = self.iou_cost(preds['lines'],gts['lines']) cost += iou_cost if self.reg_cost.permute: return cost, gt_permute_idx, raw_reg_cost return cost, raw_reg_cost ================================================ FILE: plugin/models/backbones/__init__.py ================================================ from .bevformer_backbone import BEVFormerBackbone ================================================ FILE: plugin/models/backbones/bevformer/__init__.py ================================================ from .custom_base_transformer_layer import MyCustomBaseTransformerLayer from .encoder import BEVFormerEncoder from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D, MSIPM3D from .temporal_self_attention import TemporalSelfAttention from .transformer import PerceptionTransformer from .temporal_net import TemporalNet ================================================ FILE: plugin/models/backbones/bevformer/custom_base_transformer_layer.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import copy import warnings import torch import torch.nn as nn from mmcv import ConfigDict, deprecated_api_warning from mmcv.cnn import Linear, build_activation_layer, build_norm_layer from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) # Avoid BC-breaking of importing MultiScaleDeformableAttention from this file try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 warnings.warn( ImportWarning( '``MultiScaleDeformableAttention`` has been moved to ' '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 )) except ImportError: warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' '``mmcv.ops.multi_scale_deform_attn``, ' 'You should install ``mmcv-full`` if you need this module. ') from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention @TRANSFORMER_LAYER.register_module() class MyCustomBaseTransformerLayer(BaseModule): """Base `TransformerLayer` for vision transformer. It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. 
ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. """ def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=True, **kwargs): deprecated_args = dict( feedforward_channels='feedforward_channels', ffn_dropout='ffn_drop', ffn_num_fcs='num_fcs') for ori_name, new_name in deprecated_args.items(): if ori_name in kwargs: warnings.warn( f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' f'to a dict named `ffn_cfgs`. ') ffn_cfgs[new_name] = kwargs[ori_name] super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) self.batch_first = batch_first assert set(operation_order) & set( ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all four operation type ' \ f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" num_attn = operation_order.count('self_attn') + operation_order.count( 'cross_attn') if isinstance(attn_cfgs, dict): attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] else: assert num_attn == len(attn_cfgs), f'The length ' \ f'of attn_cfg {num_attn} is ' \ f'not consistent with the number of attention' \ f'in operation_order {operation_order}.' self.num_attn = num_attn self.operation_order = operation_order self.norm_cfg = norm_cfg self.pre_norm = operation_order[0] == 'norm' self.attentions = ModuleList() index = 0 for operation_name in operation_order: if operation_name in ['self_attn', 'cross_attn']: if 'batch_first' in attn_cfgs[index]: assert self.batch_first == attn_cfgs[index]['batch_first'] else: attn_cfgs[index]['batch_first'] = self.batch_first attention = build_attention(attn_cfgs[index]) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. 
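                # Record the role each attention module plays; custom attention implementations may branch on this name.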
attention.operation_name = operation_name self.attentions.append(attention) index += 1 self.embed_dims = self.attentions[0].embed_dims self.ffns = ModuleList() num_ffns = operation_order.count('ffn') if isinstance(ffn_cfgs, dict): ffn_cfgs = ConfigDict(ffn_cfgs) if isinstance(ffn_cfgs, dict): ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: ffn_cfgs['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( build_feedforward_network(ffn_cfgs[ffn_index])) self.norms = ModuleList() num_norms = operation_order.count('norm') for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) def forward(self, query, key=None, value=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. """ norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': temp_key = temp_value = query query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query @TRANSFORMER_LAYER.register_module() class MyCustomBaseTransformerLayerWithoutSelfAttn(BaseModule): """Base `TransformerLayer` for vision transformer. 
It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. """ def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=True, **kwargs): deprecated_args = dict( feedforward_channels='feedforward_channels', ffn_dropout='ffn_drop', ffn_num_fcs='num_fcs') for ori_name, new_name in deprecated_args.items(): if ori_name in kwargs: warnings.warn( f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' f'to a dict named `ffn_cfgs`. ') ffn_cfgs[new_name] = kwargs[ori_name] super(MyCustomBaseTransformerLayerWithoutSelfAttn, self).__init__(init_cfg) self.batch_first = batch_first assert set(operation_order) & set( ['norm', 'ffn', 'cross_attn']) == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all three operation type ' \ f"{['norm', 'ffn', 'cross_attn']}" num_attn = operation_order.count( 'cross_attn') if isinstance(attn_cfgs, dict): attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] else: assert num_attn == len(attn_cfgs), f'The length ' \ f'of attn_cfg {num_attn} is ' \ f'not consistent with the number of attention' \ f'in operation_order {operation_order}.' self.num_attn = num_attn self.operation_order = operation_order self.norm_cfg = norm_cfg self.pre_norm = operation_order[0] == 'norm' self.attentions = ModuleList() index = 0 for operation_name in operation_order: if operation_name in ['self_attn', 'cross_attn']: if 'batch_first' in attn_cfgs[index]: assert self.batch_first == attn_cfgs[index]['batch_first'] else: attn_cfgs[index]['batch_first'] = self.batch_first attention = build_attention(attn_cfgs[index]) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. 
attention.operation_name = operation_name self.attentions.append(attention) index += 1 self.embed_dims = self.attentions[0].embed_dims self.ffns = ModuleList() num_ffns = operation_order.count('ffn') if isinstance(ffn_cfgs, dict): ffn_cfgs = ConfigDict(ffn_cfgs) if isinstance(ffn_cfgs, dict): ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: ffn_cfgs['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( build_feedforward_network(ffn_cfgs[ffn_index])) self.norms = ModuleList() num_norms = operation_order.count('norm') for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) def forward(self, query, key=None, value=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
""" norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': temp_key = temp_value = query query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query ================================================ FILE: plugin/models/backbones/bevformer/encoder.py ================================================ """ Borrowed from StreamMapNet, and add BEV memory fusion """ from .custom_base_transformer_layer import MyCustomBaseTransformerLayer from .temporal_net import TemporalNet import copy import warnings from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import TransformerLayerSequence from mmcv.runner import force_fp32, auto_fp16 import numpy as np import torch import torch.nn as nn from mmcv.utils import TORCH_VERSION, digit_version from mmcv.utils import ext_loader from einops import rearrange ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @TRANSFORMER_LAYER_SEQUENCE.register_module() class BEVFormerEncoder(TransformerLayerSequence): """ Attention with both self and cross Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', **kwargs): super(BEVFormerEncoder, self).__init__(*args, **kwargs) self.return_intermediate = return_intermediate temporal_mem_layers = [] for _ in range(self.num_layers): mem_conv = TemporalNet(history_steps=4, hidden_dims=self.embed_dims, num_blocks=1) temporal_mem_layers.append(mem_conv) self.temporal_mem_layers = nn.ModuleList(temporal_mem_layers) self.num_points_in_pillar = num_points_in_pillar self.pc_range = pc_range self.fp16_enabled = False @staticmethod def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): """Get the reference points used in SCA and TSA. Args: H, W: spatial shape of bev. Z: hight of pillar. D: sample D points uniformly from each pillar. device (obj:`device`): The device where reference_points should be. Returns: Tensor: reference points used in decoder, has \ shape (bs, num_keys, num_levels, 2). 
""" # reference points in 3D space, used in spatial cross-attention (SCA) if dim == '3d': zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W # ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, # device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H # change y-axis direction ys = torch.linspace(H - 0.5, 0.5, H, dtype=dtype, device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H ref_3d = torch.stack((xs, ys, zs), -1) ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) return ref_3d # reference points on 2D bev plane, used in temporal self-attention (TSA). elif dim == '2d': ref_y, ref_x = torch.meshgrid( # torch.linspace( # 0.5, H - 0.5, H, dtype=dtype, device=device), torch.linspace( H - 0.5, 0.5, H, dtype=dtype, device=device), torch.linspace( 0.5, W - 0.5, W, dtype=dtype, device=device) ) ref_y = ref_y.reshape(-1)[None] / H ref_x = ref_x.reshape(-1)[None] / W ref_2d = torch.stack((ref_x, ref_y), -1) ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) return ref_2d # This function must use fp32!!! @force_fp32(apply_to=('reference_points', 'img_metas')) def point_sampling(self, reference_points, pc_range, img_metas): ego2img = [] for img_meta in img_metas: ego2img.append(img_meta['ego2img']) ego2img = np.asarray(ego2img) ego2img = reference_points.new_tensor(ego2img) # (B, N, 4, 4) reference_points = reference_points.clone() reference_points[..., 0:1] = reference_points[..., 0:1] * \ (pc_range[3] - pc_range[0]) + pc_range[0] reference_points[..., 1:2] = reference_points[..., 1:2] * \ (pc_range[4] - pc_range[1]) + pc_range[1] reference_points[..., 2:3] = reference_points[..., 2:3] * \ (pc_range[5] - pc_range[2]) + pc_range[2] reference_points = torch.cat( (reference_points, torch.ones_like(reference_points[..., :1])), -1) reference_points = reference_points.permute(1, 0, 2, 3) D, B, num_query = reference_points.size()[:3] num_cam = ego2img.size(1) reference_points = reference_points.view( D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) ego2img = ego2img.view( 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) reference_points_cam = torch.matmul(ego2img.to(torch.float32), reference_points.to(torch.float32)).squeeze(-1) eps = 1e-5 bev_mask = (reference_points_cam[..., 2:3] > eps) reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) & (reference_points_cam[..., 1:2] < 1.0) & (reference_points_cam[..., 0:1] < 1.0) & (reference_points_cam[..., 0:1] > 0.0)) if digit_version(TORCH_VERSION) >= digit_version('1.8'): bev_mask = torch.nan_to_num(bev_mask) else: bev_mask = bev_mask.new_tensor( np.nan_to_num(bev_mask.cpu().numpy())) reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) return reference_points_cam, bev_mask @auto_fp16() def forward(self, bev_query, key, value, *args, bev_h=None, bev_w=None, bev_pos=None, spatial_shapes=None, level_start_index=None, prev_bev=None, shift=0., warped_history_bev=None, **kwargs): """Forward function for 
`TransformerDecoder`. Args: bev_query (Tensor): Input BEV query with shape `(num_query, bs, embed_dims)`. key & value (Tensor): Input multi-cameta features with shape (num_cam, num_value, bs, embed_dims) reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, otherwise has shape ((bs, num_query, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. """ output = bev_query intermediate = [] ref_3d = self.get_reference_points( bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) ref_2d = self.get_reference_points( bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) reference_points_cam, bev_mask = self.point_sampling( ref_3d, self.pc_range, kwargs['img_metas']) # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. # shift_ref_2d = ref_2d # .clone() shift_ref_2d = ref_2d.clone() shift_ref_2d += shift[:, None, None, :] # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) bev_query = bev_query.permute(1, 0, 2) bev_pos = bev_pos.permute(1, 0, 2) bs, len_bev, num_bev_level, _ = ref_2d.shape if prev_bev is not None: prev_bev = prev_bev.permute(1, 0, 2) prev_bev = torch.stack( [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( bs*2, len_bev, num_bev_level, 2) else: hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( bs*2, len_bev, num_bev_level, 2) for lid, layer in enumerate(self.layers): output = layer( bev_query, key, value, *args, bev_pos=bev_pos, ref_2d=hybird_ref_2d, ref_3d=ref_3d, bev_h=bev_h, bev_w=bev_w, spatial_shapes=spatial_shapes, level_start_index=level_start_index, reference_points_cam=reference_points_cam, bev_mask=bev_mask, prev_bev=prev_bev, warped_history_bev=warped_history_bev, **kwargs) # BEV memory fusion layer mem_layer = self.temporal_mem_layers[lid] curr_feat = rearrange(output, 'b (h w) c -> b c h w', h=warped_history_bev.shape[3]) fused_output = mem_layer(warped_history_bev, curr_feat) fused_output = rearrange(fused_output, 'b c h w -> b (h w) c') output = output + fused_output bev_query = output if self.return_intermediate: intermediate.append(output) if self.return_intermediate: return torch.stack(intermediate) return output @TRANSFORMER_LAYER.register_module() class BEVFormerLayer(MyCustomBaseTransformerLayer): """Implements decoder layer in DETR transformer. Args: attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): Configs for self_attention or cross_attention, the order should be consistent with it in `operation_order`. If it is a dict, it would be expand to the number of attention in `operation_order`. feedforward_channels (int): The hidden dimension for FFNs. ffn_dropout (float): Probability of an element to be zeroed in ffn. Default 0.0. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Default:None act_cfg (dict): The activation config for FFNs. Default: `LN` norm_cfg (dict): Config dict for normalization layer. Default: `LN`. ffn_num_fcs (int): The number of fully-connected layers in FFNs. Default:2. 
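    Example (illustrative config sketch; the dimensions, number of levels and dropout below are placeholders rather than values copied from a shipped MapTracker config):
        >>> layer_cfg = dict(
        ...     type='BEVFormerLayer',
        ...     attn_cfgs=[
        ...         dict(type='TemporalSelfAttention', embed_dims=256, num_levels=1),
        ...         dict(type='SpatialCrossAttention', embed_dims=256,
        ...              deformable_attention=dict(type='MSDeformableAttention3D',
        ...                                        embed_dims=256, num_levels=4)),
        ...     ],
        ...     feedforward_channels=512,
        ...     ffn_dropout=0.1,
        ...     operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'))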
""" def __init__(self, attn_cfgs, feedforward_channels, ffn_dropout=0.0, operation_order=None, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN'), ffn_num_fcs=2, **kwargs): super(BEVFormerLayer, self).__init__( attn_cfgs=attn_cfgs, feedforward_channels=feedforward_channels, ffn_dropout=ffn_dropout, operation_order=operation_order, act_cfg=act_cfg, norm_cfg=norm_cfg, ffn_num_fcs=ffn_num_fcs, **kwargs) self.fp16_enabled = False assert len(operation_order) == 6 assert set(operation_order) == set( ['self_attn', 'norm', 'cross_attn', 'ffn']) def forward(self, query, key=None, value=None, bev_pos=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, ref_2d=None, ref_3d=None, bev_h=None, bev_w=None, reference_points_cam=None, mask=None, spatial_shapes=None, level_start_index=None, prev_bev=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
""" norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: # temporal self attention if layer == 'self_attn': query = self.attentions[attn_index]( query, prev_bev, prev_bev, identity if self.pre_norm else None, query_pos=bev_pos, key_pos=bev_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, reference_points=ref_2d, spatial_shapes=torch.tensor( [[bev_h, bev_w]], device=query.device), level_start_index=torch.tensor([0], device=query.device), **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 # spaital cross attention elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, reference_points=ref_3d, reference_points_cam=reference_points_cam, mask=mask, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, spatial_shapes=spatial_shapes, level_start_index=level_start_index, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query ================================================ FILE: plugin/models/backbones/bevformer/grid_mask.py ================================================ import torch import torch.nn as nn import numpy as np from PIL import Image from mmcv.runner import force_fp32, auto_fp16 class Grid(object): def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio self.mode=mode self.st_prob = prob self.prob = prob def set_prob(self, epoch, max_epoch): self.prob = self.st_prob * epoch / max_epoch def __call__(self, img, label): if np.random.rand() > self.prob: return img, label h = img.size(1) w = img.size(2) self.d1 = 2 self.d2 = min(h, w) hh = int(1.5*h) ww = int(1.5*w) d = np.random.randint(self.d1, self.d2) if self.ratio == 1: self.l = np.random.randint(1, d) else: self.l = min(max(int(d*self.ratio+0.5),1),d-1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh//d): s = d*i + st_h t = min(s+self.l, hh) mask[s:t,:] *= 0 if self.use_w: for i in range(ww//d): s = d*i + st_w t = min(s+self.l, ww) mask[:,s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] mask = torch.from_numpy(mask).float() if self.mode == 1: mask = 1-mask mask = mask.expand_as(img) if self.offset: offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() offset = (1 - mask) * offset img = img * mask + offset else: img = img * mask return img, label class GridMask(nn.Module): def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): super(GridMask, self).__init__() self.use_h = use_h self.use_w = use_w self.rotate = rotate 
self.offset = offset self.ratio = ratio self.mode = mode self.st_prob = prob self.prob = prob self.fp16_enable = False def set_prob(self, epoch, max_epoch): self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 def set_ratio_and_prob(self, ratio, prob): self.prob = prob self.ratio = ratio @auto_fp16() def forward(self, x): if np.random.rand() > self.prob or not self.training: return x n,c,h,w = x.size() x = x.view(-1,h,w) hh = int(1.5*h) ww = int(1.5*w) d = np.random.randint(2, h) self.l = min(max(int(d*self.ratio+0.5),1),d-1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh//d): s = d*i + st_h t = min(s+self.l, hh) mask[s:t,:] *= 0 if self.use_w: for i in range(ww//d): s = d*i + st_w t = min(s+self.l, ww) mask[:,s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] mask = torch.from_numpy(mask).to(x.dtype).cuda() if self.mode == 1: mask = 1-mask mask = mask.expand_as(x) if self.offset: offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).to(x.dtype).cuda() x = x * mask + offset * (1 - mask) else: x = x * mask return x.view(n,c,h,w) ================================================ FILE: plugin/models/backbones/bevformer/multi_scale_deformable_attn_function.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import torch from torch.cuda.amp import custom_bwd, custom_fwd from torch.autograd.function import Function, once_differentiable from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) class MultiScaleDeformableAttnFunction_fp16(Function): @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. 
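        Note: the returned tuple is positionally aligned with the six inputs of `forward` (value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step); only `value`, `sampling_locations` and `attention_weights` receive gradients, the remaining slots are None.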
""" value, value_spatial_shapes, value_level_start_index, \ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None class MultiScaleDeformableAttnFunction_fp32(Function): @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. """ value, value_spatial_shapes, value_level_start_index, \ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None ================================================ FILE: plugin/models/backbones/bevformer/spatial_cross_attention.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch import warnings import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init, constant_init from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import build_attention import math from mmcv.runner import force_fp32, auto_fp16 from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.utils import ext_loader from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ MultiScaleDeformableAttnFunction_fp16 ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @ATTENTION.register_module() class SpatialCrossAttention(BaseModule): """An attention module used in BEVFormer. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_cams (int): The number of cameras dropout (float): A Dropout layer on `inp_residual`. Default: 0.. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. deformable_attention: (dict): The config for the deformable attention used in SCA. """ def __init__(self, embed_dims=256, num_cams=6, pc_range=None, dropout=0.1, init_cfg=None, batch_first=False, deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=256, num_levels=4), **kwargs ): super(SpatialCrossAttention, self).__init__(init_cfg) self.init_cfg = init_cfg self.dropout = nn.Dropout(dropout) self.pc_range = pc_range self.fp16_enabled = False self.deformable_attention = build_attention(deformable_attention) self.embed_dims = embed_dims self.num_cams = num_cams self.output_proj = nn.Linear(embed_dims, embed_dims) self.batch_first = batch_first self.init_weight() def init_weight(self): """Default initialization for Parameters of Module.""" xavier_init(self.output_proj, distribution='uniform', bias=0.) @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) def forward(self, query, key, value, residual=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, reference_points_cam=None, bev_mask=None, level_start_index=None, flag='encoder', **kwargs): """Forward Function of Detr3DCrossAtten. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. (B, N, C, H, W) residual (Tensor): The tensor used for addition, with the same shape as `x`. Default None. If None, `x` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, 4), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different level. With shape (num_levels, 2), last dimension represent (h, w). level_start_index (Tensor): The start index of each level. 
A tensor has shape (num_levels) and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if key is None: key = query if value is None: value = key if residual is None: inp_residual = query slots = torch.zeros_like(query) if query_pos is not None: query = query + query_pos bs, num_query, _ = query.size() D = reference_points_cam.size(3) indexes = [] for i, mask_per_img in enumerate(bev_mask): index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) indexes.append(index_query_per_img) max_len = max([len(each) for each in indexes]) # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. queries_rebatch = query.new_zeros( [bs, self.num_cams, max_len, self.embed_dims]) reference_points_rebatch = reference_points_cam.new_zeros( [bs, self.num_cams, max_len, D, 2]) for j in range(bs): for i, reference_points_per_img in enumerate(reference_points_cam): index_query_per_img = indexes[i] queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] num_cams, l, bs, embed_dims = key.shape key = key.permute(2, 0, 1, 3).reshape( bs * self.num_cams, l, self.embed_dims) value = value.permute(2, 0, 1, 3).reshape( bs * self.num_cams, l, self.embed_dims) queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) for j in range(bs): for i, index_query_per_img in enumerate(indexes): slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] count = bev_mask.sum(-1) > 0 count = count.permute(1, 2, 0).sum(-1) count = torch.clamp(count, min=1.0) slots = slots / count[..., None] slots = self.output_proj(slots) return self.dropout(slots) + inp_residual @ATTENTION.register_module() class MSDeformableAttention3D(BaseModule): """An attention module used in BEVFormer based on Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. 
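        Example (editor's illustrative sketch; the values shown match the ``__init__``
        defaults below and are not prescribed by any MapTracker config)::

            >>> from mmcv.cnn.bricks.transformer import build_attention
            >>> attn = build_attention(dict(
            ...     type='MSDeformableAttention3D',
            ...     embed_dims=256, num_heads=8, num_levels=4, num_points=8))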
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=8, im2col_step=64, dropout=0.1, batch_first=True, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.batch_first = batch_first self.output_proj = None self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape ( bs, num_query, embed_dims). key (Tensor): The key tensor with shape `(bs, num_key, embed_dims)`. value (Tensor): The value tensor with shape `(bs, num_key, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. 
Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) sampling_offsets = self.sampling_offsets(query).view( bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) if reference_points.shape[-1] == 2: """ For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. For each referent point, we sample `num_points` sampling points. For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. """ offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) bs, num_query, num_Z_anchors, xy = reference_points.shape reference_points = reference_points[:, :, None, None, None, :, :] sampling_offsets = sampling_offsets / \ offset_normalizer[None, None, None, :, None, :] bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape sampling_offsets = sampling_offsets.view( bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) sampling_locations = reference_points + sampling_offsets bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape assert num_all_points == num_points * num_Z_anchors sampling_locations = sampling_locations.view( bs, num_query, num_heads, num_levels, num_all_points, xy) elif reference_points.shape[-1] == 4: assert False else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points # if torch.cuda.is_available() and value.is_cuda: if value.dtype == torch.float16: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 else: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 output = MultiScaleDeformableAttnFunction.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) if not self.batch_first: output = output.permute(1, 0, 2) return output @ATTENTION.register_module() class MSIPM3D(BaseModule): """An attention module used in BEVFormer based on Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. 
Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=8, im2col_step=64, dropout=0.1, batch_first=True, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.batch_first = batch_first self.output_proj = None self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points # self.sampling_offsets = nn.Linear( # embed_dims, num_heads * num_levels * num_points * 2) # self.attention_weights = nn.Linear(embed_dims, # num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" # constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 # self.sampling_offsets.bias.data = grid_init.view(-1) self.fixed_sampling_offsets = nn.Parameter(grid_init.view(-1), requires_grad=False) # constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape ( bs, num_query, embed_dims). key (Tensor): The key tensor with shape `(bs, num_key, embed_dims)`. value (Tensor): The value tensor with shape `(bs, num_key, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. 
reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) sampling_offsets = self.fixed_sampling_offsets.view( 1, 1, self.num_heads, self.num_levels, self.num_points, 2).repeat( bs, num_query, 1, 1, 1,1) # attention_weights = self.attention_weights(query).view( # bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = query.new_ones((bs, num_query, self.num_heads, self.num_levels * self.num_points)) attention_weights = attention_weights.softmax(-1) # import pdb;pdb.set_trace() attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) if reference_points.shape[-1] == 2: """ For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. For each referent point, we sample `num_points` sampling points. For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. 
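            For instance (editor's illustrative numbers): if this layer is built with
            `num_points=8` and each BEV query has `num_Z_anchors=4`, the reshape below
            assigns 8 // 4 = 2 sampling points to each of the 4 projected reference
            points, i.e. 8 sampling points in total per head and level.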
""" offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) bs, num_query, num_Z_anchors, xy = reference_points.shape reference_points = reference_points[:, :, None, None, None, :, :] sampling_offsets = sampling_offsets / \ offset_normalizer[None, None, None, :, None, :] bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape sampling_offsets = sampling_offsets.view( bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) sampling_locations = reference_points + sampling_offsets bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape assert num_all_points == num_points * num_Z_anchors sampling_locations = sampling_locations.view( bs, num_query, num_heads, num_levels, num_all_points, xy) elif reference_points.shape[-1] == 4: assert False else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points # if torch.cuda.is_available() and value.is_cuda: if value.dtype == torch.float16: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 else: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 output = MultiScaleDeformableAttnFunction.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) if not self.batch_first: output = output.permute(1, 0, 2) return output ================================================ FILE: plugin/models/backbones/bevformer/temporal_net.py ================================================ import torch import torch.nn as nn from typing import Optional, Sequence, Tuple, Union from mmdet.models import NECKS from mmcv.cnn.utils import kaiming_init, constant_init from mmcv.cnn.resnet import conv3x3 from torch import Tensor from einops import rearrange class MyResBlock(nn.Module): def __init__(self, inplanes: int, planes: int, stride: int = 1, dilation: int = 1, style: str = 'pytorch', with_cp: bool = False): super().__init__() assert style in ['pytorch', 'caffe'] self.conv1 = conv3x3(inplanes, planes, stride, dilation) self.bn1 = nn.BatchNorm2d(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = nn.BatchNorm2d(planes) self.stride = stride self.dilation = dilation assert not with_cp def forward(self, x: Tensor) -> Tensor: residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out += residual out = self.relu(out) return out @NECKS.register_module() class TemporalNet(nn.Module): def __init__(self, history_steps, hidden_dims, num_blocks): super(TemporalNet, self).__init__() self.history_steps = history_steps self.hidden_dims = hidden_dims self.num_blocks = num_blocks layers = [] in_dims = (history_steps+1) * hidden_dims self.conv_in = conv3x3(in_dims, hidden_dims, 1, 1) self.bn = nn.BatchNorm2d(hidden_dims) self.relu = nn.ReLU(inplace=True) for _ in range(self.num_blocks): layers.append(MyResBlock(hidden_dims, hidden_dims)) self.res_layer = nn.Sequential(*layers) def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) def forward(self, history_feats, curr_feat): input_feats = 
torch.cat([history_feats, curr_feat.unsqueeze(1)], dim=1) input_feats = rearrange(input_feats, 'b t c h w -> b (t c) h w') out = self.conv_in(input_feats) out = self.bn(out) out = self.relu(out) out = self.res_layer(out) if curr_feat.dim() == 3: out = out.squeeze(0) return out ================================================ FILE: plugin/models/backbones/bevformer/temporal_self_attention.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch import warnings import torch import torch.nn as nn from mmcv.cnn import xavier_init, constant_init from mmcv.cnn.bricks.registry import ATTENTION import math from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, to_2tuple) from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @ATTENTION.register_module() class TemporalSelfAttention(BaseModule): """An attention module used in BEVFormer based on Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to True. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. the length of BEV queue is 2. 
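        Example (editor's illustrative sketch; ``num_levels=1`` reflects that the BEV
        plane is a single feature level, the other values are common choices rather than
        defaults fixed by this repository)::

            >>> from mmcv.cnn.bricks.transformer import build_attention
            >>> attn = build_attention(dict(
            ...     type='TemporalSelfAttention',
            ...     embed_dims=256, num_heads=8, num_levels=1,
            ...     num_points=4, num_bev_queue=2))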
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, num_bev_queue=2, im2col_step=64, dropout=0.1, batch_first=True, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.num_bev_queue = num_bev_queue self.sampling_offsets = nn.Linear( embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, flag='decoder', **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. 
With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: assert self.batch_first bs, len_bev, c = query.shape value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) # value = torch.cat([query, query], 0) if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, embed_dims = query.shape _, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value assert self.num_bev_queue == 2 query = torch.cat([value[:bs], query], -1) value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.reshape(bs*self.num_bev_queue, num_value, self.num_heads, -1) sampling_offsets = self.sampling_offsets(query) sampling_offsets = sampling_offsets.view( bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points) attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) if reference_points.shape[-1] == 2: offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets \ / offset_normalizer[None, None, None, :, None, :] elif reference_points.shape[-1] == 4: sampling_locations = reference_points[:, :, None, :, None, :2] \ + sampling_offsets / self.num_points \ * reference_points[:, :, None, :, None, 2:] \ * 0.5 else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') if torch.cuda.is_available() and value.is_cuda: # using fp16 deformable attention is unstable because it performs many sum operations if value.dtype == torch.float16: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 else: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 output = MultiScaleDeformableAttnFunction.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) # output shape (bs*num_bev_queue, num_query, embed_dims) # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) output = output.permute(1, 2, 0) # fuse history value and current value # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) output = output.view(num_query, embed_dims, bs, self.num_bev_queue) output = output.mean(-1) # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) 
output = output.permute(2, 0, 1) output = self.output_proj(output) if not self.batch_first: output = output.permute(1, 0, 2) return self.dropout(output) + identity ================================================ FILE: plugin/models/backbones/bevformer/transformer.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import numpy as np import torch import torch.nn as nn from mmcv.cnn import xavier_init from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence from mmcv.runner.base_module import BaseModule from mmdet.models.utils.builder import TRANSFORMER from torch.nn.init import normal_ from mmcv.runner.base_module import BaseModule from torchvision.transforms.functional import rotate from .temporal_self_attention import TemporalSelfAttention from .spatial_cross_attention import MSDeformableAttention3D from mmcv.runner import force_fp32, auto_fp16 from einops import rearrange @TRANSFORMER.register_module() class PerceptionTransformer(BaseModule): """Implements the Detr3D transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ def __init__(self, num_feature_levels=4, num_cams=6, encoder=None, embed_dims=256, use_cams_embeds=True, **kwargs): super().__init__(**kwargs) self.encoder = build_transformer_layer_sequence(encoder) # self.decoder = build_transformer_layer_sequence(decoder) self.embed_dims = embed_dims self.num_feature_levels = num_feature_levels self.num_cams = num_cams self.fp16_enabled = False self.use_cams_embeds = use_cams_embeds self.init_layers() def init_layers(self): """Initialize layers of the Detr3DTransformer.""" self.level_embeds = nn.Parameter(torch.Tensor( self.num_feature_levels, self.embed_dims)) self.cams_embeds = nn.Parameter( torch.Tensor(self.num_cams, self.embed_dims)) # self.reference_points = nn.Linear(self.embed_dims, 3) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention): try: m.init_weight() except AttributeError: m.init_weights() normal_(self.level_embeds) normal_(self.cams_embeds) # xavier_init(self.reference_points, distribution='uniform', bias=0.) # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) def get_bev_features( self, mlvl_feats, bev_queries, bev_h, bev_w, bev_pos=None, prop_bev=None, prev_bev=None, warped_history_bev=None, **kwargs): """ obtain bev features. 
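        Args (editor's summary, inferred from the function body and its caller in
        bevformer_backbone.py):
            mlvl_feats (list[Tensor]): Multi-level image features, each of shape
                (bs, num_cams, C, H, W).
            bev_queries (Tensor): Learnable BEV query embeddings of shape
                (bev_h * bev_w, embed_dims).
            bev_h, bev_w (int): Spatial size of the BEV grid.
            bev_pos (Tensor): BEV positional encoding of shape
                (bs, embed_dims, bev_h, bev_w).
            prop_bev (Tensor, optional): BEV features propagated from the previous frame,
                shape (bs, embed_dims, bev_h, bev_w); cells with a positive feature sum
                overwrite the corresponding BEV queries before encoding.
            prev_bev / warped_history_bev (Tensor, optional): Previous and ego-motion
                warped history BEV features, forwarded to the encoder.

        Returns:
            Tensor: BEV embedding of shape (bs, bev_h * bev_w, embed_dims).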
""" bs = mlvl_feats[0].size(0) bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) bev_pos = bev_pos.flatten(2).permute(2, 0, 1) shift = bev_queries.new_tensor((0,0))[None].repeat(bs,1) feat_flatten = [] spatial_shapes = [] for lvl, feat in enumerate(mlvl_feats): bs, num_cam, c, h, w = feat.shape spatial_shape = (h, w) feat = feat.flatten(3).permute(1, 0, 3, 2) if self.use_cams_embeds: feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) feat = feat + self.level_embeds[None, None, lvl:lvl + 1, :].to(feat.dtype) spatial_shapes.append(spatial_shape) feat_flatten.append(feat) feat_flatten = torch.cat(feat_flatten, 2) spatial_shapes = torch.as_tensor( spatial_shapes, dtype=torch.long, device=bev_pos.device) level_start_index = torch.cat((spatial_shapes.new_zeros( (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) feat_flatten = feat_flatten.permute( 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) # Fuse the propagated bev features from the prev step if prop_bev is not None: prop_bev = rearrange(prop_bev, 'b c h w -> (h w) b c') valid_mask = (prop_bev.sum(-1) > 0).to(bev_queries.dtype)[..., None] bev_queries = bev_queries * (1 - valid_mask) + prop_bev * valid_mask bev_embed = self.encoder( bev_queries, feat_flatten, feat_flatten, bev_h=bev_h, bev_w=bev_w, bev_pos=bev_pos, spatial_shapes=spatial_shapes, level_start_index=level_start_index, prev_bev=prev_bev, shift=shift, warped_history_bev=warped_history_bev, **kwargs ) return bev_embed @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) def forward(self, mlvl_feats, bev_queries, object_query_embed, bev_h, bev_w, grid_length=[0.512, 0.512], bev_pos=None, reg_branches=None, cls_branches=None, prev_bev=None, **kwargs): """Forward function for `Detr3DTransformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, num_cams, embed_dims, h, w]. bev_queries (Tensor): (bev_h*bev_w, c) bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) object_query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - bev_embed: BEV features - inter_states: Outputs from decoder. If return_intermediate_dec is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of \ proposals generated from \ encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. - enc_outputs_coord_unact: The regression results \ generated from encoder's feature maps., has shape \ (batch, h*w, 4). Only would \ be returned when `as_two_stage` is True, \ otherwise None. 
""" raise NotImplementedError ================================================ FILE: plugin/models/backbones/bevformer_backbone.py ================================================ import copy import torch import torch.nn as nn import torch.nn.functional as F from mmdet.models import BACKBONES from mmcv.runner import force_fp32, auto_fp16 from mmdet.models.utils import build_transformer from mmcv.cnn.bricks.transformer import FFN, build_positional_encoding from .bevformer.grid_mask import GridMask from mmdet3d.models import builder from contextlib import nullcontext class UpsampleBlock(nn.Module): def __init__(self, ins, outs): super(UpsampleBlock, self).__init__() self.gn = nn.GroupNorm(32, outs) self.conv = nn.Conv2d(ins, outs, kernel_size=3, stride=1, padding=1) # same self.relu = nn.ReLU(inplace=True) def init_weights(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def forward(self, x): x = self.conv(x) x = self.relu(self.gn(x)) x = self.upsample2x(x) return x def upsample2x(self, x): _, _, h, w = x.shape x = F.interpolate(x, size=(h*2, w*2), mode='bilinear', align_corners=True) return x @BACKBONES.register_module() class BEVFormerBackbone(nn.Module): """Head of Detr3D. Args: with_box_refine (bool): Whether to refine the reference points in the decoder. Defaults to False. as_two_stage (bool) : Whether to generate the proposal from the outputs of encoder. transformer (obj:`ConfigDict`): ConfigDict is used for building the Encoder and Decoder. bev_h, bev_w (int): spatial shape of BEV queries. """ def __init__(self, roi_size, bev_h, bev_w, img_backbone=None, img_neck=None, transformer=None, positional_encoding=None, use_grid_mask=True, upsample=False, up_outdim=128, history_steps=None, **kwargs): super(BEVFormerBackbone, self).__init__() # image feature self.default_ratio = 0.5 self.default_prob = 0.7 self.grid_mask = GridMask( True, True, rotate=1, offset=False, ratio=self.default_ratio, mode=1, prob=self.default_prob) self.use_grid_mask = use_grid_mask if img_backbone: self.img_backbone = builder.build_backbone(img_backbone) if img_neck is not None: self.img_neck = builder.build_neck(img_neck) self.with_img_neck = True else: self.with_img_neck = False self.bev_h = bev_h self.bev_w = bev_w self.real_w = roi_size[0] self.real_h = roi_size[1] self.positional_encoding = build_positional_encoding( positional_encoding) self.transformer = build_transformer(transformer) self.embed_dims = self.transformer.embed_dims self.upsample = upsample if self.upsample: self.up = UpsampleBlock(self.transformer.embed_dims, up_outdim) self.history_steps = history_steps self._init_layers() self.init_weights() def _init_layers(self): """Initialize classification branch and regression branch of head.""" self.bev_embedding = nn.Embedding( self.bev_h * self.bev_w, self.embed_dims) def init_weights(self): """Initialize weights of the DeformDETR head.""" self.transformer.init_weights() self.img_backbone.init_weights() self.img_neck.init_weights() if self.upsample: self.up.init_weights() # @auto_fp16(apply_to=('img')) def extract_img_feat(self, img, img_metas, len_queue=None): """Extract features of images.""" B = img.size(0) if img is not None: # input_shape = img.shape[-2:] # # update real input shape of each single img # for img_meta in img_metas: # img_meta.update(input_shape=input_shape) if img.dim() == 5 and img.size(0) == 1: img = img.squeeze(0) elif img.dim() == 5 and img.size(0) > 1: B, N, C, H, W = img.size() img = img.reshape(B * N, C, H, W) if self.use_grid_mask: img = 
self.grid_mask(img) img_feats = self.img_backbone(img) if isinstance(img_feats, dict): img_feats = list(img_feats.values()) else: return None if self.with_img_neck: img_feats = self.img_neck(img_feats) img_feats_reshaped = [] for img_feat in img_feats: BN, C, H, W = img_feat.size() if len_queue is not None: img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) else: img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) return img_feats_reshaped def forward(self, img, img_metas, timestep, history_bev_feats, history_img_metas, all_history_coord, *args, prev_bev=None, img_backbone_gradient=True, **kwargs): """Forward function. Args: mlvl_feats (tuple[Tensor]): Features from the upstream network, each is a 5D-tensor with shape (B, N, C, H, W). prev_bev: previous bev featues Returns: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. all_bbox_preds (Tensor): Sigmoid outputs from the regression \ head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ Shape [nb_dec, bs, num_query, 9]. """ # Optionally turn off the gradient backprop for the 2D image backbones # but always keep the gradients on for the BEV transformer part backprop_context = torch.no_grad if img_backbone_gradient is False else nullcontext with backprop_context(): mlvl_feats = self.extract_img_feat(img=img, img_metas=img_metas) bs, num_cam, _, _, _ = mlvl_feats[0].shape dtype = mlvl_feats[0].dtype bev_queries = self.bev_embedding.weight.to(dtype) # Prepare the transformed history bev features, add the bev prop fusion here if len(history_bev_feats) > 0: all_warped_history_feat = [] for b_i in range(bs): history_coord = all_history_coord[b_i] history_bev_feats_i = torch.stack([feats[b_i] for feats in history_bev_feats], 0) warped_history_feat_i = F.grid_sample(history_bev_feats_i, history_coord, padding_mode='zeros', align_corners=False) all_warped_history_feat.append(warped_history_feat_i) all_warped_history_feat = torch.stack(all_warped_history_feat, dim=0) # BTCHW prop_bev_feat = all_warped_history_feat[:, -1] else: all_warped_history_feat = None prop_bev_feat = None # pad the bev history buffer to fixed length if len(history_bev_feats) < self.history_steps: num_repeat = self.history_steps - len(history_bev_feats) zero_bev_feats = torch.zeros([bs, bev_queries.shape[1], self.bev_h, self.bev_w]).to(bev_queries.device) padding_history_bev_feats = torch.stack([zero_bev_feats,] * num_repeat, dim=1) if all_warped_history_feat is not None: all_warped_history_feat = torch.cat([padding_history_bev_feats, all_warped_history_feat], dim=1) else: all_warped_history_feat = padding_history_bev_feats bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), device=bev_queries.device).to(dtype) bev_pos = self.positional_encoding(bev_mask).to(dtype) outs = self.transformer.get_bev_features( mlvl_feats, bev_queries, self.bev_h, self.bev_w, grid_length=(self.real_h / self.bev_h, self.real_w / self.bev_w), bev_pos=bev_pos, prop_bev=prop_bev_feat, img_metas=img_metas, prev_bev=prev_bev, warped_history_bev=all_warped_history_feat, ) outs = outs.unflatten(1,(self.bev_h,self.bev_w)).permute(0,3,1,2).contiguous() if self.upsample: outs = self.up(outs) return outs, mlvl_feats ================================================ FILE: plugin/models/heads/MapDetectorHead.py ================================================ import copy import torch import torch.nn as nn import 
torch.nn.functional as F from mmcv.cnn import Conv2d, Linear, build_activation_layer, bias_init_with_prob, xavier_init from mmcv.runner import force_fp32 from mmcv.cnn.bricks.transformer import build_positional_encoding from mmdet.models.utils import build_transformer from mmdet.models import build_loss from mmdet.core import multi_apply, reduce_mean, build_assigner, build_sampler from mmdet.models import HEADS from mmdet.models.utils.transformer import inverse_sigmoid from einops import rearrange @HEADS.register_module(force=True) class MapDetectorHead(nn.Module): def __init__(self, num_queries, num_classes=3, in_channels=128, embed_dims=256, score_thr=0.1, num_points=20, coord_dim=2, roi_size=(60, 30), different_heads=True, predict_refine=False, bev_pos=None, sync_cls_avg_factor=True, bg_cls_weight=0., trans_loss_weight=0.0, transformer=dict(), loss_cls=dict(), loss_reg=dict(), assigner=dict() ): super().__init__() self.num_queries = num_queries self.num_classes = num_classes self.in_channels = in_channels self.embed_dims = embed_dims self.different_heads = different_heads self.predict_refine = predict_refine self.bev_pos = bev_pos self.num_points = num_points self.coord_dim = coord_dim self.sync_cls_avg_factor = sync_cls_avg_factor self.bg_cls_weight = bg_cls_weight self.trans_loss_weight = trans_loss_weight # NOTE: below is a simple MLP to transform the query from prev-frame to cur-frame, # we moved the propagation part outside, self.register_buffer('roi_size', torch.tensor(roi_size, dtype=torch.float32)) origin = (-roi_size[0]/2, -roi_size[1]/2) self.register_buffer('origin', torch.tensor(origin, dtype=torch.float32)) sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.transformer = build_transformer(transformer) self.loss_cls = build_loss(loss_cls) self.loss_reg = build_loss(loss_reg) self.assigner = build_assigner(assigner) if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self._init_embedding() self._init_branch() self.init_weights() def init_weights(self): """Initialize weights of the DeformDETR head.""" for p in self.input_proj.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) xavier_init(self.reference_points_embed, distribution='uniform', bias=0.) 
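        # NOTE (editor): `reference_points_embed` (defined in `_init_embedding`) maps each
        # query embedding to num_points * 2 logits; forward_train/forward_test apply a
        # sigmoid to them to obtain the initial normalized polyline points that the
        # decoder layers then refine.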
self.transformer.init_weights() # init prediction branch for m in self.reg_branches: for param in m.parameters(): if param.dim() > 1: nn.init.xavier_uniform_(param) # focal loss init if self.loss_cls.use_sigmoid: bias_init = bias_init_with_prob(0.01) if isinstance(self.cls_branches, nn.ModuleList): for m in self.cls_branches: if hasattr(m, 'bias'): nn.init.constant_(m.bias, bias_init) else: m = self.cls_branches nn.init.constant_(m.bias, bias_init) if hasattr(self, 'query_alpha'): for m in self.query_alpha: for param in m.parameters(): if param.dim() > 1: nn.init.zeros_(param) def _init_embedding(self): positional_encoding = dict( type='SinePositionalEncoding', num_feats=self.embed_dims//2, normalize=True ) self.bev_pos_embed = build_positional_encoding(positional_encoding) # query_pos_embed & query_embed self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) self.reference_points_embed = nn.Linear(self.embed_dims, self.num_points * 2) def _init_branch(self,): """Initialize classification branch and regression branch of head.""" self.input_proj = Conv2d( self.in_channels, self.embed_dims, kernel_size=1) cls_branch = Linear(self.embed_dims, self.cls_out_channels) reg_branch = [ Linear(self.embed_dims, 2*self.embed_dims), nn.LayerNorm(2*self.embed_dims), nn.ReLU(), Linear(2*self.embed_dims, 2*self.embed_dims), nn.LayerNorm(2*self.embed_dims), nn.ReLU(), Linear(2*self.embed_dims, self.num_points * self.coord_dim), ] reg_branch = nn.Sequential(*reg_branch) num_layers = self.transformer.decoder.num_layers if self.different_heads: cls_branches = nn.ModuleList( [copy.deepcopy(cls_branch) for _ in range(num_layers)]) reg_branches = nn.ModuleList( [copy.deepcopy(reg_branch) for _ in range(num_layers)]) else: cls_branches = nn.ModuleList( [cls_branch for _ in range(num_layers)]) reg_branches = nn.ModuleList( [reg_branch for _ in range(num_layers)]) self.reg_branches = reg_branches self.cls_branches = cls_branches def _prepare_context(self, bev_features): """Prepare class label and vertex context.""" device = bev_features.device # Add 2D coordinate grid embedding B, C, H, W = bev_features.shape bev_mask = bev_features.new_zeros(B, H, W) bev_pos_embeddings = self.bev_pos_embed(bev_mask) # (bs, embed_dims, H, W) bev_features = self.input_proj(bev_features) + bev_pos_embeddings # (bs, embed_dims, H, W) assert list(bev_features.shape) == [B, self.embed_dims, H, W] return bev_features def forward_train(self, bev_features, img_metas, gts, track_query_info=None, memory_bank=None, return_matching=False): ''' Args: bev_feature (List[Tensor]): shape [B, C, H, W] feature in bev view Outs: preds_dict (list[dict]): lines (Tensor): Classification score of all decoder layers, has shape [bs, num_query, 2*num_points] scores (Tensor): [bs, num_query,] ''' bev_features = self._prepare_context(bev_features) bs, C, H, W = bev_features.shape img_masks = bev_features.new_zeros((bs, H, W)) # pos_embed = self.positional_encoding(img_masks) pos_embed = None query_embedding = self.query_embedding.weight[None, ...].repeat(bs, 1, 1) # [B, num_q, embed_dims] input_query_num = self.num_queries init_reference_points = self.reference_points_embed(query_embedding).sigmoid() # (bs, num_q, 2*num_pts) init_reference_points = init_reference_points.view(-1, self.num_queries, self.num_points, 2) # (bs, num_q, num_pts, 2) assert list(init_reference_points.shape) == [bs, self.num_queries, self.num_points, 2] assert list(query_embedding.shape) == [bs, self.num_queries, self.embed_dims] # Prepare the propagated track queries, 
concat with the original dummy queries if track_query_info is not None and 'track_query_hs_embeds' in track_query_info[0]: new_query_embeds = [] new_init_ref_pts = [] for b_i in range(bs): new_queries = torch.cat([track_query_info[b_i]['track_query_hs_embeds'], query_embedding[b_i], track_query_info[b_i]['pad_hs_embeds']], dim=0) new_query_embeds.append(new_queries) init_ref = rearrange(init_reference_points[b_i], 'n k c -> n (k c)', c=2) new_ref = torch.cat([track_query_info[b_i]['trans_track_query_boxes'], init_ref, track_query_info[b_i]['pad_query_boxes']], dim=0) new_ref = rearrange(new_ref, 'n (k c) -> n k c', c=2) new_init_ref_pts.append(new_ref) #print('length of track queries', track_query_info[b_i]['track_query_hs_embeds'].shape[0]) # concat to get the track+dummy queries query_embedding = torch.stack(new_query_embeds, dim=0) init_reference_points = torch.stack(new_init_ref_pts, dim=0) query_kp_mask = torch.stack([t['query_padding_mask'] for t in track_query_info], dim=0) else: query_kp_mask = query_embedding.new_zeros((bs, self.num_queries), dtype=torch.bool) # outs_dec: (num_layers, num_qs, bs, embed_dims) inter_queries, init_reference, inter_references = self.transformer( mlvl_feats=[bev_features,], mlvl_masks=[img_masks.type(torch.bool)], query_embed=query_embedding, mlvl_pos_embeds=[pos_embed], # not used memory_query=None, init_reference_points=init_reference_points, reg_branches=self.reg_branches, cls_branches=self.cls_branches, predict_refine=self.predict_refine, query_key_padding_mask=query_kp_mask, # mask used in self-attn, memory_bank=memory_bank, ) outputs = [] for i, (queries) in enumerate(inter_queries): reg_points = inter_references[i] # (bs, num_q, num_points, 2) bs = reg_points.shape[0] reg_points = reg_points.view(bs, -1, 2*self.num_points) # (bs, num_q, 2*num_points) scores = self.cls_branches[i](queries) # (bs, num_q, num_classes) reg_points_list = [] scores_list = [] for i in range(len(scores)): # padding queries should not be output reg_points_list.append(reg_points[i]) scores_list.append(scores[i]) pred_dict = { 'lines': reg_points_list, 'scores': scores_list } if return_matching: pred_dict['hs_embeds'] = queries outputs.append(pred_dict) # Pass in the track query information to massage the cost matrix loss_dict, det_match_idxs, det_match_gt_idxs, gt_info_list, matched_reg_cost = \ self.loss(gts=gts, preds=outputs, track_info=track_query_info) if return_matching: return loss_dict, outputs[-1], det_match_idxs[-1], det_match_gt_idxs[-1], matched_reg_cost[-1], gt_info_list[-1] else: return outputs, loss_dict, det_match_idxs, det_match_gt_idxs, gt_info_list def forward_test(self, bev_features, img_metas, track_query_info=None, memory_bank=None): ''' Args: bev_feature (List[Tensor]): shape [B, C, H, W] feature in bev view Outs: preds_dict (list[dict]): lines (Tensor): Classification score of all decoder layers, has shape [bs, num_query, 2*num_points] scores (Tensor): [bs, num_query,] ''' bev_features = self._prepare_context(bev_features) bs, C, H, W = bev_features.shape assert bs == 1, 'Only support bs=1 per-gpu for inference' img_masks = bev_features.new_zeros((bs, H, W)) # pos_embed = self.positional_encoding(img_masks) pos_embed = None query_embedding = self.query_embedding.weight[None, ...].repeat(bs, 1, 1) # [B, num_q, embed_dims] input_query_num = self.num_queries # num query: self.num_query + self.topk init_reference_points = self.reference_points_embed(query_embedding).sigmoid() # (bs, num_q, 2*num_pts) init_reference_points = 
init_reference_points.view(-1, self.num_queries, self.num_points, 2) # (bs, num_q, num_pts, 2) assert list(init_reference_points.shape) == [bs, input_query_num, self.num_points, 2] assert list(query_embedding.shape) == [bs, input_query_num, self.embed_dims] # Prepare the propagated track queries, concat with the original dummy queries if track_query_info is not None and 'track_query_hs_embeds' in track_query_info[0]: prev_hs_embed = torch.stack([t['track_query_hs_embeds'] for t in track_query_info]) prev_boxes = torch.stack([t['trans_track_query_boxes'] for t in track_query_info]) prev_boxes = rearrange(prev_boxes, 'b n (k c) -> b n k c', c=2) # concat to get the track+dummy queries query_embedding = torch.cat([prev_hs_embed, query_embedding], dim=1) init_reference_points = torch.cat([prev_boxes, init_reference_points], dim=1) query_kp_mask = query_embedding.new_zeros((bs, query_embedding.shape[1]), dtype=torch.bool) # outs_dec: (num_layers, num_qs, bs, embed_dims) inter_queries, init_reference, inter_references = self.transformer( mlvl_feats=[bev_features,], mlvl_masks=[img_masks.type(torch.bool)], query_embed=query_embedding, mlvl_pos_embeds=[pos_embed], # not used memory_query=None, init_reference_points=init_reference_points, reg_branches=self.reg_branches, cls_branches=self.cls_branches, predict_refine=self.predict_refine, query_key_padding_mask=query_kp_mask, # mask used in self-attn, memory_bank=memory_bank, ) outputs = [] for i_query, (queries) in enumerate(inter_queries): reg_points = inter_references[i_query] # (bs, num_q, num_points, 2) bs = reg_points.shape[0] reg_points = reg_points.view(bs, -1, 2*self.num_points) # (bs, num_q, 2*num_points) scores = self.cls_branches[i_query](queries) # (bs, num_q, num_classes) reg_points_list = [] scores_list = [] for i in range(len(scores)): # padding queries should not be output reg_points_list.append(reg_points[i]) scores_list.append(scores[i]) pred_dict = { 'lines': reg_points_list, 'scores': scores_list, 'hs_embeds': queries, } outputs.append(pred_dict) return outputs @force_fp32(apply_to=('score_pred', 'lines_pred', 'gt_lines')) def _get_target_single(self, score_pred, lines_pred, gt_labels, gt_lines, track_info=None, gt_bboxes_ignore=None): """ Compute regression and classification targets for one image. Outputs from a single decoder layer of a single feature level are used. Args: score_pred (Tensor): Box score logits from a single decoder layer for one image. Shape [num_query, cls_out_channels]. lines_pred (Tensor): shape [num_query, 2*num_points] gt_labels (torch.LongTensor) shape [num_gt, ] gt_lines (Tensor): shape [num_gt, 2*num_points]. Returns: tuple[Tensor]: a tuple containing the following for one sample. - labels (LongTensor): Labels of each image. shape [num_query, 1] - label_weights (Tensor]): Label weights of each image. shape [num_query, 1] - lines_target (Tensor): Lines targets of each image. shape [num_query, num_points, 2] - lines_weights (Tensor): Lines weights of each image. shape [num_query, num_points, 2] - pos_inds (Tensor): Sampled positive indices for each image. - neg_inds (Tensor): Sampled negative indices for each image. 
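            Note (editor): besides the entries above, the returned tuple also contains
            `pos_gt_inds` and `matched_reg_cost`; in addition, the assigner may return a
            per-(query, gt) permutation index, since a ground-truth polyline can match
            under several equivalent point orderings, and it is used below to pick the
            matched ordering when filling `lines_target`.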
""" num_pred_lines = len(lines_pred) # assigner and sampler # We massage the matching cost here using the track info, following # the 3-type supervision of TrackFormer/MOTR assign_result, gt_permute_idx, matched_reg_cost = self.assigner.assign(preds=dict(lines=lines_pred, scores=score_pred,), gts=dict(lines=gt_lines, labels=gt_labels, ), track_info=track_info, gt_bboxes_ignore=gt_bboxes_ignore) sampling_result = self.sampler.sample( assign_result, lines_pred, gt_lines) num_gt = len(gt_lines) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds pos_gt_inds = sampling_result.pos_assigned_gt_inds labels = gt_lines.new_full( (num_pred_lines, ), self.num_classes, dtype=torch.long) # (num_q, ) labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] label_weights = gt_lines.new_ones(num_pred_lines) # (num_q, ) lines_target = torch.zeros_like(lines_pred) # (num_q, 2*num_pts) lines_weights = torch.zeros_like(lines_pred) # (num_q, 2*num_pts) if num_gt > 0: if gt_permute_idx is not None: # using permute invariant label # gt_permute_idx: (num_q, num_gt) # pos_inds: which query is positive # pos_gt_inds: which gt each pos pred is assigned # single_matched_gt_permute_idx: which permute order is matched single_matched_gt_permute_idx = gt_permute_idx[ pos_inds, pos_gt_inds ] lines_target[pos_inds] = gt_lines[pos_gt_inds, single_matched_gt_permute_idx].type( lines_target.dtype) # (num_q, 2*num_pts) else: lines_target[pos_inds] = sampling_result.pos_gt_bboxes.type( lines_target.dtype) # (num_q, 2*num_pts) lines_weights[pos_inds] = 1.0 # (num_q, 2*num_pts) # normalization # n = lines_weights.sum(-1, keepdim=True) # (num_q, 1) # lines_weights = lines_weights / n.masked_fill(n == 0, 1) # (num_q, 2*num_pts) # [0, ..., 0] for neg ind and [1/npts, ..., 1/npts] for pos ind return (labels, label_weights, lines_target, lines_weights, pos_inds, neg_inds, pos_gt_inds, matched_reg_cost) # @force_fp32(apply_to=('preds', 'gts')) def get_targets(self, preds, gts, track_info=None, gt_bboxes_ignore_list=None): """ Compute regression and classification targets for a batch image. Outputs from a single decoder layer of a single feature level are used. Args: preds (dict): - lines (Tensor): shape (bs, num_queries, 2*num_points) - scores (Tensor): shape (bs, num_queries, num_class_channels) gts (dict): - class_label (list[Tensor]): tensor shape (num_gts, ) - lines (list[Tensor]): tensor shape (num_gts, 2*num_points) gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: tuple: a tuple containing the following targets. - labels_list (list[Tensor]): Labels for all images. - label_weights_list (list[Tensor]): Label weights for all \ images. - lines_targets_list (list[Tensor]): Lines targets for all \ images. - lines_weight_list (list[Tensor]): Lines weights for all \ images. - num_total_pos (int): Number of positive samples in all \ images. - num_total_neg (int): Number of negative samples in all \ images. """ assert gt_bboxes_ignore_list is None, \ 'Only supports for gt_bboxes_ignore setting to None.' 
# format the inputs gt_labels = gts['labels'] gt_lines = gts['lines'] lines_pred = preds['lines'] if track_info is None: track_info = [track_info for _ in range(len(gt_labels))] (labels_list, label_weights_list, lines_targets_list, lines_weights_list, pos_inds_list, neg_inds_list,pos_gt_inds_list, matched_reg_cost) = multi_apply( self._get_target_single, preds['scores'], lines_pred, gt_labels, gt_lines, track_info, gt_bboxes_ignore=gt_bboxes_ignore_list) num_total_pos = sum((inds.numel() for inds in pos_inds_list)) num_total_neg = sum((inds.numel() for inds in neg_inds_list)) if track_info[0] is not None: # remove the padding elements from the neg counting padding_mask = torch.cat([t['query_padding_mask'] for t in track_info], dim=0) num_padding = padding_mask.sum() num_total_neg -= num_padding new_gts = dict( labels=labels_list, # list[Tensor(num_q, )], length=bs label_weights=label_weights_list, # list[Tensor(num_q, )], length=bs, all ones lines=lines_targets_list, # list[Tensor(num_q, 2*num_pts)], length=bs lines_weights=lines_weights_list, # list[Tensor(num_q, 2*num_pts)], length=bs ) return new_gts, num_total_pos, num_total_neg, pos_inds_list, pos_gt_inds_list, matched_reg_cost # @force_fp32(apply_to=('preds', 'gts')) def loss_single(self, preds, gts, track_info=None, gt_bboxes_ignore_list=None, reduction='none'): """ Loss function for outputs from a single decoder layer of a single feature level. Args: preds (dict): - lines (Tensor): shape (bs, num_queries, 2*num_points) - scores (Tensor): shape (bs, num_queries, num_class_channels) gts (dict): - class_label (list[Tensor]): tensor shape (num_gts, ) - lines (list[Tensor]): tensor shape (num_gts, 2*num_points) gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. """ # Get target for each sample new_gts, num_total_pos, num_total_neg, pos_inds_list, pos_gt_inds_list, matched_reg_cost =\ self.get_targets(preds, gts, track_info, gt_bboxes_ignore_list) # Batched all data # for k, v in new_gts.items(): # new_gts[k] = torch.stack(v, dim=0) # tensor (bs, num_q, ...) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( preds['scores'][0].new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) if track_info is not None: cat_padding_mask = torch.cat([t['query_padding_mask'] for t in track_info], dim=0) padding_loss_mask = ~cat_padding_mask # Classification loss # since the inputs needs the second dim is the class dim, we permute the prediction. 
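        # NOTE: when track queries are present, padding_loss_mask (built above from the
        # per-sample query_padding_mask) is False at padded slots; it is multiplied into
        # both the classification weights and the line regression weights below, so the
        # padding queries added by _batchify_tracks contribute nothing to either loss.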
pred_scores = torch.cat(preds['scores'], dim=0) # (bs*num_q, cls_out_channles) cls_scores = pred_scores.reshape(-1, self.cls_out_channels) # (bs*num_q, cls_out_channels) cls_labels = torch.cat(new_gts['labels'], dim=0).reshape(-1) # (bs*num_q, ) cls_weights = torch.cat(new_gts['label_weights'], dim=0).reshape(-1) # (bs*num_q, ) if track_info is not None: cls_weights = cls_weights * padding_loss_mask.float() loss_cls = self.loss_cls( cls_scores, cls_labels, cls_weights, avg_factor=cls_avg_factor) # Compute the average number of gt boxes across all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() pred_lines = torch.cat(preds['lines'], dim=0) gt_lines = torch.cat(new_gts['lines'], dim=0) line_weights = torch.cat(new_gts['lines_weights'], dim=0) if track_info is not None: line_weights = line_weights * padding_loss_mask[:, None].float() assert len(pred_lines) == len(gt_lines) assert len(gt_lines) == len(line_weights) loss_reg = self.loss_reg( pred_lines, gt_lines, line_weights, avg_factor=num_total_pos) loss_dict = dict( cls=loss_cls, reg=loss_reg, ) new_gts_info = { 'labels': new_gts['labels'], 'lines': new_gts['lines'], } return loss_dict, pos_inds_list, pos_gt_inds_list, matched_reg_cost, new_gts_info @force_fp32(apply_to=('gt_lines_list', 'preds_dicts')) def loss(self, gts, preds, gt_bboxes_ignore=None, track_info=None, reduction='mean', ): """ Loss Function. Args: gts (list[dict]): list length: num_layers dict { 'label': list[tensor(num_gts, )], list length: batchsize, 'line': list[tensor(num_gts, 2*num_points)], list length: batchsize, ... } preds (list[dict]): list length: num_layers dict { 'lines': tensor(bs, num_queries, 2*num_points), 'scores': tensor(bs, num_queries, class_out_channels), } gt_bboxes_ignore (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert gt_bboxes_ignore is None, \ f'{self.__class__.__name__} only supports ' \ f'for gt_bboxes_ignore setting to None.' 
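        # NOTE: gts (and hence track_info below) carries one copy per decoder layer, so
        # multi_apply runs loss_single once per layer. The per-layer dicts are then
        # flattened into a single loss_dict: the last decoder layer keeps the plain keys
        # and earlier layers get an index prefix, roughly
        #
        #     {'cls': ..., 'reg': ...,          # last decoder layer
        #      'd0.cls': ..., 'd0.reg': ...,    # first decoder layer
        #      'd1.cls': ..., 'd1.reg': ...}    # and so on
        #
        # (key names taken from loss_single above; any extra keys follow the same rule).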
track_info = [track_info for _ in range(len(gts))] # Since there might have multi layer losses, pos_inds_lists, pos_gt_inds_lists, matched_reg_costs, gt_info_list = multi_apply( self.loss_single, preds, gts, track_info, reduction=reduction) # Format the losses loss_dict = dict() # loss from the last decoder layer for k, v in losses[-1].items(): loss_dict[k] = v # Loss from other decoder layers num_dec_layer = 0 for loss in losses[:-1]: for k, v in loss.items(): loss_dict[f'd{num_dec_layer}.{k}'] = v num_dec_layer += 1 return loss_dict, pos_inds_lists, pos_gt_inds_lists, gt_info_list, matched_reg_costs def post_process(self, preds_dict, tokens, track_dict=None, thr=0.0): lines = preds_dict['lines'] # List[Tensor(num_queries, 2*num_points)] bs = len(lines) scores = preds_dict['scores'] # (bs, num_queries, 3) results = [] for i in range(bs): tmp_vectors = lines[i] # set up the prop_flags tmp_prop_flags = torch.zeros(tmp_vectors.shape[0]).bool() tmp_prop_flags[-100:] = 0 tmp_prop_flags[:-100] = 1 num_preds, num_points2 = tmp_vectors.shape tmp_vectors = tmp_vectors.view(num_preds, num_points2//2, 2) if self.loss_cls.use_sigmoid: tmp_scores, tmp_labels = scores[i].max(-1) tmp_scores = tmp_scores.sigmoid() pos = tmp_scores > thr else: assert self.num_classes + 1 == self.cls_out_channels tmp_scores, tmp_labels = scores[i].max(-1) bg_cls = self.cls_out_channels pos = tmp_labels != bg_cls tmp_vectors = tmp_vectors[pos] tmp_scores = tmp_scores[pos] tmp_labels = tmp_labels[pos] tmp_prop_flags = tmp_prop_flags[pos] if len(tmp_scores) == 0: single_result = { 'vectors': [], 'scores': [], 'labels': [], 'props': [], 'token': tokens[i] } else: single_result = { 'vectors': tmp_vectors.detach().cpu().numpy(), 'scores': tmp_scores.detach().cpu().numpy(), 'labels': tmp_labels.detach().cpu().numpy(), 'props': tmp_prop_flags.detach().cpu().numpy(), 'token': tokens[i] } # also save the tracking information for analyzing if track_dict is not None and len(track_dict['lines'])>0: tmp_track_scores = track_dict['scores'][i] tmp_track_vectors = track_dict['lines'][i] tmp_track_scores, tmp_track_labels = tmp_track_scores.max(-1) tmp_track_scores = tmp_track_scores.sigmoid() single_result['track_scores'] = tmp_track_scores.detach().cpu().numpy() single_result['track_vectors'] = tmp_track_vectors.detach().cpu().numpy() single_result['track_labels'] = tmp_track_labels.detach().cpu().numpy() else: single_result['track_scores'] = [] single_result['track_vectors'] = [] single_result['track_labels'] = [] results.append(single_result) return results def prepare_temporal_propagation(self, preds_dict, scene_name, local_idx, memory_bank=None, thr_track=0.1, thr_det=0.5): lines = preds_dict['lines'] # List[Tensor(num_queries, 2*num_points)] queries = preds_dict['hs_embeds'] bs = len(lines) assert bs == 1, 'now only support bs=1 for temporal-evolving inference' scores = preds_dict['scores'] # (bs, num_queries, 3) first_frame = local_idx == 0 tmp_vectors = lines[0] tmp_queries = queries[0] # focal loss if self.loss_cls.use_sigmoid: tmp_scores, tmp_labels = scores[0].max(-1) tmp_scores = tmp_scores.sigmoid() pos_track = tmp_scores[:-100] > thr_track pos_det = tmp_scores[-100:] > thr_det pos = torch.cat([pos_track, pos_det], dim=0) else: raise RuntimeError('The experiment uses sigmoid for cls outputs') pos_vectors = tmp_vectors[pos] pos_labels = tmp_labels[pos] pos_queries = tmp_queries[pos] pos_scores = tmp_scores[pos] if first_frame: global_ids = torch.arange(len(pos_vectors)) num_instance = len(pos_vectors) else: prop_ids = 
self.prop_info['global_ids'] prop_num_instance = self.prop_info['num_instance'] global_ids_track = prop_ids[pos_track] num_newborn = int(pos_det.sum()) global_ids_newborn = torch.arange(num_newborn) + prop_num_instance global_ids = torch.cat([global_ids_track, global_ids_newborn]) num_instance = prop_num_instance + num_newborn self.prop_info = { 'vectors': pos_vectors, 'queries': pos_queries, 'scores': pos_scores, 'labels': pos_labels, 'scene_name': scene_name, 'local_idx': local_idx, 'global_ids': global_ids, 'num_instance': num_instance, } if memory_bank is not None: if first_frame: num_tracks = 0 else: num_tracks = self.prop_active_tracks pos_out_inds = torch.where(pos)[0] prev_out = { 'hs_embeds': queries, 'scores': scores, } memory_bank.update_memory(0, first_frame, pos_out_inds, prev_out, num_tracks, local_idx, memory_bank.curr_t) self.prop_active_tracks = len(pos_out_inds) save_pos_results = { 'vectors': pos_vectors.cpu().numpy(), 'scores': pos_scores.cpu().numpy(), 'labels': pos_labels.cpu().numpy(), 'global_ids': global_ids.cpu().numpy(), 'scene_name': scene_name, 'local_idx': local_idx, 'num_instance': num_instance, } return save_pos_results def get_track_info(self, scene_name, local_idx): prop_info = self.prop_info assert prop_info['scene_name'] == scene_name and (prop_info['local_idx']+1 == local_idx or \ prop_info['local_idx'] == local_idx) vectors = prop_info['vectors'] queries = prop_info['queries'] device = queries.device target = {} target['track_query_hs_embeds'] = queries target['track_query_boxes'] = vectors track_info = [target, ] return track_info def get_self_iter_track_query(self, preds_dict): num_tracks = self.prop_active_tracks lines = preds_dict['lines'] # List[Tensor(num_queries, 2*num_points)] queries = preds_dict['hs_embeds'] bs = len(lines) assert bs == 1, 'now only support bs=1 for temporal-evolving inference' scores = preds_dict['scores'] # (bs, num_queries, 3) queries = queries[0][:num_tracks] vectors = lines[0][:num_tracks] target = {} target['track_query_hs_embeds'] = queries target['track_query_boxes'] = vectors track_info = [target, ] return track_info def clear_temporal_cache(self): self.prop_info = None def train(self, *args, **kwargs): super().train(*args, **kwargs) def eval(self): super().eval() def forward(self, *args, return_loss=True, **kwargs): if return_loss: return self.forward_train(*args, **kwargs) else: return self.forward_test(*args, **kwargs) ================================================ FILE: plugin/models/heads/MapSegHead.py ================================================ import copy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import Conv2d, Linear, build_activation_layer, bias_init_with_prob, xavier_init from mmcv.runner import force_fp32 from mmcv.cnn.bricks.transformer import build_positional_encoding from mmdet.models import build_loss from mmdet.models import HEADS from einops import repeat @HEADS.register_module(force=True) class MapSegHead(nn.Module): def __init__(self, num_classes=3, in_channels=256, embed_dims=256, bev_size=(100,50), canvas_size=(200,100), loss_seg=dict(), loss_dice=dict(), ): super().__init__() self.num_classes = num_classes self.in_channels = in_channels self.embed_dims = embed_dims self.bev_size = bev_size self.canvas_size = canvas_size self.loss_seg = build_loss(loss_seg) self.loss_dice = build_loss(loss_dice) if self.loss_seg.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 assert canvas_size[0] % 
bev_size[0] == 0, 'canvas size must be a multiple of the bev size' self.num_up_blocks = int(np.log2(canvas_size[0] // bev_size[0])) self.conv_in = nn.Conv2d(in_channels, embed_dims, kernel_size=3, padding=1, bias=False) self.relu = nn.ReLU(inplace=True) self.conv_mid_layers = nn.ModuleList([]) self.downsample_layers = nn.ModuleList([]) for _ in range(self.num_up_blocks): conv_mid = nn.Sequential( nn.Upsample(scale_factor=2, mode='nearest'), nn.Conv2d(embed_dims, embed_dims, kernel_size=3, padding=1), nn.ReLU(inplace=True), ) self.conv_mid_layers.append(conv_mid) self.downsample_layers.append(nn.Upsample(scale_factor=0.5, mode='bilinear')) self.conv_out = nn.Conv2d(embed_dims, self.cls_out_channels, kernel_size=1, padding=0) self.init_weights() def init_weights(self): if self.loss_seg.use_sigmoid: bias_init = bias_init_with_prob(0.01) m = self.conv_out nn.init.constant_(m.bias, bias_init) def forward_train(self, bev_features, gts, history_coords): x = self.relu(self.conv_in(bev_features)) for conv_mid in self.conv_mid_layers: x = conv_mid(x) preds = self.conv_out(x) seg_loss = self.loss_seg(preds, gts) dice_loss = self.loss_dice(preds, gts) # downsample the features to the original bev size seg_feats = x for downsample in self.downsample_layers: seg_feats = downsample(seg_feats) return preds, seg_feats, seg_loss, dice_loss def forward_test(self, bev_features): x = self.relu(self.conv_in(bev_features)) for conv_mid in self.conv_mid_layers: x = conv_mid(x) preds = self.conv_out(x) seg_feats = x for downsample in self.downsample_layers: seg_feats = downsample(seg_feats) return preds, seg_feats def train(self, *args, **kwargs): super().train(*args, **kwargs) def eval(self): super().eval() def forward(self, *args, return_loss=True, **kwargs): if return_loss: return self.forward_train(*args, **kwargs) else: return self.forward_test(*args, **kwargs) ================================================ FILE: plugin/models/heads/__init__.py ================================================ from .MapDetectorHead import MapDetectorHead from .MapSegHead import MapSegHead ================================================ FILE: plugin/models/heads/base_map_head.py ================================================ from abc import ABCMeta, abstractmethod import torch.nn as nn from mmcv.runner import auto_fp16 from mmcv.utils import print_log from mmdet.utils import get_root_logger class BaseMapHead(nn.Module, metaclass=ABCMeta): """Base class for mappers.""" def __init__(self): super(BaseMapHead, self).__init__() self.fp16_enabled = False def init_weights(self, pretrained=None): """Initialize the weights in detector. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. 
""" if pretrained is not None: logger = get_root_logger() print_log(f'load model from: {pretrained}', logger=logger) @auto_fp16(apply_to=('img', )) def forward(self, *args, **kwargs): pass @abstractmethod def loss(self, pred, gt): ''' Compute loss Output: dict( loss: torch.Tensor log_vars: dict( str: float, ) num_samples: int ) ''' return @abstractmethod def post_process(self, pred): ''' convert model predictions to vectorized outputs the output format should be consistent with the evaluation function ''' return ================================================ FILE: plugin/models/losses/__init__.py ================================================ from .detr_loss import LinesL1Loss, MasksLoss, LenLoss from .seg_loss import MaskFocalLoss, MaskDiceLoss ================================================ FILE: plugin/models/losses/detr_loss.py ================================================ import torch from torch import nn as nn from torch.nn import functional as F from mmdet.models.losses import l1_loss, smooth_l1_loss from mmdet.models.losses.utils import weighted_loss import mmcv from mmdet.models.builder import LOSSES @LOSSES.register_module() class LinesL1Loss(nn.Module): def __init__(self, reduction='mean', loss_weight=1.0, beta=0.5): """ L1 loss. The same as the smooth L1 loss Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. """ super().__init__() self.reduction = reduction self.loss_weight = loss_weight self.beta = beta def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (torch.Tensor): The prediction. shape: [bs, ...] target (torch.Tensor): The learning target of the prediction. shape: [bs, ...] weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. it's useful when the predictions are not all valid. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if self.beta > 0: loss = smooth_l1_loss( pred, target, weight, reduction=reduction, avg_factor=avg_factor, beta=self.beta) else: loss = l1_loss( pred, target, weight, reduction=reduction, avg_factor=avg_factor) num_points = pred.shape[-1] // 2 loss = loss / num_points return loss*self.loss_weight @mmcv.jit(derivate=True, coderize=True) @weighted_loss def bce(pred, label, class_weight=None): """ pred: B,nquery,npts label: B,nquery,npts """ if label.numel() == 0: return pred.sum() * 0 assert pred.size() == label.size() loss = F.binary_cross_entropy_with_logits( pred, label.float(), pos_weight=class_weight, reduction='none') return loss @LOSSES.register_module() class MasksLoss(nn.Module): def __init__(self, reduction='mean', loss_weight=1.0): super(MasksLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. 
Args: pred (torch.Tensor): mask logits with shape [bs, num_query, num_points]. target (torch.Tensor): binary mask targets with the same shape as pred. weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss = bce(pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss*self.loss_weight @mmcv.jit(derivate=True, coderize=True) @weighted_loss def ce(pred, label, class_weight=None): """ pred: B*nquery,npts label: B*nquery, """ if label.numel() == 0: return pred.sum() * 0 loss = F.cross_entropy( pred, label, weight=class_weight, reduction='none') return loss @LOSSES.register_module() class LenLoss(nn.Module): def __init__(self, reduction='mean', loss_weight=1.0): super(LenLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (torch.Tensor): logits with shape [bs*num_query, num_points]. target (torch.LongTensor): target indices with shape [bs*num_query, ]. weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss = ce(pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss*self.loss_weight ================================================ FILE: plugin/models/losses/seg_loss.py ================================================ import torch from torch import nn as nn from torch.nn import functional as F import mmcv from mmdet.models.builder import LOSSES from mmdet.models.losses import FocalLoss, weight_reduce_loss from einops import rearrange def py_sigmoid_focal_loss(pred, target, weight=None, gamma=2.0, alpha=0.25, reduction='mean', avg_factor=None): """PyTorch version of `Focal Loss <https://arxiv.org/abs/1708.02002>`_. Args: pred (torch.Tensor): The prediction with shape (N, C), C is the number of classes target (torch.Tensor): The learning label of the prediction. weight (torch.Tensor, optional): Sample-wise loss weight. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. alpha (float, optional): A balanced form for Focal Loss. Defaults to 0.25. reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. """ pred_sigmoid = pred.sigmoid() target = target.type_as(pred) pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) focal_weight = (alpha * target + (1 - alpha) * (1 - target)) * pt.pow(gamma) loss = F.binary_cross_entropy_with_logits( pred, target, reduction='none') * focal_weight if weight is not None: if weight.shape != loss.shape: if weight.size(0) == loss.size(0): # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.view(-1, 1) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class).
assert weight.numel() == loss.numel() weight = weight.view(loss.size(0), -1) assert weight.ndim == loss.ndim loss = weight_reduce_loss(loss, weight, reduction, avg_factor) return loss @LOSSES.register_module() class MaskFocalLoss(FocalLoss): def __init__(self,**kwargs): super(MaskFocalLoss, self).__init__(**kwargs) def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if not self.use_sigmoid: raise NotImplementedError num_classes = pred.size(1) loss = 0 for index in range(num_classes): loss += self.loss_weight * py_sigmoid_focal_loss( pred[:,index], target[:,index], weight, gamma=self.gamma, alpha=self.alpha, reduction=reduction, avg_factor=avg_factor) loss /= num_classes return loss * self.loss_weight @LOSSES.register_module() class MaskDiceLoss(nn.Module): """Dice Loss PyTorch Created by: Zhang Shuai Email: shuaizzz666@gmail.com dice_loss = 1 - 2*p*t / (p^2 + t^2). p and t represent predict and target. Args: weight: An array of shape [C,] predict: A float32 tensor of shape [N, C, *], for Semantic segmentation task is [N, C, H, W] target: A int64 tensor of shape [N, *], for Semantic segmentation task is [N, H, W] Return: diceloss """ def __init__(self, loss_weight): super(MaskDiceLoss, self).__init__() self.smooth = 1e-5 self.loss_weight = loss_weight def forward(self, pred, target): bs, num_classes = pred.shape[:2] pred = rearrange(pred, 'b n h w -> b n (h w)') target = rearrange(target, 'b n h w -> b n (h w)') pred = pred.sigmoid() intersection = torch.sum(pred * target, dim=2) # (N, C) union = torch.sum(pred.pow(2), dim=2) + torch.sum(target, dim=2) # (N, C) ## p^2 + t^2 >= 2*p*t, target_onehot^2 == target_onehot dice_coef = (2 * intersection + self.smooth) / (union + self.smooth) # (N, C) dice_loss = 1 - torch.mean(dice_coef) # 1 loss = self.loss_weight * dice_loss return loss ================================================ FILE: plugin/models/mapers/MapTracker.py ================================================ """ MapTracker main module, adapted from StreamMapNet """ import numpy as np import torch import torch.nn as nn from mmdet3d.models.builder import (build_backbone, build_head) from .base_mapper import BaseMapper, MAPPERS from ..utils.query_update import MotionMLP from copy import deepcopy from mmdet.core import multi_apply from einops import rearrange, repeat from scipy.spatial.transform import Rotation as R from .vector_memory import VectorInstanceMemory @MAPPERS.register_module() class MapTracker(BaseMapper): def __init__(self, bev_h, bev_w, roi_size, backbone_cfg=dict(), head_cfg=dict(), neck_cfg=None, seg_cfg=None, model_name=None, pretrained=None, history_steps=None, test_time_history_steps=None, mem_select_dist_ranges=[0,0,0,0], skip_vector_head=False, freeze_bev=False, freeze_bev_iters=None, track_fp_aug=True, use_memory=False, mem_len=None, mem_warmup_iters=-1, **kwargs): super().__init__() #Attribute self.model_name = model_name self.last_epoch = None self.backbone = build_backbone(backbone_cfg) if neck_cfg is not None: self.neck = build_head(neck_cfg) else: self.neck = nn.Identity() self.head = build_head(head_cfg) self.num_decoder_layers = self.head.transformer.decoder.num_layers self.skip_vector_head = skip_vector_head self.freeze_bev = freeze_bev # whether freeze bev related parameters self.freeze_bev_iters = freeze_bev_iters # whether freeze bev related parameters self.track_fp_aug = track_fp_aug 
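        # NOTE: use_memory and mem_warmup_iters (stored below) gate the vector instance
        # memory: during training the memory bank is only consulted once
        # self.num_iter > self.mem_warmup_iters (see forward_train), so the first
        # iterations behave like the memory-free model.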
self.use_memory = use_memory self.mem_warmup_iters = mem_warmup_iters # the track query propagation module, using relative pose c_dim = 7 # quaternion for rotation (4) + translation (3) self.query_propagate = MotionMLP(c_dim=c_dim, f_dim=self.head.embed_dims, identity=True) # BEV semantic seg head self.seg_decoder = build_head(seg_cfg) # BEV self.bev_h = bev_h self.bev_w = bev_w self.roi_size = roi_size self.history_steps = history_steps self.mem_len = mem_len # Set up test time memory selection hyper-parameters if test_time_history_steps is None: self.test_time_history_steps = history_steps else: self.test_time_history_steps = test_time_history_steps self.mem_select_dist_ranges = mem_select_dist_ranges # vector instance memory module if self.use_memory: self.memory_bank = VectorInstanceMemory( dim_in=head_cfg.embed_dims, number_ins=head_cfg.num_queries, bank_size=mem_len, mem_len=mem_len, mem_select_dist_ranges=self.mem_select_dist_ranges, ) xmin, xmax = -roi_size[0]/2, roi_size[0]/2 ymin, ymax = -roi_size[1]/2, roi_size[1]/2 x = torch.linspace(xmin, xmax, bev_w) y = torch.linspace(ymax, ymin, bev_h) y, x = torch.meshgrid(y, x) z = torch.zeros_like(x) ones = torch.ones_like(x) plane = torch.stack([x, y, z, ones], dim=-1) self.register_buffer('plane', plane.double()) self.init_weights(pretrained) def init_weights(self, pretrained=None): """Initialize model weights.""" if pretrained: import logging logger = logging.getLogger() from mmcv.runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) else: try: self.neck.init_weights() except AttributeError: pass def temporal_propagate(self, curr_bev_feats, img_metas, all_history_curr2prev, all_history_prev2curr, use_memory, track_query_info=None, timestep=None, get_trans_loss=False): ''' Args: curr_bev_feat: torch.Tensor of shape [B, neck_input_channels, H, W] img_metas: current image metas (List of #bs samples) bev_memory: where to load and store (training and testing use different buffer) pose_memory: where to load and store (training and testing use different buffer) Out: fused_bev_feat: torch.Tensor of shape [B, neck_input_channels, H, W] ''' bs = curr_bev_feats.size(0) if get_trans_loss: # init the trans_loss related variables here trans_reg_loss = curr_bev_feats.new_zeros((1,)) trans_cls_loss = curr_bev_feats.new_zeros((1,)) back_trans_reg_loss = curr_bev_feats.new_zeros((1,)) back_trans_cls_loss = curr_bev_feats.new_zeros((1,)) num_pos = 0 num_tracks = 0 if use_memory: self.memory_bank.clear_dict() for b_i in range(bs): curr_e2g_trans = self.plane.new_tensor(img_metas[b_i]['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = self.plane.new_tensor(img_metas[b_i]['ego2global_rotation'], dtype=torch.float64) if use_memory: self.memory_bank.curr_rot[b_i] = curr_e2g_rot self.memory_bank.curr_trans[b_i] = curr_e2g_trans if self.memory_bank.curr_t > 0: self.memory_bank.trans_memory_bank(self.query_propagate, b_i, img_metas[b_i]) # transform the track queries if track_query_info is not None: history_curr2prev_matrix = all_history_curr2prev[b_i] history_prev2curr_matrix = all_history_prev2curr[b_i] track_pts = track_query_info[b_i]['track_query_boxes'].clone() track_pts = rearrange(track_pts, 'n (k c) -> n k c', c=2) # from (0, 1) to (-30, 30) or (-15, 15), prep for transform track_pts = self._denorm_lines(track_pts) # Transform the track ref-points using relative pose between prev and curr N, num_points = track_pts.shape[0], track_pts.shape[1] track_pts = torch.cat([ track_pts, track_pts.new_zeros((N, 
num_points, 1)), # z-axis track_pts.new_ones((N, num_points, 1)) # 4-th dim ], dim=-1) # (num_prop, num_pts, 4) pose_matrix = history_prev2curr_matrix[-1].float()[:3] rot_mat = pose_matrix[:, :3].cpu().numpy() rot = R.from_matrix(rot_mat) translation = pose_matrix[:, 3] trans_matrix = history_prev2curr_matrix[-1].clone() # Add training-time perturbation here for the transformation matrix if self.training: rot, translation = self.add_noise_to_pose(rot, translation) trans_matrix[:3, :3] = torch.tensor(rot.as_matrix()).to(trans_matrix.device) trans_matrix[:3, 3] = torch.tensor(translation).to(trans_matrix.device) trans_track_pts = torch.einsum('lk,ijk->ijl', trans_matrix, track_pts.double()).float() trans_track_pts = trans_track_pts[..., :2] trans_track_pts = self._norm_lines(trans_track_pts) trans_track_pts = torch.clip(trans_track_pts, min=0., max=1.) trans_track_pts = rearrange(trans_track_pts, 'n k c -> n (k c)', c=2) track_query_info[b_i]['trans_track_query_boxes'] = trans_track_pts prop_q = track_query_info[b_i]['track_query_hs_embeds'] rot_quat = torch.tensor(rot.as_quat()).float().to(pose_matrix.device) pose_info = torch.cat([rot_quat.view(-1), translation], dim=0) track_query_updated = self.query_propagate( prop_q, # (topk, embed_dims) pose_info.repeat(len(prop_q), 1) ) # Do not let future-frame loss backprop through the track queries track_query_info[b_i]['track_query_hs_embeds'] = track_query_updated.clone().detach() if get_trans_loss: pred = self.head.reg_branches[-1](track_query_updated).sigmoid() # (num_prop, 2*num_pts) pred_scores = self.head.cls_branches[-1](track_query_updated) assert list(pred.shape) == [N, 2*num_points] gt_pts = track_query_info[b_i]['track_query_gt_lines'].clone() gt_labels = track_query_info[b_i]['track_query_gt_labels'].clone() weights = gt_pts.new_ones((N, 2*num_points)) weights_labels = gt_labels.new_ones((N,)) bg_idx = gt_labels == 3 num_pos = num_pos + (N - bg_idx.sum()) num_tracks += len(gt_labels) weights[bg_idx, :] = 0.0 gt_pts = rearrange(gt_pts, 'n (k c) -> n k c', c=2) denormed_targets = self._denorm_lines(gt_pts) denormed_targets = torch.cat([ denormed_targets, denormed_targets.new_zeros((N, num_points, 1)), # z-axis denormed_targets.new_ones((N, num_points, 1)) # 4-th dim ], dim=-1) # (num_prop, num_pts, 4) assert list(denormed_targets.shape) == [N, num_points, 4] curr_targets = torch.einsum('lk,ijk->ijl', trans_matrix.float(), denormed_targets) curr_targets = curr_targets[..., :2] normed_targets = self._norm_lines(curr_targets) normed_targets = rearrange(normed_targets, 'n k c -> n (k c)', c=2) # set the weight of invalid normed targets to 0 (outside current bev frame) invalid_bev_mask = (normed_targets <= 0) | (normed_targets>=1) weights[invalid_bev_mask] = 0 # (num_prop, 2*num_pts) trans_reg_loss += self.head.loss_reg(pred, normed_targets, weights, avg_factor=1.0) if len(gt_labels) > 0: trans_score = self.head.loss_cls(pred_scores, gt_labels, weights_labels, avg_factor=1.0) else: trans_score = 0.0 trans_cls_loss += trans_score # backward trans loss pose_matrix_inv = torch.inverse(trans_matrix).float()[:3] rot_mat_inv = pose_matrix_inv[:, :3].cpu().numpy() rot_inv = R.from_matrix(rot_mat_inv) rot_quat_inv = torch.tensor(rot_inv.as_quat()).float().to(pose_matrix_inv.device) translation_inv = pose_matrix_inv[:, 3] pose_info_inv = torch.cat([rot_quat_inv.view(-1), translation_inv], dim=0) track_query_backtrans = self.query_propagate( track_query_updated, # (topk, embed_dims) pose_info_inv.repeat(len(prop_q), 1) ) pred_backtrans = 
self.head.reg_branches[-1](track_query_backtrans).sigmoid() # (num_prop, 2*num_pts) pred_scores_backtrans = self.head.cls_branches[-1](track_query_backtrans) prev_gt_pts = track_query_info[b_i]['track_query_gt_lines'] back_trans_reg_loss += self.head.loss_reg(pred_backtrans, prev_gt_pts, weights, avg_factor=1.0) if len(gt_labels) > 0: trans_score_bak = self.head.loss_cls(pred_scores_backtrans, gt_labels, weights_labels, avg_factor=1.0) else: trans_score_bak = 0.0 back_trans_cls_loss += trans_score_bak if get_trans_loss: trans_loss = self.head.trans_loss_weight * (trans_reg_loss / (num_pos + 1e-10) + trans_cls_loss / (num_tracks + 1e-10)) back_trans_loss = self.head.trans_loss_weight * (back_trans_reg_loss / (num_pos + 1e-10) + back_trans_cls_loss / (num_tracks + 1e-10)) trans_loss_dict = { 'f_trans': trans_loss, 'b_trans': back_trans_loss, } return trans_loss_dict def add_noise_to_pose(self, rot, trans): rot_euler = rot.as_euler('zxy') # 0.08 mean is around 5-degree, 3-sigma is 15-degree noise_euler = np.random.randn(*list(rot_euler.shape)) * 0.08 rot_euler += noise_euler noisy_rot = R.from_euler('zxy', rot_euler) # error within 0.25 meter noise_trans = torch.randn_like(trans) * 0.25 noise_trans[2] = 0 noisy_trans = trans + noise_trans return noisy_rot, noisy_trans def process_history_info(self, img_metas, history_img_metas): bs = len(img_metas) all_history_curr2prev = [] all_history_prev2curr = [] all_history_coord = [] if len(history_img_metas) == 0: return all_history_curr2prev, all_history_prev2curr, all_history_coord for b_i in range(bs): history_e2g_trans = torch.stack([self.plane.new_tensor(prev[b_i]['ego2global_translation'], dtype=torch.float64) for prev in history_img_metas], dim=0) history_e2g_rot = torch.stack([self.plane.new_tensor(prev[b_i]['ego2global_rotation'], dtype=torch.float64) for prev in history_img_metas], dim=0) curr_e2g_trans = self.plane.new_tensor(img_metas[b_i]['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = self.plane.new_tensor(img_metas[b_i]['ego2global_rotation'], dtype=torch.float64) # Do the coords transformation for all features in the history buffer ## Prepare the transformation matrix history_g2e_matrix = torch.stack([torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device),]*len(history_e2g_trans), dim=0) history_g2e_matrix[:, :3, :3] = torch.transpose(history_e2g_rot, 1, 2) history_g2e_matrix[:, :3, 3] = -torch.bmm(torch.transpose(history_e2g_rot, 1, 2), history_e2g_trans[..., None]).squeeze(-1) curr_g2e_matrix = torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) curr_e2g_matrix = torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device) curr_e2g_matrix[:3, :3] = curr_e2g_rot curr_e2g_matrix[:3, 3] = curr_e2g_trans history_e2g_matrix = torch.stack([torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device),]*len(history_e2g_trans), dim=0) history_e2g_matrix[:, :3, :3] = history_e2g_rot history_e2g_matrix[:, :3, 3] = history_e2g_trans history_curr2prev_matrix = torch.bmm(history_g2e_matrix, repeat(curr_e2g_matrix,'n1 n2 -> r n1 n2', r=len(history_g2e_matrix))) history_prev2curr_matrix = torch.bmm(repeat(curr_g2e_matrix, 'n1 n2 -> r n1 n2', r=len(history_e2g_matrix)), history_e2g_matrix) history_coord = torch.einsum('nlk,ijk->nijl', history_curr2prev_matrix, self.plane).float()[..., :2] # from (-30, 30) or (-15, 15) to (-1, 1) history_coord[..., 0] = history_coord[..., 0] / (self.roi_size[0]/2) 
history_coord[..., 1] = -history_coord[..., 1] / (self.roi_size[1]/2) all_history_curr2prev.append(history_curr2prev_matrix) all_history_prev2curr.append(history_prev2curr_matrix) all_history_coord.append(history_coord) return all_history_curr2prev, all_history_prev2curr, all_history_coord def forward_train(self, img, vectors, semantic_mask, points=None, img_metas=None, all_prev_data=None, all_local2global_info=None, **kwargs): ''' Args: img: torch.Tensor of shape [B, N, 3, H, W] N: number of cams vectors: list[list[Tuple(lines, length, label)]] - lines: np.array of shape [num_points, 2]. - length: int - label: int len(vectors) = batch_size len(vectors[_b]) = num of lines in sample _b img_metas: img_metas['lidar2img']: [B, N, 4, 4] Out: loss, log_vars, num_sample ''' # prepare labels and images gts, img, img_metas, valid_idx, points = self.batch_data( vectors, img, img_metas, img.device, points) bs = img.shape[0] _use_memory = self.use_memory and self.num_iter > self.mem_warmup_iters if all_prev_data is not None: num_prev_frames = len(all_prev_data) all_gts_prev, all_img_prev, all_img_metas_prev, all_semantic_mask_prev = [], [], [], [] for prev_data in all_prev_data: gts_prev, img_prev, img_metas_prev, valid_idx_prev, _ = self.batch_data( prev_data['vectors'], prev_data['img'], prev_data['img_metas'], img.device ) all_gts_prev.append(gts_prev) all_img_prev.append(img_prev) all_img_metas_prev.append(img_metas_prev) all_semantic_mask_prev.append(prev_data['semantic_mask']) else: num_prev_frames = 0 assert points is None if self.skip_vector_head: backprop_backbone_ids = [0, num_prev_frames] # first and last frame train the backbone (bev pretrain) else: backprop_backbone_ids = [num_prev_frames, ] # only the last frame trains the backbone (all other settings) track_query_info = None all_loss_dict_prev = [] all_trans_loss = [] all_outputs_prev = [] self.tracked_query_length = {} if _use_memory: self.memory_bank.set_bank_size(self.mem_len) self.memory_bank.init_memory(bs=bs) # History records for bev features history_bev_feats = [] history_img_metas = [] gt_semantic = torch.flip(semantic_mask, [2,]) # Iterate through all prev frames for t in range(num_prev_frames): # Backbone for prev img_backbone_gradient = (t in backprop_backbone_ids) all_history_curr2prev, all_history_prev2curr, all_history_coord = \ self.process_history_info(all_img_metas_prev[t], history_img_metas) _bev_feats, mlvl_feats = self.backbone(all_img_prev[t], all_img_metas_prev[t], t, history_bev_feats, history_img_metas, all_history_coord, points=None, img_backbone_gradient=img_backbone_gradient) # Neck for prev bev_feats = self.neck(_bev_feats) if _use_memory: self.memory_bank.curr_t = t # Transform prev-frame feature & pts to curr frame if self.skip_vector_head or t == 0: self.temporal_propagate(bev_feats, all_img_metas_prev[t], all_history_curr2prev, all_history_prev2curr, _use_memory, track_query_info, timestep=t, get_trans_loss=False) else: trans_loss_dict = self.temporal_propagate(bev_feats, all_img_metas_prev[t], all_history_curr2prev, all_history_prev2curr, _use_memory, track_query_info, timestep=t, get_trans_loss=True) ######################################################## # Debugging use: visualize the first-frame track query. 
and the corresponding # ground-truth information # Do this for every timestep > 0 #self._viz_temporal_supervision(outputs_prev, track_query_info, gts_next[-1], gts_prev[-1], # gts_semantic_curr, gts_semantic_prev, img_metas_next, img_metas_prev, t) #import pdb; pdb.set_trace() ######################################################## img_metas_prev = all_img_metas_prev[t] img_metas_next = all_img_metas_prev[t+1] if t < num_prev_frames-1 else img_metas gts_prev = all_gts_prev[t] gts_next = all_gts_prev[t+1] if t!=num_prev_frames-1 else gts gts_semantic_prev = torch.flip(all_semantic_mask_prev[t], [2,]) gts_semantic_curr = torch.flip(all_semantic_mask_prev[t+1], [2,]) if t!=num_prev_frames-1 else gt_semantic local2global_prev = all_local2global_info[t] local2global_next = all_local2global_info[t+1] # Compute the semantic segmentation loss seg_preds, seg_feats, seg_loss, seg_dice_loss = self.seg_decoder(bev_feats, gts_semantic_prev, all_history_coord, return_loss=True) # Save the history history_bev_feats.append(bev_feats) history_img_metas.append(all_img_metas_prev[t]) if len(history_bev_feats) > self.history_steps: history_bev_feats.pop(0) history_img_metas.pop(0) if not self.skip_vector_head: # Prepare the two-frame instance matching info gt_cur2prev, gt_prev2cur = self.get_two_frame_matching(local2global_prev, local2global_next, gts_prev, gts_next) if t == 0: memory_bank = None else: memory_bank = self.memory_bank if _use_memory else None # 1). Compute the loss for prev frame # 2). Get the matching results for computing the track query to next frame loss_dict_prev, outputs_prev, prev_inds_list, prev_gt_inds_list, prev_matched_reg_cost, \ prev_gt_list = self.head( bev_features=bev_feats, img_metas=img_metas_prev, gts=gts_prev, track_query_info=track_query_info, memory_bank=memory_bank, return_loss=True, return_matching=True) all_outputs_prev.append(outputs_prev) if t > 0: all_trans_loss.append(trans_loss_dict) # Do the query prop and negative sampling, prepare the corrpespnding # updated G.T. labels. 
The prepared queries will be passed to the model, # and combind with the original queries inside the head model pos_th = 0.4 track_query_info = self.prepare_track_queries_and_targets(gts_next, prev_inds_list, prev_gt_inds_list, prev_matched_reg_cost, prev_gt_list, outputs_prev, gt_cur2prev, gt_prev2cur, img_metas_prev, _use_memory, pos_th=pos_th, timestep=t) else: loss_dict_prev = {} loss_dict_prev['seg'] = seg_loss loss_dict_prev['seg_dice'] = seg_dice_loss all_loss_dict_prev.append(loss_dict_prev) if _use_memory: self.memory_bank.curr_t = num_prev_frames # NOTE: we separate the last frame to be consistent with single-frame only setting) # Backbone for curr img_backbone_gradient = num_prev_frames in backprop_backbone_ids all_history_curr2prev, all_history_prev2curr, all_history_coord = self.process_history_info(img_metas, history_img_metas) _bev_feats, mlvl_feats = self.backbone(img, img_metas, num_prev_frames, history_bev_feats, history_img_metas, all_history_coord, points=None, img_backbone_gradient=img_backbone_gradient) # Neck for curr bev_feats = self.neck(_bev_feats) if self.skip_vector_head or num_prev_frames == 0: # Transform prev-frame feature & pts to curr frame using the relative pose assert track_query_info is None self.temporal_propagate(bev_feats, img_metas, all_history_curr2prev, all_history_prev2curr, _use_memory, track_query_info, timestep=num_prev_frames, get_trans_loss=False) else: trans_loss_dict = self.temporal_propagate(bev_feats, img_metas, all_history_curr2prev, all_history_prev2curr, _use_memory, track_query_info, timestep=num_prev_frames, get_trans_loss=True) all_trans_loss.append(trans_loss_dict) ######################################################## # Debugging use: visualize the first-frame track query. and the corresponding # ground-truth information # Do this for every timestep > 0 #assert num_prev_frames > 0 #self._viz_temporal_supervision(outputs_prev, track_query_info, gts_next[-1], gts_prev[-1], gt_semantic, # gts_semantic_prev, img_metas_next, img_metas_prev, timestep=num_prev_frames) #import pdb; pdb.set_trace() ######################################################## seg_preds, seg_feats, seg_loss, seg_dice_loss = self.seg_decoder(bev_feats, gt_semantic, all_history_coord, return_loss=True) if not self.skip_vector_head: memory_bank = self.memory_bank if _use_memory else None # 3. 
run the head again and compute the loss for the second frame preds_list, loss_dict, det_match_idxs, det_match_gt_idxs, gt_list = self.head( bev_features=bev_feats, img_metas=img_metas, gts=gts, track_query_info=track_query_info, memory_bank=memory_bank, return_loss=True) else: loss_dict = {} loss_dict['seg'] = seg_loss loss_dict['seg_dice'] = seg_dice_loss # format loss, average over all frames (2 frames for now) loss = 0 losses_t = [] for loss_dict_t in (all_loss_dict_prev + [loss_dict,]): loss_t = 0 for name, var in loss_dict_t.items(): loss_t = loss_t + var losses_t.append(loss_t) loss += loss_t for trans_loss_dict_t in all_trans_loss: trans_loss_t = trans_loss_dict_t['f_trans'] + trans_loss_dict_t['b_trans'] loss += trans_loss_t # update the log log_vars = {k: v.item() for k, v in loss_dict.items()} for t, loss_dict_t in enumerate(all_loss_dict_prev): log_vars_t = {k+'_t{}'.format(t): v.item() for k, v in loss_dict_t.items()} log_vars.update(log_vars_t) for t, loss_t in enumerate(losses_t): log_vars.update({'total_t{}'.format(t): loss_t.item()}) for t, trans_loss_dict_t in enumerate(all_trans_loss): log_vars_t = {k+'_t{}'.format(t): v.item() for k, v in trans_loss_dict_t.items()} log_vars.update(log_vars_t) log_vars.update({'total': loss.item()}) num_sample = img.size(0) return loss, log_vars, num_sample @torch.no_grad() def forward_test(self, img, points=None, img_metas=None, seq_info=None, **kwargs): ''' inference pipeline ''' assert img.shape[0] == 1, 'Only support bs=1 per-gpu for inference' tokens = [] for img_meta in img_metas: tokens.append(img_meta['token']) scene_name, local_idx, seq_length = seq_info[0] first_frame = (local_idx == 0) img_metas[0]['local_idx'] = local_idx if first_frame: if self.use_memory: self.memory_bank.set_bank_size(self.test_time_history_steps) #self.memory_bank.set_bank_size(self.mem_len) self.memory_bank.init_memory(bs=1) self.history_bev_feats_all = [] self.history_img_metas_all = [] if self.use_memory: self.memory_bank.curr_t = local_idx selected_mem_ids = self.select_memory_entries(self.history_img_metas_all, img_metas) history_img_metas = [self.history_img_metas_all[idx] for idx in selected_mem_ids] history_bev_feats = [self.history_bev_feats_all[idx] for idx in selected_mem_ids] all_history_curr2prev, all_history_prev2curr, all_history_coord = \ self.process_history_info(img_metas, history_img_metas) _bev_feats, mlvl_feats = self.backbone(img, img_metas, local_idx, history_bev_feats, history_img_metas, all_history_coord, points=points) img_shape = [_bev_feats.shape[2:] for i in range(_bev_feats.shape[0])] # Neck bev_feats = self.neck(_bev_feats) if self.skip_vector_head or first_frame: self.temporal_propagate(bev_feats, img_metas, all_history_curr2prev, \ all_history_prev2curr, self.use_memory, track_query_info=None) seg_preds, seg_feats = self.seg_decoder(bev_features=bev_feats, return_loss=False) if not self.skip_vector_head: preds_list = self.head(bev_feats, img_metas=img_metas, return_loss=False) track_dict = None else: # Using the saved prev-frame output to prepare the track query inputs track_query_info = self.head.get_track_info(scene_name, local_idx) # Transform prev-frame feature & pts to curr frame using the relative pose self.temporal_propagate(bev_feats, img_metas, all_history_curr2prev, all_history_prev2curr, self.use_memory, track_query_info) seg_preds, seg_feats = self.seg_decoder(bev_features=bev_feats, return_loss=False) # Run the vector map decoder with instance-level memory memory_bank = self.memory_bank if self.use_memory else 
None preds_list = self.head(bev_feats, img_metas=img_metas, track_query_info=track_query_info, memory_bank=memory_bank, return_loss=False) track_dict = self._process_track_query_info(track_query_info) if not self.skip_vector_head: # take predictions from the last layer preds_dict = preds_list[-1] else: preds_dict = None # Save the BEV and meta-info history self.history_bev_feats_all.append(bev_feats) self.history_img_metas_all.append(img_metas) if len(self.history_bev_feats_all) > self.test_time_history_steps: self.history_bev_feats_all.pop(0) self.history_img_metas_all.pop(0) if not self.skip_vector_head: memory_bank = self.memory_bank if self.use_memory else None thr_det = 0.4 if first_frame else 0.6 pos_results = self.head.prepare_temporal_propagation(preds_dict, scene_name, local_idx, memory_bank, thr_track=0.5, thr_det=thr_det) if not self.skip_vector_head: results_list = self.head.post_process(preds_dict, tokens, track_dict) results_list[0]['pos_results'] = pos_results results_list[0]['meta'] = img_metas[0] else: results_list = [{'vectors': [], 'scores': [], 'labels': [], 'props': [], 'token': token} for token in tokens] # Add the segmentation preds to the results to be saved for b_i in range(len(results_list)): tmp_scores, tmp_labels = seg_preds[b_i].max(0) tmp_scores = tmp_scores.sigmoid() preds_i = torch.zeros(tmp_labels.shape, dtype=torch.uint8).to(tmp_scores.device) pos_ids = tmp_scores >= 0.4 preds_i[pos_ids] = tmp_labels[pos_ids].type(torch.uint8) + 1 preds_i = preds_i.cpu().numpy() results_list[b_i]['semantic_mask'] = preds_i if 'token' not in results_list[b_i]: results_list[b_i]['token'] = tokens[b_i] return results_list def batch_data(self, vectors, imgs, img_metas, device, points=None): bs = len(vectors) # filter none vector's case num_gts = [] for idx in range(bs): num_gts.append(sum([len(v) for k, v in vectors[idx].items()])) valid_idx = [i for i in range(bs) if num_gts[i] > 0] assert len(valid_idx) == bs # make sure every sample has gts all_labels_list = [] all_lines_list = [] all_gt2local = [] all_local2gt = [] for idx in range(bs): labels = [] lines = [] gt2local = [] local2gt = {} for label, _lines in vectors[idx].items(): for _ins_id, _line in enumerate(_lines): labels.append(label) gt2local.append([label, _ins_id]) local2gt[(label, _ins_id)] = len(lines) if len(_line.shape) == 3: # permutation num_permute, num_points, coords_dim = _line.shape lines.append(torch.tensor(_line).reshape(num_permute, -1)) # (38, 40) elif len(_line.shape) == 2: lines.append(torch.tensor(_line).reshape(-1)) # (40, ) else: assert False all_labels_list.append(torch.tensor(labels, dtype=torch.long).to(device)) all_lines_list.append(torch.stack(lines).float().to(device)) all_gt2local.append(gt2local) all_local2gt.append(local2gt) gts = { 'labels': all_labels_list, 'lines': all_lines_list, 'gt2local': all_gt2local, 'local2gt': all_local2gt, } gts = [deepcopy(gts) for _ in range(self.num_decoder_layers)] return gts, imgs, img_metas, valid_idx, points def get_two_frame_matching(self, local2global_prev, local2global_curr, gts_prev, gts): """ Get the G.T. matching between the two frames Terminology: (1). local --> local idx inside each category; (2). global --> global instance id inside category (3). gt --> index in the flattened G.T. sequence Args: prev_ins_ids (_type_): global ids (pre-prepared) for prev frame curr_ins_ids (_type_): global ids (pre-prepared) for curr frame gts_prev (_type_): processed G.T. for prev frame gts (_type_): processed G.T. 
for curr frame """ bs = len(local2global_prev) gt2local_curr = gts[-1]['gt2local'] # don't need the per-block supervision, just take one gt2local_prev = gts_prev[-1]['gt2local'] local2gt_prev = gts_prev[-1]['local2gt'] # the comma is to take the single-element output from multi_apply global2local_prev, = multi_apply(self._reverse_id_mapping, local2global_prev) all_gt_cur2prev, all_gt_prev2cur = multi_apply(self._compute_cur2prev, gt2local_curr, gt2local_prev, local2gt_prev, local2global_curr, global2local_prev) return all_gt_cur2prev, all_gt_prev2cur def _compute_cur2prev(self, gt2local_curr, gt2local_prev, local2gt_prev, local2global_curr, global2local_prev): cur2prev = torch.zeros(len(gt2local_curr)) prev2cur = torch.zeros(len(gt2local_prev)) prev2cur[:] = -1 for gt_idx_curr in range(len(gt2local_curr)): label = gt2local_curr[gt_idx_curr][0] local_idx = gt2local_curr[gt_idx_curr][1] seq_id = local2global_curr[label][local_idx] if seq_id in global2local_prev[label]: local_id_prev = global2local_prev[label][seq_id] gt_idx_prev = local2gt_prev[(label, local_id_prev)] else: gt_idx_prev = -1 cur2prev[gt_idx_curr] = gt_idx_prev if gt_idx_prev != -1: # there is a positive match in prev frame prev2cur[gt_idx_prev] = gt_idx_curr # update the information return cur2prev, prev2cur def _reverse_id_mapping(self, id_mapping): reversed_mapping = {} for label, mapping in id_mapping.items(): r_map = {v:k for k,v in mapping.items()} reversed_mapping[label] = r_map return reversed_mapping, def prepare_track_queries_and_targets(self, gts, prev_inds_list, prev_gt_inds_list, prev_matched_reg_cost, prev_gt_list, prev_out, gt_cur2prev, gt_prev2cur, metas_prev, use_memory, pos_th=0.4, timestep=None): bs = len(prev_inds_list) device = prev_out['lines'][0].device targets = [] for b_i in range(bs): results = {} for key, val in gts[-1].items(): results[key] = val[b_i] targets.append(results) # for each sample in the batch for b_i, (target, prev_out_ind, prev_target_ind) in enumerate(zip(targets, prev_inds_list, prev_gt_inds_list)): scene_seq_id = metas_prev[b_i]['local_idx'] scores = prev_out['scores'][b_i].detach() scores, labels = scores.max(-1) scores = scores.sigmoid() match_cost = prev_matched_reg_cost[b_i] target_prev2cur = gt_prev2cur[b_i].to(device) target['prev_target_ind'] = prev_target_ind # record the matched g.t. index target['prev_out_ind'] = prev_out_ind target['gt_prev2cur'] = target_prev2cur assert len(target_prev2cur) == len(prev_gt_inds_list[b_i]) # 1). filter the ones with low scores, create FN; prev_pos_scores = scores[prev_out_ind] score_filter_mask = prev_pos_scores >= pos_th keep_mask = score_filter_mask prev_out_ind_filtered = prev_out_ind[keep_mask] prev_target_ind_filtered = prev_target_ind[keep_mask] target_prev2cur = target_prev2cur[prev_target_ind_filtered] target_ind_matching = (target_prev2cur != -1) # -1 means no matching g.t. in curr frame # matched g.t. 
index in the current frame target_ind_matched_idx = target_prev2cur[target_prev2cur!=-1] target['track_query_match_ids'] = target_ind_matched_idx if timestep == 0: pad_bound = self.head.num_queries else: pad_bound = self.tracked_query_length[b_i] + self.head.num_queries not_prev_out_ind = torch.arange(prev_out['lines'][b_i].shape[0]).to(device) not_prev_out_ind = torch.tensor([ ind.item() for ind in not_prev_out_ind if ind not in prev_out_ind and ind < pad_bound]) # Get all non-matched pred with >0.5 conf score, serve as FP neg_scores = scores[not_prev_out_ind] neg_score_mask = neg_scores >= pos_th # Randomly pick 10% neg output instances and serve as FP _rand_insert = torch.rand([len(neg_scores)]).to(device) if self.track_fp_aug: rand_insert_mask = _rand_insert >= 0.95 fp_select_mask = neg_score_mask | rand_insert_mask else: fp_select_mask = neg_score_mask false_out_ind = not_prev_out_ind[fp_select_mask] prev_out_ind_final = torch.tensor(prev_out_ind_filtered.tolist() + false_out_ind.tolist()).long() target_ind_matching = torch.cat([ target_ind_matching, torch.tensor([False, ] * len(false_out_ind)).bool().to(device) ]) target_prev2cur_aug = torch.cat([ target_prev2cur, torch.tensor([-1, ] * len(false_out_ind)).to(device) ]) target['track_to_cur_gt_ids'] = target_prev2cur_aug # track query masks track_queries_mask = torch.ones_like(target_ind_matching).bool() track_queries_fal_pos_mask = torch.zeros_like(target_ind_matching).bool() track_queries_fal_pos_mask[~target_ind_matching] = True # set prev frame info target['track_query_hs_embeds'] = prev_out['hs_embeds'][b_i, prev_out_ind_final] target['track_query_boxes'] = prev_out['lines'][b_i][prev_out_ind_final].detach() tmp_labels = labels[prev_out_ind_final] tmp_scores = scores[prev_out_ind_final] target['track_query_labels'] = tmp_labels target['track_query_scores'] = tmp_scores # Prepare the G.T. 
line coords for the track queries, used in the transformation loss prev_gt_lines = prev_gt_list['lines'][b_i] prev_gt_labels = prev_gt_list['labels'][b_i] target['track_query_gt_lines'] = prev_gt_lines[prev_out_ind_final] target['track_query_gt_labels'] = prev_gt_labels[prev_out_ind_final] target['track_queries_mask'] = torch.cat([ track_queries_mask, torch.tensor([False, ] * self.head.num_queries).to(device) ]).bool() target['track_queries_fal_pos_mask'] = torch.cat([ track_queries_fal_pos_mask, torch.tensor([False, ] * self.head.num_queries).to(device) ]).bool() if use_memory: is_first_frame = (timestep == 0) num_tracks = 0 if timestep == 0 else self.tracked_query_length[b_i] self.memory_bank.update_memory(b_i, is_first_frame, prev_out_ind_final, prev_out, num_tracks, scene_seq_id, timestep) targets = self._batchify_tracks(targets) return targets def _batchify_tracks(self, targets): lengths = [len(t['track_queries_mask']) for t in targets] max_len = max(lengths) device = targets[0]['track_query_hs_embeds'].device for b_i in range(len(lengths)): target = targets[b_i] padding_len = max_len - lengths[b_i] pad_hs_embeds = torch.zeros([padding_len, target['track_query_hs_embeds'].shape[1]]).to(device) pad_query_boxes = torch.zeros([padding_len, target['track_query_boxes'].shape[1]]).to(device) query_padding_mask = torch.zeros([max_len]).bool().to(device) query_padding_mask[lengths[b_i]:] = True target['pad_hs_embeds'] = pad_hs_embeds target['pad_query_boxes'] = pad_query_boxes target['query_padding_mask'] = query_padding_mask self.tracked_query_length[b_i] = lengths[b_i] - self.head.num_queries return targets def train(self, *args, **kwargs): super().train(*args, **kwargs) if self.freeze_bev: self._freeze_bev() elif self.freeze_bev_iters is not None and self.num_iter < self.freeze_bev_iters: self._freeze_bev() else: self._unfreeze_bev() def eval(self): super().eval() def _freeze_bev(self,): """Freeze all bev-related backbone parameters, including the backbone and the seg head """ for param in self.backbone.parameters(): param.requires_grad = False for param in self.seg_decoder.parameters(): param.requires_grad = False def _unfreeze_bev(self,): """unfreeze all bev-related backbone parameters, including the backbone and the seg head """ for param in self.backbone.parameters(): param.requires_grad = True for param in self.seg_decoder.parameters(): param.requires_grad = True def _denorm_lines(self, line_pts): """from (0,1) to the BEV space in meters""" line_pts[..., 0] = line_pts[..., 0] * self.roi_size[0] \ - self.roi_size[0] / 2 line_pts[..., 1] = line_pts[..., 1] * self.roi_size[1] \ - self.roi_size[1] / 2 return line_pts def _norm_lines(self, line_pts): """from the BEV space in meters to (0,1) """ line_pts[..., 0] = (line_pts[..., 0] + self.roi_size[0] / 2) \ / self.roi_size[0] line_pts[..., 1] = (line_pts[..., 1] + self.roi_size[1] / 2) \ / self.roi_size[1] return line_pts def _process_track_query_info(self, track_info): bs = len(track_info) all_scores = [] all_lines = [] for b_i in range(bs): embeds = track_info[b_i]['track_query_hs_embeds'] scores = self.head.cls_branches[-1](embeds) coords = self.head.reg_branches[-1](embeds).sigmoid() coords = rearrange(coords, 'n1 (n2 n3) -> n1 n2 n3', n3=2) all_scores.append(scores) all_lines.append(coords) track_results = { 'lines': all_lines, 'scores': all_scores, } return track_results def select_memory_entries(self, history_metas, curr_meta): """ Only used at test time, to select a subset from the long history bank """ if len(history_metas) <= 
self.history_steps: return np.arange(len(history_metas)) else: history_e2g_trans = np.array([item[0]['ego2global_translation'] for item in history_metas])[:, :2] curr_e2g_trans = np.array(curr_meta[0]['ego2global_translation'])[:2] dists = np.linalg.norm(history_e2g_trans - curr_e2g_trans[None, :], axis=1) sorted_indices = np.argsort(dists) sorted_dists = dists[sorted_indices] covered = np.zeros_like(sorted_indices).astype(np.bool) selected_ids = [] for dist_range in self.mem_select_dist_ranges[::-1]: outter_valid_flags = (sorted_dists >= dist_range) & ~covered if outter_valid_flags.any(): pick_id = np.where(outter_valid_flags)[0][0] covered[pick_id:] = True else: inner_valid_flags = (sorted_dists < dist_range) & ~covered if inner_valid_flags.any(): pick_id = np.where(inner_valid_flags)[0][-1] covered[pick_id] = True else: return np.arange(len(history_metas))[-4:] selected_ids.append(pick_id) selected_mem_ids = sorted_indices[np.array(selected_ids)] return selected_mem_ids ##################################################################### # # Debugging visualization of the temporal propagation supervision # ##################################################################### def _viz_temporal_supervision(self, outputs_prev, all_track_info, gts, gts_prev, semantic_mask, semantic_mask_prev, img_metas, img_metas_prev, timestep): """For debugging use: draw the visualization of the track queries and the corresponding matched G.T. information...""" import os from ..utils.renderer_track import Renderer viz_dir = './viz/debug_noisy_trans' if not os.path.exists(viz_dir): os.makedirs(viz_dir) cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } renderer = Renderer(cat2id, self.roi_size, 'nusc') for b_i in range(len(all_track_info)): track_info = all_track_info[b_i] # prev pred info prev_pred_lines = outputs_prev['lines'][b_i] prev_pred_scores = outputs_prev['scores'][b_i] prev_target_inds = track_info['prev_target_ind'] prev_out_inds = track_info['prev_out_ind'] gt_prev2cur = track_info['gt_prev2cur'] prev_scores, prev_labels = prev_pred_scores.max(-1) prev_scores = prev_scores.sigmoid() prev_lines = rearrange(prev_pred_lines[prev_out_inds], 'n (k c) -> n k c', c=2) prev_labels = prev_labels[prev_out_inds] prev_lines = self._denorm_lines(prev_lines) prev_scores = prev_scores[prev_out_inds] out_path_prev = os.path.join(viz_dir, f't={timestep}_{b_i}_prev.png') renderer.render_bev_from_vectors(prev_lines, prev_labels, out_path_prev, id_info=prev_target_inds, score_info=prev_scores) # gt info gt_labels = gts['labels'][b_i] gt_lines = torch.clip(gts['lines'][b_i][:, 0], 0, 1) gt_lines = rearrange(gt_lines, 'n (k c) -> n k c', c=2) gt_lines = self._denorm_lines(gt_lines) out_path_gt = os.path.join(viz_dir, f't={timestep}_{b_i}_gt.png') gt_ids = np.arange(len(gt_lines)) renderer.render_bev_from_vectors(gt_lines, gt_labels, out_path_gt, id_info=gt_ids) gt_semantic = semantic_mask[b_i].cpu().numpy() out_path_gt_semantic = os.path.join(viz_dir, f't={timestep}_{b_i}_gt_semantic.png') renderer.render_bev_from_mask(gt_semantic, out_path_gt_semantic) # gt info for prev frame gt_labels = gts_prev['labels'][b_i] gt_lines = torch.clip(gts_prev['lines'][b_i][:, 0], 0, 1) gt_lines = rearrange(gt_lines, 'n (k c) -> n k c', c=2) gt_lines = self._denorm_lines(gt_lines) out_path_gt = os.path.join(viz_dir, f't={timestep}_{b_i}_prev_gt.png') gt_ids = np.arange(len(gt_lines)) renderer.render_bev_from_vectors(gt_lines, gt_labels, out_path_gt, id_info=gt_ids) gt_semantic = semantic_mask_prev[b_i].cpu().numpy() 
out_path_gt_semantic = os.path.join(viz_dir, f't={timestep}_{b_i}_prev_gt_semantic.png') renderer.render_bev_from_mask(gt_semantic, out_path_gt_semantic) # track query info track_to_cur_gt_ids = track_info['track_to_cur_gt_ids'] trans_track_lines = track_info['trans_track_query_boxes'] trans_track_lines = rearrange(trans_track_lines, 'n (k c) -> n k c', c=2) trans_track_lines = self._denorm_lines(trans_track_lines) #tp_track_mask = ~track_info['track_queries_fal_pos_mask'][:-100] trans_track_lines = trans_track_lines track_labels = track_info['track_query_labels'] track_scores = track_info['track_query_scores'] out_path_track = os.path.join(viz_dir, f't={timestep}_{b_i}_track.png') renderer.render_bev_from_vectors(trans_track_lines, track_labels, out_path_track, id_info=track_to_cur_gt_ids, score_info=track_scores) ================================================ FILE: plugin/models/mapers/__init__.py ================================================ from .MapTracker import MapTracker ================================================ FILE: plugin/models/mapers/base_mapper.py ================================================ from abc import ABCMeta, abstractmethod import torch.nn as nn from mmcv.runner import auto_fp16 from mmcv.utils import print_log from mmdet.utils import get_root_logger from mmdet3d.models.builder import DETECTORS MAPPERS = DETECTORS class BaseMapper(nn.Module, metaclass=ABCMeta): """Base class for mappers.""" def __init__(self): super(BaseMapper, self).__init__() self.fp16_enabled = False @property def with_neck(self): """bool: whether the detector has a neck""" return hasattr(self, 'neck') and self.neck is not None # TODO: these properties need to be carefully handled # for both single stage & two stage detectors @property def with_shared_head(self): """bool: whether the detector has a shared head in the RoI Head""" return hasattr(self, 'roi_head') and self.roi_head.with_shared_head @property def with_bbox(self): """bool: whether the detector has a bbox head""" return ((hasattr(self, 'roi_head') and self.roi_head.with_bbox) or (hasattr(self, 'bbox_head') and self.bbox_head is not None)) @property def with_mask(self): """bool: whether the detector has a mask head""" return ((hasattr(self, 'roi_head') and self.roi_head.with_mask) or (hasattr(self, 'mask_head') and self.mask_head is not None)) #@abstractmethod def extract_feat(self, imgs): """Extract features from images.""" pass def forward_train(self, *args, **kwargs): pass #@abstractmethod def simple_test(self, img, img_metas, **kwargs): pass #@abstractmethod def aug_test(self, imgs, img_metas, **kwargs): """Test function with test time augmentation.""" pass def init_weights(self, pretrained=None): """Initialize the weights in detector. Args: pretrained (str, optional): Path to pre-trained weights. Defaults to None. """ if pretrained is not None: logger = get_root_logger() print_log(f'load model from: {pretrained}', logger=logger) def forward_test(self, *args, **kwargs): """ Args: """ if True: self.simple_test() else: self.aug_test() # @auto_fp16(apply_to=('img', )) def forward(self, *args, return_loss=True, **kwargs): """Calls either :func:`forward_train` or :func:`forward_test` depending on whether ``return_loss`` is ``True``. Note this setting will change the expected inputs. When ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor and List[dict]), and when ``resturn_loss=False``, img and img_meta should be double nested (i.e. 
List[Tensor], List[List[dict]]), with the outer list indicating test time augmentations. """ if return_loss: return self.forward_train(*args, **kwargs) else: kwargs.pop('rescale') return self.forward_test(*args, **kwargs) def train_step(self, data_dict, optimizer): """The iteration step during training. This method defines an iteration step during training, except for the back propagation and optimizer updating, which are done in an optimizer hook. Note that in some complicated cases or models, the whole process including back propagation and optimizer updating is also defined in this method, such as GAN. Args: data_dict (dict): The output of dataloader. optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of runner is passed to ``train_step()``. This argument is unused and reserved. Returns: dict: It should contain at least 3 keys: ``loss``, ``log_vars``, \ ``num_samples``. - ``loss`` is a tensor for back propagation, which can be a \ weighted sum of multiple losses. - ``log_vars`` contains all the variables to be sent to the logger. - ``num_samples`` indicates the batch size (when the model is \ DDP, it means the batch size on each GPU), which is used for \ averaging the logs. """ loss, log_vars, num_samples = self(**data_dict) outputs = dict( loss=loss, log_vars=log_vars, num_samples=num_samples) return outputs def val_step(self, data, optimizer): """The iteration step during validation. This method shares the same signature as :func:`train_step`, but used during val epochs. Note that the evaluation after training epochs is not implemented with this method, but an evaluation hook. """ loss, log_vars, num_samples = self(**data) outputs = dict( loss=loss, log_vars=log_vars, num_samples=num_samples) return outputs def show_result(self, **kwargs): img = None return img ================================================ FILE: plugin/models/mapers/vector_memory.py ================================================ import torch from torch import nn from einops import repeat, rearrange from scipy.spatial.transform import Rotation as R import numpy as np def get_emb(sin_inp): """ Gets a base embedding for one dimension with sin and cos intertwined """ emb = torch.stack((sin_inp.sin(), sin_inp.cos()), dim=-1) return torch.flatten(emb, -2, -1) class PositionalEncoding1D(nn.Module): def __init__(self, channels): """ :param channels: The last dimension of the tensor you want to apply pos emb to. 
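            ``channels`` is rounded up to the nearest even number internally so
            that the sine and cosine components can be interleaved; the encoding
            returned by :func:`forward` is truncated back to the original
            channel count.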
""" super(PositionalEncoding1D, self).__init__() self.org_channels = channels channels = int(np.ceil(channels / 2) * 2) self.channels = channels inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels)) self.register_buffer("inv_freq", inv_freq) self.register_buffer("cached_penc", None) def forward(self, tensor): """ :param tensor: A 3d tensor of size (batch_size, x, ch) :return: Positional Encoding Matrix of size (batch_size, x, ch) """ if len(tensor.shape) != 3: raise RuntimeError("The input tensor has to be 3d!") if self.cached_penc is not None and self.cached_penc.shape == tensor.shape: return self.cached_penc self.cached_penc = None batch_size, x, orig_ch = tensor.shape pos_x = torch.arange(x, device=tensor.device).type(self.inv_freq.type()) sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq) emb_x = get_emb(sin_inp_x) emb = torch.zeros((x, self.channels), device=tensor.device).type(tensor.type()) emb[:, : self.channels] = emb_x self.cached_penc = emb[None, :, :orig_ch].repeat(batch_size, 1, 1) return self.cached_penc class VectorInstanceMemory(nn.Module): def __init__(self, dim_in, number_ins, bank_size, mem_len, mem_select_dist_ranges ): super().__init__() self.max_number_ins = 3 * number_ins # make sure this is not exceeded at initial training when results could be quite random self.bank_size = bank_size self.mem_len = mem_len self.dim_in = dim_in self.mem_select_dist_ranges = mem_select_dist_ranges p_enc_1d = PositionalEncoding1D(dim_in) fake_tensor = torch.zeros((1, 1000, dim_in)) # suppose all sequences are shorter than 1000 self.cached_pe = p_enc_1d(fake_tensor)[0] for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) def set_bank_size(self, bank_size): self.bank_size = bank_size def init_memory(self, bs): self.mem_bank = torch.zeros((self.bank_size, bs, self.max_number_ins, self.dim_in), dtype=torch.float32).cuda() self.mem_bank_seq_id = torch.zeros((self.bank_size, bs, self.max_number_ins), dtype=torch.long).cuda() self.mem_bank_trans = torch.zeros((self.bank_size, bs, 3),dtype=torch.float32).cuda() self.mem_bank_rot = torch.zeros((self.bank_size, bs, 3, 3),dtype=torch.float32).cuda() self.batch_mem_embeds_dict = {} self.batch_mem_relative_pe_dict = {} self.batch_key_padding_dict = {} self.curr_rot = torch.zeros((bs,3,3),dtype=torch.float32).cuda() self.curr_trans = torch.zeros((bs,3),dtype=torch.float32).cuda() self.gt_lines_info = {} # memory recording information self.instance2mem = [{} for _ in range(bs)] self.num_ins = [0 for _ in range(bs)] self.active_mem_ids = [None for _ in range(bs)] self.valid_track_idx = [None for _ in range(bs)] self.random_bev_masks = [None for _ in range(bs)] init_entry_length = torch.tensor([0]*self.max_number_ins).long() self.mem_entry_lengths = [init_entry_length.clone() for _ in range(bs)] def update_memory(self, batch_i, is_first_frame, propagated_ids, prev_out, num_tracks, seq_idx, timestep): if is_first_frame: mem_instance_ids = torch.arange(propagated_ids.shape[0]) track2mem_info = {i: i for i in range(len(propagated_ids))} num_instances = len(propagated_ids) else: track2mem_info_prev = self.instance2mem[batch_i] track2mem_info = {} num_instances = self.num_ins[batch_i] for pred_i, propagated_id in enumerate(propagated_ids): if propagated_id < num_tracks: # existing tracks track2mem_info[pred_i] = track2mem_info_prev[propagated_id.item()] else: # newborn instances track2mem_info[pred_i] = num_instances num_instances += 1 mem_instance_ids = torch.tensor([track2mem_info[item] for item in 
range(len(propagated_ids))]).long() assert num_instances < self.max_number_ins, 'Number of instances larger than mem size!' #NOTE: put information into the memory, need to detach the scores to block gradient backprop # from future time steps prev_embeddings = prev_out['hs_embeds'][batch_i] prev_scores = prev_out['scores'][batch_i] prev_scores, prev_labels = prev_scores.max(-1) prev_scores = prev_scores.sigmoid().detach() mem_lens_per_ins = self.mem_entry_lengths[batch_i][mem_instance_ids] # insert information into mem bank for ins_idx, mem_id in enumerate(mem_instance_ids): if mem_lens_per_ins[ins_idx] < self.bank_size: self.mem_bank[mem_lens_per_ins[ins_idx], batch_i, mem_id] = prev_embeddings[propagated_ids[ins_idx]] self.mem_bank_seq_id[mem_lens_per_ins[ins_idx], batch_i, mem_id] = seq_idx else: self.mem_bank[:self.bank_size-1, batch_i, mem_id] = self.mem_bank[1:self.bank_size, batch_i, mem_id] self.mem_bank[-1, batch_i, mem_id] = prev_embeddings[propagated_ids[ins_idx]] self.mem_bank_seq_id[:self.bank_size-1, batch_i, mem_id] = self.mem_bank_seq_id[1:self.bank_size, batch_i, mem_id] self.mem_bank_seq_id[-1, batch_i, mem_id] = seq_idx if self.curr_t < self.bank_size: self.mem_bank_rot[self.curr_t, batch_i] = self.curr_rot[batch_i] self.mem_bank_trans[self.curr_t, batch_i] = self.curr_trans[batch_i] else: self.mem_bank_rot[:self.bank_size-1, batch_i] = self.mem_bank_rot[1:, batch_i].clone() self.mem_bank_rot[-1, batch_i] = self.curr_rot[batch_i] self.mem_bank_trans[:self.bank_size-1, batch_i] = self.mem_bank_trans[1:, batch_i].clone() self.mem_bank_trans[-1, batch_i] = self.curr_trans[batch_i] # Update the mem recording information self.instance2mem[batch_i] = track2mem_info self.num_ins[batch_i] = num_instances self.mem_entry_lengths[batch_i][mem_instance_ids] += 1 self.active_mem_ids[batch_i] = mem_instance_ids.long().to(propagated_ids.device) active_mem_entry_lens = self.mem_entry_lengths[batch_i][self.active_mem_ids[batch_i]] self.valid_track_idx[batch_i] = torch.where(active_mem_entry_lens >= 1)[0] #print('Active memory ids:', self.active_mem_ids[batch_i]) #print('Memory entry lens:', active_mem_entry_lens) #print('Valid track idx:', self.valid_track_idx[batch_i]) def prepare_transformation_batch(self,history_e2g_trans,history_e2g_rot,curr_e2g_trans,curr_e2g_rot): history_g2e_matrix = torch.stack([torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device),]*len(history_e2g_trans), dim=0) history_g2e_matrix[:, :3, :3] = torch.transpose(history_e2g_rot, 1, 2) history_g2e_matrix[:, :3, 3] = -torch.bmm(torch.transpose(history_e2g_rot, 1, 2), history_e2g_trans[..., None]).squeeze(-1) curr_g2e_matrix = torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) curr_e2g_matrix = torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device) curr_e2g_matrix[:3, :3] = curr_e2g_rot curr_e2g_matrix[:3, 3] = curr_e2g_trans history_e2g_matrix = torch.stack([torch.eye(4, dtype=torch.float64, device=history_e2g_trans.device),]*len(history_e2g_trans), dim=0) history_e2g_matrix[:, :3, :3] = history_e2g_rot history_e2g_matrix[:, :3, 3] = history_e2g_trans history_curr2prev_matrix = torch.bmm(history_g2e_matrix, repeat(curr_e2g_matrix,'n1 n2 -> r n1 n2', r=len(history_g2e_matrix))) history_prev2curr_matrix = torch.bmm(repeat(curr_g2e_matrix, 'n1 n2 -> r n1 n2', r=len(history_e2g_matrix)), history_e2g_matrix) return history_curr2prev_matrix, history_prev2curr_matrix def 
clear_dict(self,): self.batch_mem_embeds_dict = {} self.batch_mem_relative_pe_dict = {} self.batch_key_padding_dict = {} def trans_memory_bank(self, query_prop, b_i, metas): seq_id = metas['local_idx'] active_mem_ids = self.active_mem_ids[b_i] mem_entry_lens = self.mem_entry_lengths[b_i][active_mem_ids] num_track_ins = len(active_mem_ids) valid_mem_len = min(self.curr_t, self.mem_len) valid_bank_size = min(self.curr_t, self.bank_size) mem_trans = self.mem_bank_trans[:, b_i] mem_rots = self.mem_bank_rot[:, b_i] if self.training: # Note: at training time, bank_size must be the same as mem_len, no selection needed assert self.mem_len == self.bank_size, 'at training time, bank_size must be the same as mem_len' mem_embeds = self.mem_bank[:, b_i, active_mem_ids] mem_seq_ids = self.mem_bank_seq_id[:, b_i, active_mem_ids] else: # at test time, the bank size can be much longer, and we need the selection strategy mem_embeds = torch.zeros_like(self.mem_bank[:self.mem_len, b_i, active_mem_ids]) mem_seq_ids = torch.zeros_like(self.mem_bank_seq_id[:self.mem_len, b_i, active_mem_ids]) # Put information into mem embeddings and pos_ids, prepare for attention-fusion # Also prepare the pose information for the query propagation all_pose_select_indices = [] all_select_indices = [] for idx, active_idx in enumerate(active_mem_ids): effective_len = mem_entry_lens[idx] valid_mem_trans = mem_trans[:valid_bank_size] trunc_eff_len = min(effective_len, self.bank_size) valid_pose_ids = torch.arange(valid_bank_size-trunc_eff_len, valid_bank_size) #print('ins {}, valid pose ids {}'.format(idx, valid_pose_ids)) if effective_len <= self.mem_len: select_indices = torch.arange(effective_len) else: select_indices = self.select_memory_entries(valid_mem_trans[-trunc_eff_len:], metas) pose_select_indices = valid_pose_ids[select_indices] mem_embeds[:len(select_indices), idx] = self.mem_bank[select_indices, b_i, active_idx] mem_seq_ids[:len(select_indices), idx] = self.mem_bank_seq_id[select_indices, b_i, active_idx] all_pose_select_indices.append(pose_select_indices) all_select_indices.append(select_indices) # prepare mem padding mask key_padding_mask = torch.ones((self.mem_len, num_track_ins)).bool().cuda() padding_trunc_loc = torch.clip(mem_entry_lens, max=self.mem_len) for ins_i in range(num_track_ins): key_padding_mask[:padding_trunc_loc[ins_i], ins_i] = False key_padding_mask = key_padding_mask.T # prepare relative seq idx gap relative_seq_idx = torch.zeros_like(mem_embeds[:,:,0]).long() relative_seq_idx[:valid_mem_len] = seq_id - mem_seq_ids[:valid_mem_len] relative_seq_pe = self.cached_pe[relative_seq_idx].to(mem_embeds.device) # prepare relative pose information for each active instance curr2prev_matrix, prev2curr_matrix = self.prepare_transformation_batch(mem_trans[:valid_bank_size], mem_rots[:valid_bank_size], self.curr_trans[b_i], self.curr_rot[b_i]) pose_matrix = prev2curr_matrix.float()[:,:3] rot_mat = pose_matrix[..., :3].cpu().numpy() rot = R.from_matrix(rot_mat) translation = pose_matrix[..., 3] if self.training: rot, translation = self.add_noise_to_pose(rot, translation) rot_quat = torch.tensor(rot.as_quat()).float().to(pose_matrix.device) pose_info = torch.cat([rot_quat, translation], dim=1) pose_info_per_ins = torch.zeros((valid_mem_len, num_track_ins, pose_info.shape[1])).to(pose_info.device) for ins_idx in range(num_track_ins): pose_select_indices = all_pose_select_indices[ins_idx] pose_info_per_ins[:len(pose_select_indices), ins_idx] = pose_info[pose_select_indices] mem_embeds_new = mem_embeds.clone() 
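        # Propagate every valid memory slot to the current frame: each stored
        # embedding is passed through the pose-conditioned query_prop module
        # together with the relative pose (quaternion + translation, optionally
        # noised during training) from the frame it was recorded at to the
        # current ego pose, then reshaped back to (time, instance, channel).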
mem_embeds_valid = rearrange(mem_embeds[:valid_mem_len], 't n c -> (t n) c') pose_info_per_ins = rearrange(pose_info_per_ins, 't n c -> (t n) c') mem_embeds_prop = query_prop( mem_embeds_valid, pose_info_per_ins ) mem_embeds_new[:valid_mem_len] = rearrange(mem_embeds_prop, '(t n) c -> t n c', t=valid_mem_len) self.batch_mem_embeds_dict[b_i] = mem_embeds_new.clone().detach() self.batch_mem_relative_pe_dict[b_i] = relative_seq_pe self.batch_key_padding_dict[b_i] = key_padding_mask def add_noise_to_pose(self, rot, trans): rot_euler = rot.as_euler('zxy') # 0.08 mean is around 5-degree, 3-sigma is 15-degree noise_euler = np.random.randn(*list(rot_euler.shape)) * 0.08 rot_euler += noise_euler noisy_rot = R.from_euler('zxy', rot_euler) # error within 0.25 meter noise_trans = torch.randn_like(trans) * 0.25 noise_trans[:, 2] = 0 noisy_trans = trans + noise_trans return noisy_rot, noisy_trans def select_memory_entries(self, mem_trans, curr_meta): history_e2g_trans = mem_trans[:, :2].cpu().numpy() curr_e2g_trans = np.array(curr_meta['ego2global_translation'][:2]) dists = np.linalg.norm(history_e2g_trans - curr_e2g_trans[None, :], axis=1) sorted_indices = np.argsort(dists) sorted_dists = dists[sorted_indices] covered = np.zeros_like(sorted_indices).astype(np.bool) selected_ids = [] for dist_range in self.mem_select_dist_ranges[::-1]: outter_valid_flags = (sorted_dists >= dist_range) & ~covered if outter_valid_flags.any(): pick_id = np.where(outter_valid_flags)[0][0] covered[pick_id:] = True else: inner_valid_flags = (sorted_dists < dist_range) & ~covered if inner_valid_flags.any(): pick_id = np.where(inner_valid_flags)[0][-1] covered[pick_id] = True else: # return the mem_len closest one, but in the order of far -> close return np.array(sorted_indices[:4][::-1]) selected_ids.append(pick_id) selected_mem_ids = sorted_indices[np.array(selected_ids)] return selected_mem_ids ================================================ FILE: plugin/models/necks/__init__.py ================================================ from .gru import ConvGRU ================================================ FILE: plugin/models/necks/gru.py ================================================ import torch import torch.nn as nn from mmdet.models import NECKS from mmcv.cnn.utils import kaiming_init, constant_init @NECKS.register_module() class ConvGRU(nn.Module): def __init__(self, out_channels): super(ConvGRU, self).__init__() kernel_size = 1 padding = kernel_size // 2 self.convz = nn.Conv2d(2*out_channels, out_channels, kernel_size=kernel_size, padding=padding, bias=False) self.convr = nn.Conv2d(2*out_channels, out_channels, kernel_size=kernel_size, padding=padding, bias=False) self.convq = nn.Conv2d(2*out_channels, out_channels, kernel_size=kernel_size, padding=padding, bias=False) self.ln = nn.LayerNorm(out_channels) self.zero_out = nn.Conv2d(out_channels, out_channels, 1, 1, bias=True) def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): kaiming_init(m) nn.init.zeros_(self.zero_out.weight) nn.init.zeros_(self.zero_out.bias) def forward(self, h, x): if len(h.shape) == 3: h = h.unsqueeze(0) if len(x.shape) == 3: x = x.unsqueeze(0) hx = torch.cat([h, x], dim=1) # [1, 2c, h, w] z = torch.sigmoid(self.convz(hx)) r = torch.sigmoid(self.convr(hx)) new_x = torch.cat([r * h, x], dim=1) # [1, 2c, h, w] q = self.convq(new_x) out = ((1 - z) * h + z * q) # (1, C, H, W) out = self.ln(out.permute(0, 2, 3, 1)).permute(0, 3, 1, 2).contiguous() out = self.zero_out(out) out = out + x out = out.squeeze(0) return out 
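# A minimal shape-check sketch for ConvGRU (illustrative only; the channel
# count and BEV resolution below are assumptions rather than values taken
# from the configs). The GRU fuses a warped previous BEV state `h` with the
# current BEV feature `x` of the same shape and returns a fused map.
if __name__ == '__main__':
    gru = ConvGRU(out_channels=256)
    gru.init_weights()
    h = torch.randn(256, 100, 50)   # previous / warped BEV state (C, H, W)
    x = torch.randn(256, 100, 50)   # current-frame BEV feature (C, H, W)
    fused = gru(h, x)
    assert fused.shape == (256, 100, 50)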
================================================
FILE: plugin/models/transformer_utils/CustomMSDeformableAttention.py
================================================
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------

from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import mmcv
import cv2 as cv
import copy
import warnings
from matplotlib import pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import (ATTENTION,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import TransformerLayerSequence
import math
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
                        to_2tuple)
from mmcv.utils import ext_loader
from mmcv.ops.multi_scale_deform_attn import (MultiScaleDeformableAttnFunction,
                                              multi_scale_deformable_attn_pytorch)
from .fp16_dattn import MultiScaleDeformableAttnFunctionFp32


@ATTENTION.register_module()
class CustomMSDeformableAttention(BaseModule):
    """An attention module used in Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
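        use_sampling_offsets (bool): Whether to predict learnable sampling
            offsets around the reference points; if False, the attention
            samples at the reference points directly. Default: True.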
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, im2col_step=64, dropout=0.1, use_sampling_offsets=True, batch_first=False, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.use_sampling_offsets = use_sampling_offsets if use_sampling_offsets: self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" if self.use_sampling_offsets: constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiScaleDeformableAttention') def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, flag='decoder', **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, num_points, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. 
With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) if self.use_sampling_offsets: sampling_offsets = self.sampling_offsets(query).view( bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) else: sampling_offsets = query.new_zeros((bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) # TODO: try remove sampling offsets offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) # changed to (h, w) _, _, num_points, _ = reference_points.shape # (bs, num_queries, num_pts, 2) -> # (bs, num_queries, num_heads, num_lvls, num_pts, 2) reference_points = reference_points[:, :, None, None, :, :] # reference_points[..., 1:2] = -reference_points[..., 1:2] sampling_locations = reference_points + \ (sampling_offsets # (bs, num_queries, num_heads, num_lvls, num_pts, 2) / offset_normalizer[None, None, None, :, None, :]) assert list(sampling_locations.shape) == [bs, num_query, self.num_heads, self.num_levels, num_points, 2] if torch.cuda.is_available() and value.is_cuda: # using fp16 deformable attention is unstable because it performs many sum operations output = MultiScaleDeformableAttnFunctionFp32.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) if not self.batch_first: # (num_query, bs ,embed_dims) output = output.permute(1, 0, 2) return self.dropout(output) + identity ================================================ FILE: plugin/models/transformer_utils/MapTransformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
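# Transformer components for the MapTracker vector decoder:
# MapTransformerDecoder_new iteratively refines per-query polyline reference
# points across layers, MapTransformerLayer augments the standard DETR layer
# with a second cross-attention into the vector instance memory bank, and
# MapTransformer flattens the BEV features and runs the decoder over them.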
import math import warnings import copy import torch import torch.nn as nn from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, TransformerLayerSequence, build_transformer_layer) from mmcv.runner.base_module import BaseModule, ModuleList from mmdet.models.utils.builder import TRANSFORMER from mmdet.models.utils.transformer import Transformer from .CustomMSDeformableAttention import CustomMSDeformableAttention from mmdet.models.utils.transformer import inverse_sigmoid @TRANSFORMER_LAYER_SEQUENCE.register_module() class MapTransformerDecoder_new(BaseModule): """Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, transformerlayers=None, num_layers=None, prop_add_stage=0, return_intermediate=True, init_cfg=None): super().__init__(init_cfg) if isinstance(transformerlayers, dict): transformerlayers = [ copy.deepcopy(transformerlayers) for _ in range(num_layers) ] else: assert isinstance(transformerlayers, list) and \ len(transformerlayers) == num_layers self.num_layers = num_layers self.layers = ModuleList() for i in range(num_layers): self.layers.append(build_transformer_layer(transformerlayers[i])) self.embed_dims = self.layers[0].embed_dims self.pre_norm = self.layers[0].pre_norm self.return_intermediate = return_intermediate self.prop_add_stage = prop_add_stage assert prop_add_stage >= 0 and prop_add_stage < num_layers def forward(self, query, key, value, query_pos, key_padding_mask, query_key_padding_mask, reference_points, spatial_shapes, level_start_index, reg_branches, cls_branches, predict_refine, memory_bank=None, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. reference_points (Tensor): The reference points of offset. has shape (bs, num_query, num_points, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) reg_branch: (obj:`nn.ModuleList`): Used for refining the regression results. Only would be passed when with_box_refine is True, otherwise would be passed a `None`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. 
""" num_queries, bs, embed_dims = query.shape output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): tmp = reference_points.clone() tmp[..., 1:2] = 1.0 - reference_points[..., 1:2] # reverse y-axis output = layer( output, key, value, query_pos=query_pos, key_padding_mask=key_padding_mask, reference_points=tmp, spatial_shapes=spatial_shapes, level_start_index=level_start_index, query_key_padding_mask=query_key_padding_mask, memory_bank=memory_bank, **kwargs) reg_points = reg_branches[lid](output.permute(1, 0, 2)) # (bs, num_q, 2*num_points) bs, num_queries, num_points2 = reg_points.shape reg_points = reg_points.view(bs, num_queries, num_points2//2, 2) # range (0, 1) if predict_refine: new_reference_points = reg_points + inverse_sigmoid( reference_points ) new_reference_points = new_reference_points.sigmoid() else: new_reference_points = reg_points.sigmoid() # (bs, num_q, num_points, 2) reference_points = new_reference_points.clone().detach() if self.return_intermediate: intermediate.append(output.permute(1, 0, 2)) # [(bs, num_q, embed_dims)] intermediate_reference_points.append(new_reference_points) # (bs, num_q, num_points, 2) if self.return_intermediate: return intermediate, intermediate_reference_points return output, reference_points @TRANSFORMER_LAYER.register_module() class MapTransformerLayer(BaseTransformerLayer): """Base `TransformerLayer` for vision transformer. It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. 
""" def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=False, **kwargs): super().__init__( attn_cfgs=attn_cfgs, ffn_cfgs=ffn_cfgs, operation_order=operation_order, norm_cfg=norm_cfg, init_cfg=init_cfg, batch_first=batch_first, **kwargs ) def forward(self, query, key=None, value=None, memory_query=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, memory_bank=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. """ if memory_bank is not None: bs = query.shape[1] all_valid_track_idx = [] for b_i in range(bs): all_valid_track_idx.append(memory_bank.valid_track_idx[b_i]) norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': if memory_query is None: temp_key = temp_value = query else: temp_key = temp_value = torch.cat([memory_query, query], dim=0) query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': if attn_index == 1: query_bev = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 else: # Memory cross attention assert attn_index == 2 if memory_bank is not None: bs = query.shape[1] query_i_list = [] for b_i in range(bs): valid_track_idx = all_valid_track_idx[b_i] query_i = query[:, b_i].clone() query_i = query_i[None,:] if len(valid_track_idx) != 0: mem_embeds = memory_bank.batch_mem_embeds_dict[b_i][:, valid_track_idx, :] 
mem_key_padding_mask = memory_bank.batch_key_padding_dict[b_i][valid_track_idx] mem_key_pos = memory_bank.batch_mem_relative_pe_dict[b_i][:, valid_track_idx] query_i[:, valid_track_idx] = self.attentions[attn_index]( query_i[:,valid_track_idx], mem_embeds, mem_embeds, identity=None, query_pos=None, key_pos=mem_key_pos, attn_mask=None, key_padding_mask=mem_key_padding_mask, **kwargs) query_i_list.append(query_i[0]) query_memory = torch.stack(query_i_list).permute(1, 0, 2) else: query_memory = torch.zeros_like(query_bev) query = query_memory + query_bev identity = query attn_index += 1 elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query @TRANSFORMER.register_module() class MapTransformer(Transformer): """Implements the DeformableDETR transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ def __init__(self, num_feature_levels=1, num_points=20, coord_dim=2, **kwargs): super().__init__(**kwargs) self.num_feature_levels = num_feature_levels self.embed_dims = self.encoder.embed_dims self.coord_dim = coord_dim self.num_points = num_points self.init_layers() def init_layers(self): """Initialize layers of the DeformableDetrTransformer.""" # self.level_embeds = nn.Parameter( # torch.Tensor(self.num_feature_levels, self.embed_dims)) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, CustomMSDeformableAttention): m.init_weights() def forward(self, mlvl_feats, mlvl_masks, query_embed, mlvl_pos_embeds, init_reference_points, reg_branches=None, cls_branches=None, memory_query=None, memory_bank=None, **kwargs): """Forward function for `Transformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, embed_dims, h, w]. mlvl_masks (list(Tensor)): The key_padding_mask from different level used for encoder and decoder, each element has shape [bs, h, w]. query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. mlvl_pos_embeds (list(Tensor)): The positional encoding of feats from different level, has the shape [bs, embed_dims, h, w]. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is True. Default to None. cls_branches (obj:`nn.ModuleList`): Classification heads for feature maps from each decoder layer. Only would be passed when `as_two_stage` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If return_intermediate_dec is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of \ proposals generated from \ encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. 
- enc_outputs_coord_unact: The regression results \ generated from encoder's feature maps., has shape \ (batch, h*w, 4). Only would \ be returned when `as_two_stage` is True, \ otherwise None. """ feat_flatten = [] mask_flatten = [] # lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (feat, mask, pos_embed) in enumerate( zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): bs, c, h, w = feat.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) feat = feat.flatten(2).transpose(1, 2) mask = mask.flatten(1) feat_flatten.append(feat) mask_flatten.append(mask) feat_flatten = torch.cat(feat_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) spatial_shapes = torch.as_tensor( spatial_shapes, dtype=torch.long, device=feat_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros( (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) # decoder query = query_embed.permute(1, 0, 2) # (num_q, bs, embed_dims) if memory_query is not None: memory_query = memory_query.permute(1, 0, 2) inter_states, inter_references = self.decoder( query=query, key=None, value=feat_flatten, query_pos=None, key_padding_mask=mask_flatten, reference_points=init_reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, reg_branches=reg_branches, cls_branches=cls_branches, memory_query=memory_query, memory_bank=memory_bank, **kwargs) return inter_states, init_reference_points, inter_references ================================================ FILE: plugin/models/transformer_utils/__init__.py ================================================ from .deformable_transformer import DeformableDetrTransformer_, DeformableDetrTransformerDecoder_ from .base_transformer import PlaceHolderEncoder from .CustomMSDeformableAttention import CustomMSDeformableAttention from .MapTransformer import MapTransformer, MapTransformerDecoder_new, MapTransformerLayer ================================================ FILE: plugin/models/transformer_utils/base_transformer.py ================================================ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init, constant_init from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import (MultiScaleDeformableAttention, TransformerLayerSequence, build_transformer_layer_sequence) from mmcv.runner.base_module import BaseModule from mmdet.models.utils.builder import TRANSFORMER @TRANSFORMER_LAYER_SEQUENCE.register_module() class PlaceHolderEncoder(nn.Module): def __init__(self, *args, embed_dims=None, **kwargs): super(PlaceHolderEncoder, self).__init__() self.embed_dims = embed_dims def forward(self, *args, query=None, **kwargs): return query ================================================ FILE: plugin/models/transformer_utils/deformable_transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
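# Deformable-DETR transformer utilities adapted from mmdet. In this plugin the
# encoder stage is bypassed (the call is commented out in `forward` below) and
# the flattened BEV features are fed to the decoder directly as the values.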
import math import warnings import torch import torch.nn as nn from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, TransformerLayerSequence, build_transformer_layer_sequence) from mmcv.runner.base_module import BaseModule from torch.nn.init import normal_ from mmdet.models.utils.builder import TRANSFORMER from mmdet.models.utils.transformer import Transformer try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention except ImportError: warnings.warn( '`MultiScaleDeformableAttention` in MMCV has been moved to ' '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV') from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention from .fp16_dattn import MultiScaleDeformableAttentionFp16 def inverse_sigmoid(x, eps=1e-5): """Inverse function of sigmoid. Args: x (Tensor): The tensor to do the inverse. eps (float): EPS avoid numerical overflow. Defaults 1e-5. Returns: Tensor: The x has passed the inverse function of sigmoid, has same shape with input. """ x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1 / x2) @TRANSFORMER_LAYER_SEQUENCE.register_module() class DeformableDetrTransformerDecoder_(TransformerLayerSequence): """Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, *args, return_intermediate=False, coord_dim=2, kp_coord_dim=2, **kwargs): super(DeformableDetrTransformerDecoder_, self).__init__(*args, **kwargs) self.return_intermediate = return_intermediate self.coord_dim = coord_dim self.kp_coord_dim = kp_coord_dim def forward(self, query, *args, reference_points=None, valid_ratios=None, reg_branches=None, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, otherwise has shape ((bs, num_query, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) reg_branch: (obj:`nn.ModuleList`): Used for refining the regression results. Only would be passed when with_box_refine is True, otherwise would be passed a `None`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. 
""" output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): reference_points_input = \ reference_points[:, :, None,:self.kp_coord_dim] * \ valid_ratios[:, None,:,:self.kp_coord_dim] # if reference_points.shape[-1] == 3 and self.kp_coord_dim==2: output = layer( output, *args, reference_points=reference_points_input[...,:self.kp_coord_dim], **kwargs) output = output.permute(1, 0, 2) if reg_branches is not None: tmp = reg_branches[lid](output) new_reference_points = tmp new_reference_points[..., :self.kp_coord_dim] = tmp[ ..., :self.kp_coord_dim] + inverse_sigmoid(reference_points) new_reference_points = new_reference_points.sigmoid() if reference_points.shape[-1] == 3 and self.kp_coord_dim==2: reference_points[...,-1] = tmp[...,-1].sigmoid().detach() reference_points[...,:self.coord_dim] = new_reference_points.detach() output = output.permute(1, 0, 2) if self.return_intermediate: intermediate.append(output) intermediate_reference_points.append(reference_points) if self.return_intermediate: return torch.stack(intermediate), torch.stack( intermediate_reference_points) return output, reference_points @TRANSFORMER.register_module() class DeformableDetrTransformer_(Transformer): """Implements the DeformableDETR transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ def __init__(self, as_two_stage=False, num_feature_levels=1, two_stage_num_proposals=300, coord_dim=2, **kwargs): super(DeformableDetrTransformer_, self).__init__(**kwargs) self.as_two_stage = as_two_stage self.num_feature_levels = num_feature_levels self.two_stage_num_proposals = two_stage_num_proposals self.embed_dims = self.encoder.embed_dims self.coord_dim = coord_dim self.init_layers() def init_layers(self): """Initialize layers of the DeformableDetrTransformer.""" self.level_embeds = nn.Parameter( torch.Tensor(self.num_feature_levels, self.embed_dims)) if self.as_two_stage: self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) self.enc_output_norm = nn.LayerNorm(self.embed_dims) self.pos_trans = nn.Linear(self.embed_dims * 2, self.embed_dims * 2) self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) else: self.reference_points_embed = nn.Linear(self.embed_dims, self.coord_dim) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, MultiScaleDeformableAttention): m.init_weights() elif isinstance(m,MultiScaleDeformableAttentionFp16): m.init_weights() if not self.as_two_stage: xavier_init(self.reference_points_embed, distribution='uniform', bias=0.) normal_(self.level_embeds) @staticmethod def get_reference_points(spatial_shapes, valid_ratios, device): """Get the reference points used in decoder. Args: spatial_shapes (Tensor): The shape of all feature maps, has shape (num_level, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) device (obj:`device`): The device where reference_points should be. Returns: Tensor: reference points used in decoder, has \ shape (bs, num_keys, num_levels, 2). 
""" reference_points_list = [] for lvl, (H, W) in enumerate(spatial_shapes): # TODO check this 0.5 ref_y, ref_x = torch.meshgrid( torch.linspace( 0.5, H - 0.5, H, dtype=torch.float32, device=device), torch.linspace( 0.5, W - 0.5, W, dtype=torch.float32, device=device)) ref_y = ref_y.reshape(-1)[None] / ( valid_ratios[:, None, lvl, 1] * H) ref_x = ref_x.reshape(-1)[None] / ( valid_ratios[:, None, lvl, 0] * W) ref = torch.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = torch.cat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points def get_valid_ratio(self, mask): """Get the valid radios of feature maps of all level.""" _, H, W = mask.shape valid_H = torch.sum(~mask[:, :, 0], 1) valid_W = torch.sum(~mask[:, 0, :], 1) valid_ratio_h = valid_H.float() / H valid_ratio_w = valid_W.float() / W valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) return valid_ratio def get_proposal_pos_embed(self, proposals, num_pos_feats=128, temperature=10000): """Get the position embedding of proposal.""" scale = 2 * math.pi dim_t = torch.arange( num_pos_feats, dtype=torch.float32, device=proposals.device) dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) # N, L, 4 proposals = proposals.sigmoid() * scale # N, L, 4, 128 pos = proposals[:, :, :, None] / dim_t # N, L, 4, 64, 2 pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) return pos def forward(self, mlvl_feats, mlvl_masks, query_embed, mlvl_pos_embeds, reg_branches=None, cls_branches=None, **kwargs): """Forward function for `Transformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, embed_dims, h, w]. mlvl_masks (list(Tensor)): The key_padding_mask from different level used for encoder and decoder, each element has shape [bs, h, w]. query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. mlvl_pos_embeds (list(Tensor)): The positional encoding of feats from different level, has the shape [bs, embed_dims, h, w]. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is True. Default to None. cls_branches (obj:`nn.ModuleList`): Classification heads for feature maps from each decoder layer. Only would be passed when `as_two_stage` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If return_intermediate_dec is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of \ proposals generated from \ encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. - enc_outputs_coord_unact: The regression results \ generated from encoder's feature maps., has shape \ (batch, h*w, 4). Only would \ be returned when `as_two_stage` is True, \ otherwise None. 
""" assert self.as_two_stage or query_embed is not None feat_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (feat, mask, pos_embed) in enumerate( zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): bs, c, h, w = feat.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) feat = feat.flatten(2).transpose(1, 2) mask = mask.flatten(1) pos_embed = pos_embed.flatten(2).transpose(1, 2) lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) lvl_pos_embed_flatten.append(lvl_pos_embed) feat_flatten.append(feat) mask_flatten.append(mask) feat_flatten = torch.cat(feat_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) spatial_shapes = torch.as_tensor( spatial_shapes, dtype=torch.long, device=feat_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros( (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) valid_ratios = torch.stack( [self.get_valid_ratio(m) for m in mlvl_masks], 1) # reference_points = \ # self.get_reference_points(spatial_shapes, # valid_ratios, # device=feat.device) feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) # lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( # 1, 0, 2) # (H*W, bs, embed_dims) # memory = self.encoder( # query=feat_flatten, # key=None, # value=None, # query_pos=lvl_pos_embed_flatten, # query_key_padding_mask=mask_flatten, # spatial_shapes=spatial_shapes, # reference_points=reference_points, # level_start_index=level_start_index, # valid_ratios=valid_ratios, # **kwargs) memory = feat_flatten.permute(1, 0, 2) bs, _, c = memory.shape query_pos, query = torch.split(query_embed, c, dim=-1) reference_points = self.reference_points_embed(query_pos).sigmoid() init_reference_out = reference_points # decoder query = query.permute(1, 0, 2) memory = memory.permute(1, 0, 2) query_pos = query_pos.permute(1, 0, 2) inter_states, inter_references = self.decoder( query=query, key=None, value=memory, query_pos=query_pos, key_padding_mask=mask_flatten, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, valid_ratios=valid_ratios, reg_branches=reg_branches, **kwargs) inter_references_out = inter_references return inter_states, init_reference_out, inter_references_out ================================================ FILE: plugin/models/transformer_utils/fp16_dattn.py ================================================ from turtle import forward import warnings try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention except ImportError: warnings.warn( '`MultiScaleDeformableAttention` in MMCV has been moved to ' '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV') from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention from mmcv.runner import force_fp32, auto_fp16 from mmcv.cnn.bricks.registry import ATTENTION from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.cnn.bricks.transformer import build_attention import math import warnings import torch import torch.nn as nn import torch.nn.functional as F from torch.autograd.function import Function, once_differentiable from mmcv import deprecated_api_warning from mmcv.cnn import constant_init, xavier_init from mmcv.cnn.bricks.registry import ATTENTION from mmcv.runner import BaseModule from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) from torch.cuda.amp import custom_bwd, custom_fwd 
@ATTENTION.register_module() class MultiScaleDeformableAttentionFp16(BaseModule): def __init__(self, attn_cfg=None,init_cfg=None,**kwarg): super(MultiScaleDeformableAttentionFp16,self).__init__(init_cfg) # import ipdb; ipdb.set_trace() self.deformable_attention = build_attention(attn_cfg) self.deformable_attention.init_weights() self.fp16_enabled = False @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points','identity')) def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): # import ipdb; ipdb.set_trace() return self.deformable_attention(query, key=key, value=value, identity=identity, query_pos=query_pos, key_padding_mask=key_padding_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index,**kwargs) class MultiScaleDeformableAttnFunctionFp32(Function): @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. """ value, value_spatial_shapes, value_level_start_index,\ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): """CPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). 
attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), Returns: Tensor: has shape (bs, num_queries, embed_dims) """ bs, _, num_heads, embed_dims = value.shape _, num_queries, num_heads, num_levels, num_points, _ =\ sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level, (H_, W_) in enumerate(value_spatial_shapes): # bs, H_*W_, num_heads, embed_dims -> # bs, H_*W_, num_heads*embed_dims -> # bs, num_heads*embed_dims, H_*W_ -> # bs*num_heads, embed_dims, H_, W_ value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( bs * num_heads, embed_dims, H_, W_) # bs, num_queries, num_heads, num_points, 2 -> # bs, num_heads, num_queries, num_points, 2 -> # bs*num_heads, num_queries, num_points, 2 sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) # bs*num_heads, embed_dims, num_queries, num_points sampling_value_l_ = F.grid_sample( value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (bs, num_queries, num_heads, num_levels, num_points) -> # (bs, num_heads, num_queries, num_levels, num_points) -> # (bs, num_heads, 1, num_queries, num_levels*num_points) attention_weights = attention_weights.transpose(1, 2).reshape( bs * num_heads, 1, num_queries, num_levels * num_points) output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(bs, num_heads * embed_dims, num_queries) return output.transpose(1, 2).contiguous() @ATTENTION.register_module() class MultiScaleDeformableAttentionFP32(BaseModule): """An attention module used in Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. 
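    Example:
        Minimal shape sketch (illustrative sizes; ``batch_first`` keeps its
        default ``False``, so queries are ``(num_query, bs, embed_dims)``):

        >>> attn = MultiScaleDeformableAttentionFP32(
        ...     embed_dims=256, num_heads=8, num_levels=1, num_points=4)
        >>> # query:            (num_query, bs, 256)
        >>> # value:            (num_key, bs, 256), with num_key == H * W
        >>> # reference_points: (bs, num_query, num_levels, 2), in [0, 1]
        >>> # spatial_shapes:   tensor([[H, W]]); level_start_index: tensor([0])
        >>> out = attn(query, value=value,
        ...            reference_points=reference_points,
        ...            spatial_shapes=spatial_shapes,
        ...            level_start_index=level_start_index)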
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, im2col_step=64, dropout=0.1, batch_first=False, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiScaleDeformableAttention') def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. 
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
             Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """

        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(bs, num_query,
                                                   self.num_heads,
                                                   self.num_levels,
                                                   self.num_points)
        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
        if torch.cuda.is_available():
            output = MultiScaleDeformableAttnFunctionFp32.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            # The pure-PyTorch fallback takes neither level_start_index nor
            # im2col_step (see its signature above).
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        output = self.output_proj(output)

        if not self.batch_first:
            # (num_query, bs, embed_dims)
            output = output.permute(1, 0, 2)

        return self.dropout(output) + identity



================================================
FILE: plugin/models/utils/__init__.py
================================================



================================================
FILE: plugin/models/utils/query_update.py
================================================
import math
import torch
import torch.nn as nn
import numpy as np

from mmcv.cnn import bias_init_with_prob, xavier_init


class Embedder:
    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.create_embedding_fn()

    def create_embedding_fn(self):
        embed_fns = []
        d = self.kwargs['input_dims']
        out_dim = 0
        if self.kwargs['include_input']:
            embed_fns.append(lambda x: x)
            out_dim += d

        max_freq = self.kwargs['max_freq_log2']
        N_freqs = self.kwargs['num_freqs']

        if self.kwargs['log_sampling']:
            freq_bands = 2.**torch.linspace(0., max_freq, steps=N_freqs)
        else:
            freq_bands = torch.linspace(2.**0., 2.**max_freq, steps=N_freqs)

        for freq in freq_bands:
            for p_fn in self.kwargs['periodic_fns']:
                embed_fns.append(lambda x, p_fn=p_fn, freq=freq: p_fn(x * freq))
                out_dim += d

        self.embed_fns = embed_fns
        self.out_dim = out_dim

    def embed(self, inputs):
        return torch.cat([fn(inputs) for fn in self.embed_fns], -1)


class MotionMLP(nn.Module):
    '''
    Args:
        c_dim (int): dimension of latent code c
        f_dim (int): feature dimension
    '''

    def __init__(self, c_dim, f_dim=512, identity=True):
super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.identity = identity multires = 10 embed_kwargs = { 'include_input' : True, 'input_dims' : c_dim, 'max_freq_log2' : multires-1, 'num_freqs' : multires, 'log_sampling' : True, 'periodic_fns' : [torch.sin, torch.cos], } self.pos_embedder = Embedder(**embed_kwargs) self.fc = nn.Sequential( nn.Linear(f_dim + self.pos_embedder.out_dim, 2*f_dim), nn.LayerNorm(2*f_dim), nn.ReLU(), nn.Linear(2*f_dim, f_dim) ) self.init_weights() def init_weights(self): for m in self.fc: for param in m.parameters(): if param.dim() > 1: nn.init.xavier_uniform_(param) def forward(self, x, pose_info): pose_embed = self.pos_embedder.embed(pose_info) xc = torch.cat([x, pose_embed], dim=-1) out = self.fc(xc) if self.identity: out = out + x return out ================================================ FILE: plugin/models/utils/renderer_track.py ================================================ import os.path as osp import os #import av2.geometry.interpolate as interp_utils import numpy as np import copy import cv2 import matplotlib.pyplot as plt from PIL import Image def remove_nan_values(uv): is_u_valid = np.logical_not(np.isnan(uv[:, 0])) is_v_valid = np.logical_not(np.isnan(uv[:, 1])) is_uv_valid = np.logical_and(is_u_valid, is_v_valid) uv_valid = uv[is_uv_valid] return uv_valid def points_ego2img(pts_ego, extrinsics, intrinsics): pts_ego_4d = np.concatenate([pts_ego, np.ones([len(pts_ego), 1])], axis=-1) pts_cam_4d = extrinsics @ pts_ego_4d.T uv = (intrinsics @ pts_cam_4d[:3, :]).T uv = remove_nan_values(uv) depth = uv[:, 2] uv = uv[:, :2] / uv[:, 2].reshape(-1, 1) return uv, depth def draw_polyline_ego_on_img(polyline_ego, img_bgr, extrinsics, intrinsics, color_bgr, thickness): if polyline_ego.shape[1] == 2: zeros = np.zeros((polyline_ego.shape[0], 1)) polyline_ego = np.concatenate([polyline_ego, zeros], axis=1) polyline_ego = interp_utils.interp_arc(t=500, points=polyline_ego) uv, depth = points_ego2img(polyline_ego, extrinsics, intrinsics) h, w, c = img_bgr.shape is_valid_x = np.logical_and(0 <= uv[:, 0], uv[:, 0] < w - 1) is_valid_y = np.logical_and(0 <= uv[:, 1], uv[:, 1] < h - 1) is_valid_z = depth > 0 is_valid_points = np.logical_and.reduce([is_valid_x, is_valid_y, is_valid_z]) if is_valid_points.sum() == 0: return uv = np.round(uv[is_valid_points]).astype(np.int32) draw_visible_polyline_cv2( copy.deepcopy(uv), valid_pts_bool=np.ones((len(uv), 1), dtype=bool), image=img_bgr, color=color_bgr, thickness_px=thickness, ) def draw_visible_polyline_cv2(line, valid_pts_bool, image, color, thickness_px): """Draw a polyline onto an image using given line segments. Args: line: Array of shape (K, 2) representing the coordinates of line. valid_pts_bool: Array of shape (K,) representing which polyline coordinates are valid for rendering. For example, if the coordinate is occluded, a user might specify that it is invalid. Line segments touching an invalid vertex will not be rendered. image: Array of shape (H, W, 3), representing a 3-channel BGR image color: Tuple of shape (3,) with a BGR format color thickness_px: thickness (in pixels) to use when rendering the polyline. 
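    Example:
        Minimal sketch with made-up values, drawing a three-point red
        (BGR ``(0, 0, 255)``) polyline onto a blank image:

        >>> image = np.zeros((100, 200, 3), dtype=np.uint8)
        >>> line = np.array([[10, 10], [50, 40], [120, 80]])
        >>> valid = np.ones(len(line), dtype=bool)
        >>> draw_visible_polyline_cv2(line, valid, image,
        ...                           color=(0, 0, 255), thickness_px=2)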
""" line = np.round(line).astype(int) # type: ignore for i in range(len(line) - 1): if (not valid_pts_bool[i]) or (not valid_pts_bool[i + 1]): continue x1 = line[i][0] y1 = line[i][1] x2 = line[i + 1][0] y2 = line[i + 1][1] # Use anti-aliasing (AA) for curves image = cv2.line(image, pt1=(x1, y1), pt2=(x2, y2), color=color, thickness=thickness_px, lineType=cv2.LINE_AA) COLOR_MAPS_BGR = { # bgr colors 'divider': (0, 0, 255), 'boundary': (0, 255, 0), 'ped_crossing': (255, 0, 0), 'centerline': (51, 183, 255), 'drivable_area': (171, 255, 255) } COLOR_MAPS_PLT = { 'divider': 'r', 'boundary': 'g', 'ped_crossing': 'b', 'centerline': 'orange', 'drivable_area': 'y', } CAM_NAMES_AV2 = ['ring_front_center', 'ring_front_right', 'ring_front_left', 'ring_rear_right','ring_rear_left', 'ring_side_right', 'ring_side_left', ] CAM_NAMES_NUSC = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT',] class Renderer(object): """Render map elements on image views. Args: cat2id (dict): category to class id roi_size (tuple): bev range dataset (str): 'av2' or 'nusc' """ def __init__(self, cat2id, roi_size, dataset='av2'): self.roi_size = roi_size self.cat2id = cat2id self.id2cat = {v: k for k, v in cat2id.items()} if dataset == 'av2': self.cam_names = CAM_NAMES_AV2 else: self.cam_names = CAM_NAMES_NUSC def render_bev_from_vectors(self, vectors, labels, out_path, id_info=None, score_info=None): '''Render bev segmentation using vectorized map elements. Args: vectors (list): list of vectorized map elements. labels (list): list of labels of map elements. out_dir (str): output directory ''' if id_info is not None: assert len(vectors) == len(id_info) if score_info is not None: assert len(vectors) == len(score_info) car_img = Image.open('resources/car.png') plt.figure(figsize=(self.roi_size[0], self.roi_size[1])) plt.xlim(-self.roi_size[0] / 2, self.roi_size[0] / 2) plt.ylim(-self.roi_size[1] / 2, self.roi_size[1] / 2) plt.axis('off') plt.imshow(car_img, extent=[-2.5, 2.5, -2.0, 2.0]) for idx in range(len(labels)): cat = self.id2cat[labels[idx].item()] color = COLOR_MAPS_PLT[cat] vector = vectors[idx].detach().cpu().numpy() pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) # plt.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1], angles='xy', color=color, # scale_units='xy', scale=1) # for i in range(len(x)): plt.plot(x, y, 'o-', color=color, linewidth=20, markersize=50) if id_info is not None: vec_id = int(id_info[idx]) mid_idx = len(x) // 2 if vec_id == -1: plt.text(x[mid_idx], y[mid_idx], 'FP', fontsize=100, color=color) else: plt.text(x[mid_idx], y[mid_idx], '{}'.format(vec_id), fontsize=100, color=color) if score_info is not None: mid_idx = len(x) // 2 plt.text(x[mid_idx]-1, y[mid_idx]+2, '{:.2f}'.format(score_info[idx]), fontsize=100, color='purple') plt.savefig(out_path, bbox_inches='tight', dpi=40) plt.close() def render_bev_from_mask(self, semantic_mask, out_path): '''Render bev segmentation from semantic_mask. Args: semantic_mask (array): semantic mask. 
out_dir (str): output directory ''' c, h, w = semantic_mask.shape bev_img = np.ones((3, h, w), dtype=np.uint8) * 255 if 'drivable_area' in self.cat2id: drivable_area_mask = semantic_mask[self.cat2id['drivable_area']] bev_img[:, drivable_area_mask == 1] = \ np.array(COLOR_MAPS_BGR['drivable_area']).reshape(3, 1) # NOTE: the semantic mask has been changed into instance masks for our use for label in range(c): cat = self.id2cat[label] if cat == 'drivable_area': continue valid = semantic_mask[label] == 1 bev_img[:, valid] = np.array(COLOR_MAPS_BGR[cat]).reshape(3, 1) #for label in range(c): # cat = self.id2cat[label] # if cat == 'drivable_area': # continue # mask = semantic_mask[label] # valid = mask == 1 # bev_img[:, valid] = np.array(COLOR_MAPS_BGR[cat]).reshape(3, 1) cv2.imwrite(out_path, bev_img.transpose((1, 2, 0))) ================================================ FILE: requirements.txt ================================================ av2 nuscenes-devkit einops==0.6.1 numpy==1.23.5 numba==0.53.0 Shapely==1.8.5 yapf==0.40.1 setuptools==59.5.0 imageio-ffmpeg==0.4.9 ================================================ FILE: tools/benchmark.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import time import torch from mmcv import Config from mmcv.parallel import MMDataParallel from mmcv.runner import load_checkpoint, wrap_fp16_model import sys from mmdet3d.datasets import build_dataset from mmdet3d.models import build_detector from tools.misc.fuse_conv_bn import fuse_module def parse_args(): parser = argparse.ArgumentParser(description='MMDet benchmark a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', default=None, help='checkpoint file') parser.add_argument('--samples', default=2000, help='samples to benchmark') parser.add_argument( '--log-interval', default=50, help='interval of logging') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) import sys, os sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib if hasattr(cfg, 'plugin_dir'): def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs,list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) else: # import dir is the dirpath for the config file _module_dir = os.path.dirname(args.config) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' 
+ m print(_module_path) plg_lib = importlib.import_module(_module_path) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) from plugin.datasets.builder import build_dataloader data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False, shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), ) # build the model and load checkpoint cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) if args.checkpoint is not None: load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_module(model) model = MMDataParallel(model, device_ids=[0]) model.eval() # the first several iterations may be very slow so skip them num_warmup = 5 pure_inf_time = 0 # benchmark with several samples and take the average for i, data in enumerate(data_loader): torch.cuda.synchronize() start_time = time.perf_counter() with torch.no_grad(): model(return_loss=False, rescale=True, **data) torch.cuda.synchronize() elapsed = time.perf_counter() - start_time if i >= num_warmup: pure_inf_time += elapsed if (i + 1) % args.log_interval == 0: fps = (i + 1 - num_warmup) / pure_inf_time print(f'Done image [{i + 1:<3}/ {args.samples}], ' f'fps: {fps:.1f} img / s') if (i + 1) == args.samples: pure_inf_time += elapsed fps = (i + 1 - num_warmup) / pure_inf_time print(f'Overall fps: {fps:.1f} img / s') break if __name__ == '__main__': main() ================================================ FILE: tools/data_converter/__init__.py ================================================ ================================================ FILE: tools/data_converter/argoverse_converter.py ================================================ from functools import partial from multiprocessing import Pool import multiprocessing from random import sample import time import mmcv import logging from pathlib import Path from os import path as osp import os from av2.datasets.sensor.av2_sensor_dataloader import AV2SensorDataLoader import argparse CAM_NAMES = ['ring_front_center', 'ring_front_right', 'ring_front_left', 'ring_rear_right','ring_rear_left', 'ring_side_right', 'ring_side_left', # 'stereo_front_left', 'stereo_front_right', ] FAIL_LOGS = [ '01bb304d-7bd8-35f8-bbef-7086b688e35e', '453e5558-6363-38e3-bf9b-42b5ba0a6f1d', '75e8adad-50a6-3245-8726-5e612db3d165', '54bc6dbc-ebfb-3fba-b5b3-57f88b4b79ca', 'af170aac-8465-3d7b-82c5-64147e94af7d', '6e106cf8-f6dd-38f6-89c8-9be7a71e7275', ] def parse_args(): parser = argparse.ArgumentParser(description='Data converter arg parser') parser.add_argument( '--data-root', type=str, help='specify the root path of dataset') parser.add_argument( '--newsplit', action='store_true') parser.add_argument( '--nproc', type=int, default=64, required=False, help='workers to process data') args = parser.parse_args() return args def create_av2_infos_mp(root_path, info_prefix, log_ids, split, dest_path=None, num_multithread=64, newsplit=False): """Create info file of av2 dataset. Given the raw data, generate its related info file in pkl format. 
Args: root_path (str): Path of the data root. info_prefix (str): Prefix of the info file to be generated. dest_path (str): Path to store generated file, default to root_path split (str): Split of the data. Default: 'train' """ if dest_path is None: dest_path = root_path for i in FAIL_LOGS: if i in log_ids: log_ids.remove(i) # dataloader by original split train_loader = AV2SensorDataLoader(Path(osp.join(root_path, 'train')), Path(osp.join(root_path, 'train'))) val_loader = AV2SensorDataLoader(Path(osp.join(root_path, 'val')), Path(osp.join(root_path, 'val'))) test_loader = AV2SensorDataLoader(Path(osp.join(root_path, 'test')), Path(osp.join(root_path, 'test'))) loaders = [train_loader, val_loader, test_loader] print('collecting samples...') start_time = time.time() print('num cpu:', multiprocessing.cpu_count()) print(f'using {num_multithread} threads') # ignore warning from av2.utils.synchronization_database sdb_logger = logging.getLogger('av2.utils.synchronization_database') prev_level = sdb_logger.level sdb_logger.setLevel(logging.CRITICAL) pool = Pool(num_multithread) fn = partial(get_data_from_logid, loaders=loaders, data_root=root_path) rt = pool.map_async(fn, log_ids) pool.close() pool.join() results = rt.get() samples = [] discarded = 0 sample_idx = 0 for _samples, _discarded in results: for i in range(len(_samples)): _samples[i]['sample_idx'] = sample_idx sample_idx += 1 samples.extend(_samples) discarded += _discarded sdb_logger.setLevel(prev_level) print(f'{len(samples)} available samples, {discarded} samples discarded') id2map = {} for log_id in log_ids: for i in range(3): if log_id in loaders[i]._sdb.get_valid_logs(): loader = loaders[i] map_path_dir = osp.join(loader._data_dir, log_id, 'map') map_fname = os.path.basename(str(list(Path(map_path_dir).glob("log_map_archive_*.json"))[0])) map_fname = osp.join(map_path_dir, map_fname) id2map[log_id] = map_fname print('collected in {:.1f}s'.format(time.time() - start_time)) infos = dict(samples=samples, id2map=id2map) if newsplit: info_path = osp.join(dest_path, '{}_map_infos_{}_newsplit.pkl'.format(info_prefix, split)) else: info_path = osp.join(dest_path, '{}_map_infos_{}.pkl'.format(info_prefix, split)) print(f'saving results to {info_path}') mmcv.dump(infos, info_path) def get_data_from_logid(log_id, loaders, data_root): samples = [] discarded = 0 # find corresponding loader for i in range(3): if log_id in loaders[i]._sdb.get_valid_logs(): loader = loaders[i] # use lidar timestamps to query all sensors. 
# the frequency is 10Hz cam_timestamps = loader._sdb.per_log_lidar_timestamps_index[log_id] prev = -1 for ts in cam_timestamps: cam_ring_fpath = [loader.get_closest_img_fpath( log_id, cam_name, ts ) for cam_name in CAM_NAMES] lidar_fpath = loader.get_closest_lidar_fpath(log_id, ts) # if bad sensor synchronization, discard the sample if None in cam_ring_fpath or lidar_fpath is None: discarded += 1 continue cams = {} for i, cam_name in enumerate(CAM_NAMES): pinhole_cam = loader.get_log_pinhole_camera(log_id, cam_name) cams[cam_name] = dict( img_fpath=str(cam_ring_fpath[i]), intrinsics=pinhole_cam.intrinsics.K, extrinsics=pinhole_cam.extrinsics, ) city_SE3_ego = loader.get_city_SE3_ego(log_id, int(ts)) e2g_translation = city_SE3_ego.translation e2g_rotation = city_SE3_ego.rotation samples.append(dict( e2g_translation=e2g_translation, e2g_rotation=e2g_rotation, cams=cams, lidar_fpath=str(lidar_fpath), prev=prev, # map_fpath=map_fname, token=str(ts), log_id=log_id, scene_name=log_id)) prev = str(ts) return samples, discarded if __name__ == '__main__': args = parse_args() with open('tools/data_converter/av2_train_split.txt') as f: train_split = [s.strip() for s in f.readlines()] with open('tools/data_converter/av2_val_split.txt') as f: val_split = [s.strip() for s in f.readlines()] test_split = None if not args.newsplit: train_split = os.listdir(osp.join(args.data_root, 'train')) val_split = os.listdir(osp.join(args.data_root, 'val')) test_split = os.listdir(osp.join(args.data_root, 'test')) create_av2_infos_mp( root_path=args.data_root, split='train', log_ids=train_split, info_prefix='av2', dest_path=args.data_root, newsplit=args.newsplit) create_av2_infos_mp( root_path=args.data_root, split='val', log_ids=val_split, info_prefix='av2', dest_path=args.data_root, newsplit=args.newsplit) if test_split: create_av2_infos_mp( root_path=args.data_root, split='test', log_ids=test_split, info_prefix='av2', dest_path=args.data_root,) ================================================ FILE: tools/data_converter/av2_train_split.txt ================================================ bb110668-5037-3c04-bd34-34cf1ace8d0f 8beeb8db-28f9-396c-b752-17f906505948 247f91e7-3177-33ad-b99e-0e0a4dc76751 40bfcbec-ec59-3731-8e75-67f0bddc3b01 ef4a46c4-138e-3478-b94e-3e60a567ec7d cf6a99cb-b8bc-34d7-bdca-30e50e66cd74 575d8d24-ba88-3b18-84c0-df5b29dccfde e66d1403-755b-3f63-938b-a2a69446a48a ed93e1bb-7bbc-3444-8fc8-08a271438fc8 0d37aee4-6508-33a2-998d-724834e80030 7ce2a2ce-eed6-36d7-ba62-dda9acddb070 06852209-b868-306b-b492-ee6dbc914cf8 e424d4f7-4b28-322f-b630-31d42ae528eb 768cf7e2-eb6c-3468-969e-e3b0fd87b34e b2a8a9aa-19cd-3ffd-b02c-0f2a47d1d0eb 928e282f-d1a0-3e85-9582-0b33664c49e8 5d8f4b0a-27f8-3889-925f-e9a146a395eb 58a6bfcf-071e-3a6d-90f4-0e4cbdc298eb 90f360d8-35f7-3c54-b2da-e99b354bc4cb e5178032-d260-3bc8-968e-a5cb98b6ae5a af170aac-8465-3d7b-82c5-64147e94af7d a0cb0614-ee71-3cf3-b891-a4274883362f de56b100-508b-3479-81fe-735349f8e8de b87683ae-14c5-321f-8af3-623e7bafc3a7 92b900b1-ac4a-3d41-b118-e42c66382c91 2e95b33b-8ea1-3b48-875b-2f35f3092059 75e8adad-50a6-3245-8726-5e612db3d165 ccb4e29d-e88f-3fbe-8958-67cfd62350a3 a3f59292-ad1d-370a-afde-64a9e16b341c f648b945-6c70-3105-bd23-9502894e37d4 df5d0b0e-5bcb-304a-a167-18b92d0f1d45 6aae7f38-21de-31bf-8761-29d458338958 80f31501-6533-3257-9870-b0c4dbf61967 57356998-297c-330a-af4e-c6a1ad64f923 0b9321c1-9bc2-4727-beb8-8046aa4bb6c4 f7cf93d8-f7bd-3799-8500-fbe842a96f63 108d2060-8bef-3d1c-88c5-c8295f596595 4667e48c-4d16-38be-b277-6b0013d6588c 3576c0f5-c1b5-35bb-a0c4-ee95cba5c754 
a059b6b9-ca26-4881-bcf7-d202433de0c2 53a8391f-b2d7-341f-95ce-b9174d48e040 fd4e2c4c-f7e9-3110-8e32-28d3add3937d 1ab241cf-f9c5-3f8a-88bd-4e14baad8ede 2aea7bd1-432a-43c5-9445-651102487f65 d9530d0a-b83e-44a3-910a-2b5bb8f1fb80 072c8e90-a51c-3429-9cdf-4dababb4e9d8 97738d01-b24f-365e-8818-2463149154d2 b8a5a7a9-1c4f-4f2e-96a6-565e727b24d0 c71cd96c-8e3f-3861-9ece-fcbabebc63a8 c08279c0-10b4-3d21-b13f-a1c1a0b87f8b f3f8f680-e471-3662-a06a-0c00e6d88f43 d78b78a0-2322-32c2-833a-e42ddc132d30 271f4204-dd77-350b-b9db-5dabc4191985 4766da89-ca13-3e92-b53e-00bc710e9bba b8d83d8e-1574-3efd-b067-a3ed422a259e 2a9b2658-097e-3f8b-a817-22f2553c5de8 c6c55112-0078-3867-a63f-1861a0125b8d 286790ed-9dec-305b-bcad-4f8153301e7c c2c0e6bc-05e5-30dd-8e5e-0e7b6106ad30 4f363f6a-e51c-4d22-b232-cf78f1520966 8e5022ed-87a9-4480-b2a7-a7c0494f5c7b 95acebfe-c694-3dab-9e6d-01cb501ff426 14c8d182-9586-3f21-ad20-c4e19ec03e2c 5ccb359a-2986-466c-88b2-a16f51774a8f 4eb237d9-9f8c-3426-9da6-4aad349ff8aa a783b484-437b-3569-bd44-4f83ad9e05cf d3efe9ba-f10a-35e7-b17e-6850c66693fe 8a11791c-1d8f-3b12-bacc-38aa982b0003 8c019de9-7043-37bc-9498-b5858e7240af 63f32613-2856-4ab0-898d-f881d74eb8bf 56c24ed8-68c5-3a08-8e3c-19646ac670e5 8bc34c99-1b8f-3463-b0e7-12bf1eb222b3 71d1938d-536e-39eb-beb4-bb4f5b607427 7b0bf9d6-084a-31d4-9e52-d9b582a0ec84 b09ba294-96b3-3c45-aeed-c40a309e5f4e 6dc6e668-549e-33b8-b952-ed8e807b1d48 595acd37-183c-489f-bb8a-c299a86b74c0 a9a24c80-600f-3f85-b4d9-a70ceccad385 718a2f8d-954a-3cd8-89e6-43898cf21fee c8ec7be0-92aa-3222-946e-fbcf398c841e 7039e410-b5ab-35aa-96bc-2c4b89d3c5e3 72cf3ca1-1a9e-3254-bca0-29c62521e454 65387aee-4490-38b9-8f4f-1fc43bd4ac06 27c03d98-6ac3-38a3-ba5e-102b184d01ef 58fed0d4-97d5-469b-89a4-4394838e10c7 d1395998-7e8a-417d-91e9-5ca6ec045ee1 e72ef05c-8b94-3885-a34f-fff3b2b954b4 36aec72e-5086-376c-b109-295b128e77e1 ecbe6def-7560-352c-8822-b2b92613e1e4 544a8102-0ef5-3044-921e-dc0544370376 cf5aaa11-4f92-3377-a7a2-861f305023eb 20dd185d-b4eb-3024-a17a-b4e5d8b15b65 ded5ef6e-46ea-3a66-9180-18a6fa0a2db4 42f92807-0c5e-3397-bd45-9d5303b4db2a f668074d-d6c6-3ea7-a7b5-aad0a1203b03 d3ca0450-2167-38fb-b34b-449741cb38f3 1ad57a00-cc61-3f5f-9e2a-9981a57e9856 418da7f6-88e4-388b-a69f-44cabd24ed55 613558a1-6a8e-3fda-8fa6-1045a064a0f9 6626b7b2-bcc8-4497-ae92-307ceacd5010 9a82e3c8-1738-3f85-9245-1d3717171d2f a69fa035-5121-3a39-a3ce-e33e9f54b506 14896a70-a440-34d0-b68e-fd9882557da6 c42d34f3-78d5-35be-9c47-77d297caebfe c9fc62c5-a289-36e3-a900-7e7807eb2716 e368d49a-e02a-3374-876e-8325f66c3574 6b012ed3-c322-3522-b52c-b4f24f894d4c 30e94a6b-ca9e-3d2c-9099-86700ce845f9 5f016e44-0f38-3837-9111-58ec18d1a5e6 4bf8e9ff-e1a1-3a22-a9d1-80f3846c0263 841fe537-5e76-3b3a-8298-75fa1a41a14b 64037371-4aa4-3fed-97f4-bafc1674caee b5ea60b0-2540-4efe-b60e-f421ade3c128 34fe8fbd-2b1a-3552-94c4-e80d1e85e5c2 67d5fb0a-baf8-32f4-9316-18ce755f3e8b 4619e709-c9c0-3b26-923f-23a78e231136 285dcef7-9f00-3c9c-baca-6c8269210ac4 67d00dd9-fd33-3518-85f5-e26353373c33 156a412d-3699-3c1c-9ada-6ab587347996 9d65b03d-b59d-3a45-ba66-e313d3cdab40 022af476-9937-3e70-be52-f65420d52703 539b7a79-54c6-30ef-8e06-210d42c79125 b4fc7eef-819c-35a6-b937-358ffb5c2aa0 ec02cf7e-36d3-3e9d-8835-3b6c27975bea 6ee14358-31cb-3c6d-82f5-54d6a20444de a7bcdabb-f9b7-3c16-806d-3ddf1c2d49a2 54bc6dbc-ebfb-3fba-b5b3-57f88b4b79ca 45433055-2b69-3cff-8135-67b3bfa04034 f61bcee1-2964-3c4b-95a5-697df5f42f47 902d5e72-b665-3615-af2d-a2b6164864b2 812a45e8-b5d9-389b-9151-09c57ce969df 5d391e54-adec-3584-adf0-5025d7564e1b 9f6d282e-f573-31d5-80e6-9a193e80cd7d f6350a4f-eee8-31bd-8520-28f9c81c98a8 ac3e33eb-5a53-369d-9e5a-7950392bfe99 
3844f8a9-b7d4-4919-8e9b-a0370ad29ec0 57636c80-9335-3aae-af70-11755db93854 38f30522-2d43-3ff3-a94b-84887ab1671d 968e77fb-9ab1-3427-8984-9e3028b186e2 7186d7d2-453e-4193-b327-72b66bbe3fd0 3a1b3424-700b-3b65-9e89-90772a8e24ea 7e3d8631-3b7d-38c1-b833-ee7cfa7235ca 93b755f1-f865-44dc-a98d-cae5eb1a25d0 3ca11a5e-50b2-3cc3-af7a-ce7ab02b9954 c6940de1-dccf-3b42-9c39-bbb9feb2d638 caabc342-aed4-3104-8195-7461a4add481 afbdd2e0-696a-3222-a20a-2023baf8e5af 70e92418-e4e6-32a2-98c8-9844b1c24f92 4bab74cd-aba9-4752-9e1f-006cc639d63e 982bcae9-1840-37f4-9278-3dbb63031aac 23808d42-e4df-3a0d-b713-fe20e09a4f39 e7e7ebad-79d2-3ae1-95ce-f3035bc8f719 eec8ae97-8de4-354a-b11a-d3a14b276479 d9c267be-f19a-3183-afe0-f0625a375743 e1e9d341-716f-3613-9ec2-2201c72361af 4aae26d1-aa71-30ae-b838-2a25d1f317f5 1c8f1189-c4fe-3303-bf2a-f88f5751b81e 4c18ef76-84ba-4a78-8275-7663101fffcf 26daba71-ca0a-37e9-9dc6-0f81f02c0afd 737314f0-997a-3cd1-a652-78453bfe2b57 fdb0578d-4fa7-37a7-b60d-5472b4d39136 f4c6ade0-7b9e-4ad7-8d86-13d2f4c91499 8aaa2fac-59f9-3a3d-98ee-f3dde8b4d781 6937b4e5-b5b4-3970-b5dd-9ad194e6c338 8911207d-fc3d-3009-bb35-18138197724f 7ebbdbeb-a8de-3612-8f22-6ce91980841d b4d5e738-b937-33fd-8131-bf1df36f598a 1c7d3b85-6cec-373c-a4eb-5137d7cc6a7c e38c1e1d-f0e9-3d73-8193-29cbea481b4c 98fd128c-4f32-40fc-a23c-7feb50c4478a eed8593d-60e3-3e41-9fea-55f544b01749 52c9e613-61b3-3d17-9f6d-b28de8a14829 50d508e2-6753-4519-a8c3-ad94a76ee948 d2901fe5-4b64-3144-98e1-67ef5ef83fa7 db447b86-8103-3ee4-93de-2c838ba061dc aed1b616-9d9a-36d3-a047-07ad3955fbb7 cd822baf-4aa1-33fa-bfe5-d91386598edb 0c143226-9c39-387c-a935-1391bed6dc75 d72c43a5-95bf-3a3e-9019-cf25cc0a61c0 f46707f9-435f-3a06-9017-deae11feab53 956dd277-e000-4c6c-af4a-aee4d86971c9 7f40c022-9f9d-3805-abf8-7533175b3f25 08734a1b-0289-3aa3-a6ba-8c7121521e26 41b6f7d7-e431-3992-b783-74b9edf42215 59a668bc-5caf-3ccc-8335-e9cff4c61d0e 9cdbe6f4-938f-4ac5-88f7-94a82bea715b 382dfbe0-836e-385c-86f2-f1afcf57a402 e4f6dbab-f2eb-3bd2-9dbc-88640e3b4a5f a3e09a66-a921-3c4a-89e6-7fecf6854a3a bba89165-0e5d-3052-abb6-6a61e37861a3 c0d36fde-5672-430a-9dd1-3e2a5d4f6cab 1e6f21fd-5c8a-3526-ac50-72adae89d6a8 aa630060-2eda-37bd-ae88-a513fd9fc8e3 a8a3297d-62f3-31ac-8db7-95ef53ce0d31 8346e544-4a73-3c88-9339-c7a21fbd3a2f 93c97162-a834-3331-b15c-e4ab278b1c6d 5037a27d-95f5-352f-9c64-5b8e75f574b2 74dd5c1d-7a9a-32d1-903a-fc57e07109b9 ebece6dc-ec92-326f-97ed-f66b2970e358 bec0f69b-832c-3898-b589-0127ddc282f3 27fba275-0b37-3033-b20d-8f9848f78b1c 7c539ecc-658a-3956-a9a4-6e7f5bd67373 6fdd8d39-7b04-365a-9941-e9e805b05ecf 43efcbe2-fe24-35b0-9e69-b07c1b0725d3 5f2b8881-3447-3905-99f8-def9d72aae42 b5f3900c-b421-3032-aef2-2e91a69d1163 4977e8a8-4e1f-3ca2-a44e-454cd3756a5f fb2cce69-655d-3203-990a-74301895408d 74f15437-b85e-314a-9d86-7294b98bf07f 75a9cbdf-0bec-39f9-b536-5b37aacadf96 b6c04ab6-1c07-3e17-97d5-e870db090e52 4e302e79-1cb9-358f-a3fb-e133a655af4f e7e178aa-931a-4674-9bff-9278a54e6aae 121007f3-a0cc-3795-9606-85108b800772 7ef4a6cc-7266-3a31-8dd4-01c3d3c58dcb 4d9e3bdf-7216-3161-8281-72863f3c2bf6 b7843066-abbd-3275-ac52-90a8363f65f7 633addf9-441c-35e7-868a-738aa612d51c 105f47eb-491c-3cab-91af-83c5bc1f6c48 66a40dcc-4de0-3f72-bff0-ca543ac5019d 098fe60e-bab0-32e2-89bc-bedced881911 d6ba4898-1369-3521-981c-b9ac57420418 ce5033ee-e74e-354a-9299-8aaefbd03f59 e65e405c-8aea-30f5-a926-1e0fbbeefb9f db17141d-4d35-381d-9949-36ce767d6641 b66a9b8e-8fa8-3409-907f-a70ebd7051e1 e88132d0-4512-3d6f-a1c8-f60972332af5 df321672-461c-361b-aac9-e81cc9a88b9f bb25d7d0-9146-46a0-8ff1-ebfc25d63417 9bdb4139-173f-33d3-8730-e29752d737d3 f4cb6ba4-cd0f-30cc-9cc9-52bd14bfb3cc 
e28c16d0-084a-3dc4-aad4-9d157ca528de ede387f4-f390-3f0e-a071-eb543b73ed73 74ec2f62-9d57-39a9-bf88-97006f64ee5c a160c635-aa67-352c-a5e6-03b113493090 6180bbb1-95ce-381b-ba17-5411c5712824 d58d55ea-f30c-3622-8303-1574616b9865 6dadba1d-0f67-345b-bc5e-407ab8f7654c 3b2994cb-5f82-4835-9212-0cac8fb3d164 a88da814-ecc6-39c0-93ba-8a81f403a7e4 094c4119-eb33-3dfb-a18d-492cbdc8413a bbdb1e21-62eb-3230-8cef-a3b091c5edad aa82b61f-7156-3c68-95a4-b79cebd120eb b0663029-8f8d-398a-8a28-81ba29224696 8aeeeeca-6a79-34ef-b667-835d53536a8f f8412dbd-48b9-39f3-b534-08950f6e633b c062ba0f-7591-3225-a57d-8181622dc2da 75449af9-61a5-3a4f-95ec-3a3dc35b4cbb b7fbc13b-47ff-3e3f-a363-86d60ba664b8 79cb0109-4c92-3ede-8849-76cc6824b95a c556f8e0-a001-3586-b2cf-d3256685c39f ee27a871-85cf-494c-8519-f54815040af5 555a7659-ffce-39df-ba06-d9fcb2f812f0 1b8fc962-7036-4d7f-885e-40b631cbdeaf 7f7e4709-7596-35f9-89ac-d808178b1533 c2ec8955-1797-338f-9486-d7c41926f791 72ad5f22-3a9c-3758-81af-abda8181a622 2501c6d0-071c-3a7a-b51f-c8cbd37abe25 823371b1-3197-35d6-a6b7-bfd432e10440 63a006b5-07c8-375d-98e3-21466f5b9c6a 1a4e2d86-23d4-3a0d-a9ac-8b0936ae94ce 828ddef2-7609-3683-8e32-c21e7c07d6a6 df738339-958b-31fb-8e48-a4380f4c538a 3fbdfb6c-927f-4aaa-81b1-21b02efd4c01 c67a748c-1e93-3a6d-be38-daedf175f911 f9f6a7e9-4f79-3fdf-b1a7-ba300622f116 e1450d07-faed-3d97-b674-c6f8d2498d80 a7a2236e-8f8e-34aa-9343-722f9b3bb829 64b24fd1-f639-4f7e-a535-dbfe9fd737a1 44200521-4cad-3a5d-8568-e0f3f1ca24d4 444cce44-cc82-4620-b630-1b5849284ac7 e033cc8e-b23d-3fc6-8954-d90c5e98550e fa9ec72a-cbcf-35dc-be20-4d0d9e7215ef 3c51357e-f6e9-3cda-9036-fe6e6cd442fe 1e51a567-b416-3c46-9424-05688ff851f7 2b443c95-d55f-3cc4-a2a1-ae4af293d8d9 3d7743c1-c0a5-3ab2-976e-84af93270f30 441871a2-a9c5-3048-b7e9-d88af5acb8f1 f2b0585b-ada3-3123-963e-14df7d96ca9e c7f5e5c1-dc52-3619-8998-420b2e280d8a 855908a6-a848-3b7b-a4a3-bbab78a423cd 07e4fccb-eb2d-31e5-bbcb-6550d0860f64 f7cdc2d1-f59a-30a2-aae8-8bb81c769e6e 8223c3d0-3b08-3889-9cdc-a88592c4bd4a de586ff4-3413-367d-befc-ad022b73592b ac1b1697-42b9-4225-a666-d17f72204fa8 73539e96-eef2-3302-bdf4-a39e9d95b6e7 e0ba7664-d287-39df-8193-00d60cae1417 0a132537-3aec-35bb-af13-7faa0811000d b29b43d7-3af9-363e-aaeb-8805d958f982 8ca98d88-67b5-385e-80f7-b32758668fab a4f240a0-12d4-3542-a11f-0c592e90e4da 1844c439-b94c-332a-bb94-600818350eb4 ce0e814a-d9df-3975-a521-d8ae9a091e96 95a47a36-1041-3924-bbd0-4dcad52c323a f54c1d50-48a3-4651-bfb0-50b87f13dc9e 890cf3b7-3385-390c-8b2e-132c744b5d2d 189c8512-b034-3d58-a372-cf48eacf02dd 4e391f98-31a6-330d-9252-d02aab82f5db 1d950a38-5c2f-39ce-9cd3-61249bc85194 4e1ac476-80a2-3612-bfd7-1abd24d2b644 abd4fe8d-7520-3b35-b8ac-4de367141b6f 0b97f5dd-c396-3c02-b07d-b7fdbcb6c3d0 6aa2ac89-6b25-3af6-ad59-221351189f4b 298715e3-b204-3bf5-b8c2-fe3be9e310e8 e1f37027-6a39-3eb1-b38a-3f2836b84735 9e684390-4af3-3ec5-b163-855bbd026ff1 1842383a-1577-3b7a-90db-41a9a6668ee2 7a1412d3-5a53-378f-85df-ba58b2408f46 91cded81-9f72-3930-bab7-5d3e3fa0a220 a7c9bb12-322e-3f8e-8798-cf57a4a72f99 648e8393-f46f-384b-9bd1-c25a2285077d c69e348a-8e10-31dc-b71b-dd8e5cfd7211 87ca3d9f-f317-3efb-b1cb-aaaf525227e5 182ba3f7-b89a-36cc-ae40-32a341b0d3e9 f6cc0ebf-fc6a-3bf2-8bcb-76d8c43f194e f2576c8a-da9b-450e-88cf-a70af1b0eadf 78683234-e6f1-3e4e-af52-6f839254e4c0 7dbc2eac-5871-3480-b322-246e03d954d2 20bcd747-ef60-391a-9f4a-ae99f049c260 11ba4e81-c26f-3cd1-827d-b6913bcef64e eec284b2-840a-3c75-aa42-04d2e309bbe1 b50c4763-5d1e-37f4-a009-2244aeebabcd 15ec0778-826e-3ed7-9775-54fbf66997f4 e0ea281b-6956-3605-b720-71b54ec87d25 e8c9fd64-fdd2-422d-a2a2-6f47500d1d12 b8489c02-60d0-3f44-a3b4-9de62830d666 
0b86f508-5df9-4a46-bc59-5b9536dbde9f 201fe83b-7dd7-38f4-9d26-7b4a668638a9 335aabef-269e-3211-a99d-2c3a3a8f8475 76916359-96f4-3274-81fe-bb145d497c11 22052525-4f85-3fe8-9d7d-000a9fffce36 4e3fedbb-847c-3d5b-8a62-c9ff84550985 77574006-881f-3bc8-bbb6-81d79cf02d83 dafe14f5-825c-4e7a-9009-6dfdfdd5b030 2f2321d2-7912-3567-a789-25e46a145bda bbd19ca1-805a-3c22-8df3-cd7501aa06f3 58e82365-03bc-3b2f-b55a-a4ad0e3e792d d770f926-bca8-31de-9790-73fbb7b6a890 b6500255-eba3-3f77-acfd-626c07aa8621 8749f79f-a30b-3c3f-8a44-dbfa682bbef1 47286726-5dd4-4e26-bd2d-5324f429e445 185d3943-dd15-397a-8b2e-69cd86628fb7 2ff4f798-78d9-3384-87e9-61928aa4cb6d 6803104a-bb06-402e-8471-e5af492db0a8 dc9077b9-2fe0-3d18-9b97-8067ff090874 7a2c222d-addc-30b2-aac6-596cb65a22e3 0fb7276f-ecb5-3e5b-87a8-cc74c709c715 3b3570b4-7b0b-3268-a571-b0889dbf40b6 e42aa296-0e5d-4733-87ec-131a82f917bc 19350c96-623d-4d77-af96-f8c23f00c358 02a00399-3857-444e-8db3-a8f58489c394 7e4d67b3-c3cc-3288-afe5-043602ea3c70 5c0584a3-52a6-3029-b6ff-ca45a19d8aa6 a1589ae2-2678-310e-91cc-c4b512cd7fa5 3de5b5d6-68c4-3c95-84ed-be7c83d829f8 9d16e76e-46ae-38c6-8399-99218514afde 2d403b7b-06e8-320c-b013-4f684ad53be2 f77889f6-ef5a-4eed-a4cd-5d67d4a6e9c5 1eb3360f-4c34-3310-9ce6-845ea9272c56 5546df9c-9310-3ed5-929a-d7da19e18bf8 a1358c59-b28d-3ddb-af1c-3a5d1c394ef5 1bd7db3a-0b42-31cf-ac1a-de88fd9fa721 a4400a38-bc38-391c-b102-ba385d7e475e 4fcdebe7-b52f-39e7-a5bc-c664eeba5e7b f7d568d4-0836-3f47-b330-f8d204c4b96e 412ccada-28df-3de2-b394-9cba3fca5bdf 6f3dbf4b-9559-340c-a3e4-cbe655bf2059 84c98474-28d8-309e-91c7-9cf9539825ab de23dfe1-c0b1-441b-810b-324090dc171b deec57d0-d31b-31ec-aa75-88db5d9dadf5 e95c8cc2-ddb3-3e7b-b8c3-e7584a778464 c3388791-4fef-3278-a085-26121cf5f513 45488531-3648-3e2d-8f9c-3c287032112d 21c0472c-5ba2-3276-aad4-b9aa66cb5fa3 98e7f0eb-4676-3120-94f1-8a790581e6a4 28bd43de-e2b7-3c60-a626-0e525f639357 5c0afbc0-a6ee-37c2-aebd-c1927caf7340 5677a441-abd2-3b29-9f0b-333e181cc907 d8192bbb-3b00-3c68-a79a-65872ea4276f 0fdbd56a-1ff7-3624-81f9-03cd68fd5616 6ef553eb-6dbb-3a2a-ae3e-ed7090b8826a bc20a6d3-2db2-3849-8843-1e1b8c93e5db 11a84740-18a3-3798-91c5-21dc9c765350 6a6e93f0-a130-3340-975b-b2c88b16d343 f6107596-76e0-3064-a4a6-86332a90e539 0f0cdd79-bc6c-35cd-9d99-7ae2fc7e165c d67d020a-4d28-3bfd-891d-d6aa7dcf0a69 a674e2e5-3dfd-3dd5-8503-192357b0e96c a89557fc-1268-36e5-9cce-335f2da27bc8 51428934-b0a7-3507-94e3-31d37bba38a3 f849731b-d288-3bec-8f35-6bea979f7dd8 91ac892f-d2c1-3143-b5c5-f0d4640cfc0d b48a15fb-2e84-34df-946f-ad72b3d7296f e7547e4c-1ebc-3428-8964-a5b91e81098e 069cc46d-38bb-309d-88cf-296a3d0c0820 8c52d911-fe34-3424-9864-d3fdfac38064 d33f667d-7b6c-39aa-9ba9-eac2fa615ae1 aedbd525-e6df-4c0c-8be6-61c27fe58fd6 81d2b40a-c579-3e9c-b520-bee26cda947d 3153b5b3-d381-3664-8f82-1d3c5ca841d2 c780d53a-2d37-3cd8-9e89-530966aef53e 88f47a10-87b4-3ea8-a0c7-a07d825b647d a91d4c7b-bf55-3a0e-9eba-1a43577bcca8 25e5c600-36fe-3245-9cc0-40ef91620c22 d5d6f11c-3026-3e0e-9d67-c111233e22de 91aab547-1912-3b8e-8e7f-df3b202147bf e1d68dde-22a9-3918-a526-0850b21ff2eb 9bb1f857-8b61-369f-a537-484c1323ae32 b6c4361a-7dd8-32a0-83d4-7f9d2beaed08 798354fc-30ee-36f4-83b4-f49c3b307db5 62a1e53b-b55c-36c2-bc5b-e216d494875a 47167c79-2ba4-369c-8db8-760a30b4c38a 2b044433-ddc1-3580-b560-d46474934089 380e5bf0-1c68-36a4-ac64-09a03b60bebf 8066e267-a653-3b43-8fce-a5a780912c82 3c56f1ef-d4df-30ae-80f3-0a5b22d4d3a6 3fa8c20e-a4b4-3af6-b9c4-6cb96f83916d 8e02e2db-2836-37ec-af33-a1cc2e6e49dc a36f80a5-5edc-3842-80af-292ae639ee74 bb9be2e6-8f0e-3bb3-8bb9-5d9aa9df384d dbe19bf6-93ad-372e-b96d-f7b652cdba93 1992ed13-948e-34e6-8d9b-a3416e545a95 
a47ba6a9-ffa1-3979-bb40-512339284b8b 0a524e66-ee33-3b6c-89ef-eac1985316db e743b441-ea8a-36d7-8124-f14dfa13a0e6 8d8b550e-d0be-3cbb-a371-49ec36fa619f c85ebc24-0934-3423-9c14-f0fdbee64b68 65d3f43d-1969-35d4-bf86-bd5e4b1ac803 b51561d9-08b0-3599-bc78-016f1441bb91 8f317f00-f8b4-325e-a5c7-e4045427a610 2772dd5f-bc0a-47ea-ae19-a5e0dbef8f41 b98a7838-ac1f-339f-93c5-fe7f98ea8657 a146ab19-f4f3-334f-b830-fc68de83e26c 5481321f-d317-3e80-8061-6e9c635c4ca9 4a789b07-7578-36ec-89cd-68b01e0737fb f8825b65-5631-3417-8309-bd5677d694aa 790d3c83-f6bf-348e-80e7-12f29240e598 d26b95e4-d200-34e2-92c9-c16fda4cd9dd 945f3b20-778a-3581-adef-544de4a089ef 65732efc-1564-3ff8-8c7c-4239a08c0d70 5c7ee953-d8b0-33ef-a491-0bb716763cfe c67f439a-f945-33cb-8517-40c9fdf60d59 6f2f7d1e-8ded-35c5-ba83-3ca906b05127 72c31859-3676-3cbb-a773-0591d8d5799e 74a3e9ae-6811-4d11-a112-4c4963773cfe f41d0e8f-856e-3f7d-a3f9-ff5ba7c8e06d bd4a7d9d-14e1-3c17-873d-a74d0cd6a5d7 490f13c4-4c1f-3e3b-8a9f-0f27c6906b4e ed6ad297-ee09-3532-bcfc-c16ad5a05c49 595ec33e-a1aa-3aaf-8821-8d1780db354c 3933d1a2-f121-3c8a-8b01-7738e58c045f e0cfd042-ae29-3d21-bb47-81eb8f933ec8 b1a98ad6-9b3e-35fb-afae-70b279fcbfc0 1bf2bf1c-64d1-308f-afd1-220de9d30290 49a9df80-ab0a-31fb-9341-a79f7b0258dd 118a1e87-aff4-35f5-aa38-01504a63ddce 41c3597a-aab1-3123-85a1-dd5d459af461 9a8aea4b-9b61-3884-9f3c-84c3c36e6373 cea5f5c2-e786-30f5-8305-baead8923063 f03bfd11-5ba2-3bc6-ad76-4166b06491f5 e0d2fe70-8f98-3ce2-8d8f-4268a81f7169 da30abcf-652b-38df-a128-10942b225ec5 5e9fc665-2353-34da-a2e7-2094ab17e790 9b1da4e7-03a9-3277-91f3-ef6e610a6320 067b1c50-6567-3840-ab56-1ca2a0ed9c30 134bb8e9-9080-3bc5-948d-88d8cc034550 b56e3f47-72a6-34e8-9ada-b4169e28e5b9 84bb4b17-e7f2-3a1b-8c2b-6d6ec9a23e31 7c696d35-e34f-38b0-b4b4-e88803ad1f6a 8858428d-8fd5-3c3f-8ca4-d01f6e25e63c 93582b51-5be1-30cd-abb0-3eac16dd6dbc 32edd7c7-8a8f-360d-bcda-83ecf431e3e6 bdd7e8ba-f7fa-38d1-b6bf-9dc77334fec5 eb777faa-5b76-387e-a408-90524c6f2848 7ad46cf0-aa12-4050-ac2d-cf34b5f64d41 c990cafc-f96c-3107-b213-01d217b11272 61e56102-4d85-3a40-bbba-1a007c816f68 38609ed6-2445-3df3-bd92-849d3963510e a359e053-a350-36cf-ab1d-a7980afaffa2 76038978-47aa-30ed-bfa1-2d63753a866c c654b457-11d4-393c-a638-188855c8f2e5 5d062611-5417-3405-997c-1d1aefe4d85f 4058d838-75cb-35e2-af7e-a51aaa833271 6b0cc3b0-2802-33a7-b885-f1f1409345ac bb533c69-1e0b-341c-bedd-ff25fe9b84bf debbba6b-8cb8-3ab6-adfe-54fcc6b02839 133e2e0b-b0fe-3bb0-b1f9-c846fcfd29e8 edf3a727-664e-38be-b990-65d34012d926 ce34ff64-0faa-3fae-a79e-985f7a5172c9 0f257dcc-8606-3ef9-b17e-b022a3fc72c7 614812d4-3344-3975-a1c8-4131910c4a10 d9fd666a-8f55-38bb-8387-80fa44c29348 03b2cf2d-fb61-36fe-936f-36bbf197a8ac adcf7d18-0510-35b0-a2fa-b4cea13a6d76 e574050e-f787-3186-9686-2e9aca8102a0 f3d1e3c3-2770-3504-a592-b62619598812 953087a4-f704-37fe-a60f-82877e84a413 d5d40b4c-48d9-3b68-903a-025eb0fa334d 0749e9e0-ca52-3546-b324-d704138b11b5 e757cddd-5ff5-305a-af11-d7c6747d3979 d97ae2c0-b8d1-341c-94b7-f19d5fd2982a 46d917cd-531c-330b-8d7b-979b51a8927f 03fba633-8085-30bc-b675-687a715536ac 6419dcfd-8777-35fa-924c-ebefccde0a9b 855ba280-cd69-348d-9107-69e28cb8ad99 ce0575bf-c2fc-38bd-9947-ea7494a799f9 e125bb91-dcaf-3013-9cc7-da653d7e11e1 49d76058-b4f0-3931-86fa-de160b4c1b88 b48d6d4b-f0dd-35da-850d-36a715691e2f 99a3270d-c5c5-3df7-9a2d-a612c8104d0e 7cb4b11f-3872-3825-83b5-622e1a2cdb28 dc4d148d-f84c-307c-b2b7-f0cd7c267f57 106d962b-911d-354d-961d-9abe93119b9c 14bf638b-8f0d-35b2-a369-6d846b5b3892 8aad8778-73ce-3fa0-93c7-804ac998667d 5cf52bbe-f7f4-30c9-a4c2-a1fbb93513e4 bee1146d-2e80-37e3-b08a-6ac8858e8973 cdd752d0-caee-3d95-b1db-7fc20cbbc783 
9caf211e-3e6e-3996-8518-f617b9454e67 0a8a4cfa-4902-3a76-8301-08698d6290a2 e4279e3e-b7e1-3f43-aeef-2bfa2836dab6 6ff3a51a-e0ab-32be-beb5-4079e56933c6 4d324eb4-39f1-3837-9b97-c10db5d2b61d a1537c1c-775b-3969-ae13-2e83e5a4728a 0d9e4cff-73ff-33eb-9981-795475e62faf 74648e09-358d-3183-9b40-278620befa40 ad319b98-6faa-3648-98bd-43afdbd20020 b9f73e2a-292a-3876-b363-3ebb94584c7a 7d1d720d-6708-3148-917a-b8dc78f1dcd9 f64ed43e-417a-31ad-a322-b6108bf99a71 4d7b84b9-0a03-3aa1-83f0-4766013c3fb1 c96a09c8-46ed-391f-8a66-c46fa8b76029 5c1db299-e2a2-35e5-84dd-acda8fb393bc 194b6c89-8060-3174-b402-308f72cb1c15 9ecbfef8-29c6-334a-b4ff-aa8201439826 c4ea1b05-c7d5-3b59-aed1-9f3d2621ac00 b403f8a3-4cad-333e-8557-d7da2e163f4b 4f1b4bb2-b30b-3537-8fed-dd8f843f5adb d201af7e-48c8-34ad-be1c-e649af2cb5c2 0d8aab9f-4edf-3fb3-895a-ba64e8f2cfb2 81700b3c-2db4-3f72-935c-274d3607d6d2 62879808-1586-4d49-80fe-2f547e355191 dc9c2d63-083f-32c3-90ff-943ca823a245 e331aa95-3660-3c71-be9e-030bab0b8ee2 7c5e3704-33c8-3a4e-b032-9187a6f90206 35a15c5c-fa4a-3838-a724-396e112ec95c 5d55a63a-3146-32d9-89ec-e207e95ecbde 0322b098-7e42-34db-bcec-9a4d072191e9 a2f568b5-060f-33f0-9175-7e2062d86b6c 332b278a-a6b9-3bc3-b88c-241e4b03b4ef 0c61aea3-3cba-35f3-8971-df42cd5b9b1a 53f5011b-2a8f-3a73-9d86-805462bb542d c1a6c20c-e336-3efa-81b6-7c1242d70bd2 7a17d467-9f29-3706-8e40-32bb7fb033de 0ab21841-0c08-3bae-8424-daa9b336683f eb69a196-fb43-3ddf-9bbe-9d55fa1e8200 a3876690-9d49-3c98-9421-02cfe0ccb551 7c30c3fc-ea17-38d8-9c52-c75ccb112253 4935629c-fd9e-3b2f-b68e-9489c89585df 49e970c4-7364-33cb-a298-ead218e9a705 511b93af-f16e-3195-8628-fbb972a17f74 91923e20-9a05-32e0-ac53-8c09b0b60341 00a6ffc1-6ce9-3bc3-a060-6006e9893a1a b5e6e498-54b3-37bb-b2a3-cdac33a18363 c730e199-fb8d-3abf-b7aa-bbc81bf8c08f d37be0e2-8223-3eeb-a0e2-c4b75d5ff87b 9afab336-dbae-3f70-a669-46813f4570d7 ab3d8387-8e07-37f6-a74c-cf100fb6a612 1a10b0e6-569f-32db-95e8-10c074e353e8 382cf8af-6c8d-3ed9-907b-12214d2c7cb0 d842ce41-8d9c-3c0f-9c04-595d97be5140 7cd08674-1787-37d9-9365-988df023724b f150d98f-0109-3380-8480-c6846fb8e9c8 bd90cd1a-38b6-33b7-adec-ba7d4207a8c0 52071780-5758-3ed4-8835-0d64ecdc5575 04994d08-156c-3018-9717-ba0e29be8153 bf360aeb-1bbd-3c1e-b143-09cf83e4f2e4 24642607-2a51-384a-90a7-228067956d05 78da7b7e-8ddf-3c7d-8716-eaa890106dd3 ff0dbfc5-8a7b-3a6e-8936-e5e812e45408 78f7cb5c-9d51-34f0-b356-9b3d83263c75 7606de8d-486c-4916-9cbb-002ee966f834 858d739b-a0ba-35aa-bafc-4f7988bcad17 b6e967f6-92bc-3bf5-99c9-1b0c4649fd67 de9cf513-a0cd-3389-bc79-3f9f6f261317 95bf6003-7068-3a78-a0c0-9e470a06e60f f1275002-842e-3571-8f7d-05816bc7cf56 f292cc5c-7a90-360d-b62a-074c643bdf59 3bffdcff-c3a7-38b6-a0f2-64196d130958 472a240a-10cd-39cd-8681-558f7c7cf868 adf9a841-e0db-30ab-b5b3-bf0b61658e1e a060c4c1-b9fc-39c1-9d30-d93a124c9066 6aaf5b08-9f84-3a2e-8a32-2e50e5e11a3c a33a44fb-6008-3dc2-b7c5-2d27b70741e8 9e9bcfb7-601d-3d80-bc12-ef7025174beb 2e3f2ae7-9ab9-3aef-a3ce-a0a97a0cb1ab f4c94798-4d77-36ab-bdc5-c1194e5e7aff 52971a8a-ed62-3bfd-bcd4-ca3308b594e0 0aa4e8f5-2f9a-39a1-8f80-c2fdde4405a2 fbee355f-8878-31fa-8ac8-b9a45a3f130a 214e388e-cbd7-3dde-a204-d2ec42298808 280269f9-6111-311d-b351-ce9f63f88c81 20d47f81-46e8-3adf-a0ca-564fbb5c599d b2053fdc-0b94-30bc-aee7-5bc6fb7e9f52 02678d04-cc9f-3148-9f95-1ba66347dff9 29a00842-ead2-3050-b587-c5ef507e4125 9a448a80-0e9a-3bf0-90f3-21750dfef55a e858fb96-6b1f-3025-b40a-f71fd8d28c32 d70dae33-b4b2-36da-a4eb-345ef1c484cc 386c34fc-ff56-371c-9288-6ba42620f23a aaed41a5-47f2-3e0a-9645-2dbd871f744f be0615bc-1d82-334b-9c98-6adf40406955 4abe4fc9-183a-3ec1-9434-bc74fb724c0f b40c0cbf-5d35-30df-9f63-de088ada278e 
67be173f-28a9-3bcc-b110-4b81dfe3bf5e f554d503-4901-3b97-9516-a16398c66631 399064b4-6df3-3de8-8793-2738f8723ee3 ae908cc4-7301-3390-8940-eb9b679a8a39 a86ee261-b86b-34f7-92ab-be8367d1fc4c 3503b283-fbcd-3835-8779-0cb2b7ef55b0 1ca5291b-3178-3a93-a117-001497899b79 fb207d3b-d2d5-3100-94c0-9145aebc770b f7c4cf87-6bab-3723-bd74-1c9ac5add9cb 65f1eefa-cbc3-3d53-9991-dc0500ae9183 9320afa3-ed05-3364-a017-ae7ddc5d26c7 b248d26b-9c48-3d5f-bda1-a05ec99c2d97 7c4e5ad1-d604-3e44-81ae-68f7bfe21d27 e4221cc6-a19d-31ca-bf94-031adb0ea390 6784f175-e69d-3802-99df-d21ec2081878 97ae6596-a903-3045-836b-34f8206c6cfe 48c9cd36-68bf-3bb9-ab95-5e0a6fee61ab b42dc943-8b33-3b79-a260-14eb9f58a991 cf79d751-5d2a-3d5c-96a2-bb8d603f21e0 c2bbb391-a453-36af-b987-9d15f46b8589 803c44cc-e1de-3797-9b5f-15324a1604f8 af8471e6-6780-3df2-bc6a-1982a4b1b437 4e6d6bcd-8718-3e71-b9c1-7c352c991a56 6b6b2e8b-3f4d-3b7d-acaa-8f970cb12adb a7f532a3-87de-3129-8864-258396fd0b50 b7cbdba9-18ac-393a-8352-4841ffee722e 557dd6a4-2b80-3264-9c13-f70094526174 d029a394-7118-33c9-896d-eabb894f58c4 8ee606e6-4cbd-3c07-8419-fbda836ccaac ab8c747b-b9cb-3835-a275-54c56cb9a469 3e707e96-ad84-3e68-bea5-2f9ac502a2d9 1a7e18b5-d8dc-371d-be5f-03a37b113e81 7df1f32e-f059-3ac4-9d57-213f2f69b8b4 8e5442cf-8882-3b94-bc47-18fcad84bb20 3c27dfaf-1624-39d2-9075-158824ed8e8c ff8e7fdb-1073-3592-ba5e-8111bc3ce48b 770a58e6-eff6-39b7-a265-fe7f202fe8b2 ff52c01e-3d7b-32b1-b6a1-bcff3459ccdd 47358aac-2ec0-3d45-a837-f2069ca7cee3 18bdf01b-6ba6-30a8-a707-1f1458529d3d 71283e26-905b-3811-b9e0-c10c0253769b d0ba7a1b-f5ca-39d6-98d0-29c671baec65 29080565-8133-3274-80cf-6ea98078e50d 06e5ac08-f4cb-34ae-9406-3496f7cadc62 83faae69-e37e-4804-b7a9-684d4a900320 e4d53680-f7ef-364b-91a4-00e5aa91ab9b c94991c0-3662-3936-972c-1af63db486d8 79f3de22-c643-3e97-96d5-f77274a458c0 5bd6bd4d-3c89-3794-9935-2d044ce6ef37 f3cc42c7-84a8-35c5-8683-13878bb9beeb 9a25fd14-783b-35c3-ab2d-df4687f82b5e 7ccdda39-69b1-36d1-89c8-2acc3823264b 71d95611-9032-3787-a66e-e26313b08d46 b5a1b0b0-a7fc-3a47-af82-9b25a81a8c0b dd251cc5-736d-3b76-8ad3-3f6cb138178e 8a0ff1a2-9045-3be3-b67f-3914d88178ec 080b1ce2-9477-39ee-8233-b7f33e1dfe56 3dd173a6-8b21-3189-bd53-132919b96a48 2fff4135-98ec-3b82-a330-b73d8afdf36c 42c8449f-6e6d-3980-b54a-805eba6621c4 a1c1d559-0480-39d2-94f0-1a89f0226c4f cd2353c2-0fb6-3e18-8281-4c0df1d3189a d20c3612-a64d-3aa8-bd4a-58890413afbb 2716d83e-8c4f-39a3-a2a3-d5e255fe8a03 ea6895f2-504b-37b5-bfd0-cbf7017f22c3 3c3ed78e-1fcf-30ec-9e19-9bf142e2621d 20b00c37-4fe4-31dc-a258-dae253ae6992 c2f301b6-5d19-3296-a8ac-418ff48e052b fac8a63c-6b75-39d0-9f57-4344fde0f794 256c185c-284a-343e-93f4-894eed474edd 8c54e429-a3de-3eb3-96f3-d3127e2cc18f 41d69427-364c-366e-94a5-8e556bcac39f b436606f-daa4-337f-8103-4360bf4704d9 f84b4941-8e99-3957-b6f6-db1590338cf6 11420316-aec9-3ad9-8b4a-d618bcd180e9 48a52b7b-9391-3728-84f1-9aa6ca336214 f4d1a3c3-5002-336b-a67f-775b3725237e 2b6d18dc-4c95-3301-a498-3ed152798d5b ab83611b-436e-3de7-aad1-f0c9ad254196 389069d7-e6db-3d22-9328-e228c002bf75 e123ba3f-99bd-3039-b6e7-8c62eaebf9c2 12c3c14b-9cf2-3434-9a5d-e0bfa332f6ce 7da33189-2698-3a98-b038-b0e5a271ee96 3a789fb0-5cd2-3710-b8ea-f32fce38e3ca 7d3f2f76-2f4f-3762-bf0f-f94f79eb0404 16af3863-0d31-3cd1-8fa2-58053ffb953a 80da8956-f418-319c-9f49-3d47d9002546 eb222d5d-0052-3ce7-9b87-19e09054a2c0 b28a3715-4624-3a54-9652-b8f0b293a5a8 3b2e6033-f37f-3a73-9fab-88317b9b6095 a4e62775-131f-37c4-9239-c38e3b254dad f110598d-7e01-3ed7-a227-4e958987a31f 40870b19-3356-3e8e-a4a4-9f34eef8ea30 47972731-b0ea-3c38-a10f-5ffdd42329fc 991d11df-0265-3e41-b942-5b0c615d21e2 b81922e7-092f-3052-8cd1-fec6a6763295 
c858bd6a-81ab-3f54-b46d-ffc091ef6945 9807c577-0dc0-3116-864b-cf46a1276389 a6817756-af01-32ec-829f-d9e56ef7b6e8 95312039-73b9-35a2-9aec-905494a4f7f0 5d333477-796b-3e49-bf41-0cdbed39c8dd 86519a39-4ce9-3d0b-a3f9-dd9aa26a2b25 76c3f58f-9003-3bdb-90a3-b87cfbfa1c3b b213af37-7d89-342d-ae39-8a3c72159a01 cae56e40-8470-3c9c-af75-6e444189488f e50e7698-de3d-355f-aca2-eddd09c09533 4c33fc38-5e59-34f8-96ba-4e5a404d3988 44adf4c4-6064-362f-94d3-323ed42cfda9 da036982-92bf-36a8-b880-4ccf4e20b74e bdb9d309-f14b-3ff6-ad1f-5d3f3f95a13e 0b5142c1-420b-3fea-9e98-b87327ae22c6 0c3bad78-9f1e-395d-a376-2eb7499229fd bf382949-3515-3c16-b505-319442937a43 19f53e16-9f99-3035-9672-7e860f3b0048 e13c06cb-cd01-380e-946f-6d92ac1af49d 1f434d15-8745-3fba-9c3e-ccb026688397 1da4a0aa-22ae-3958-856d-05303de1f576 f3cd0d0d-8b71-3266-9732-d9f0d5778eb6 96dd6923-994c-3afe-9830-b15bdfd60f64 6fa5051b-0220-3e04-8ae3-7a199c2f5877 32835bfa-e53b-3526-9ec0-b0efcd11cbdf 4fae2ef6-7112-309a-b926-448a5a3e1802 a9a3d5d7-e0c6-3f24-af35-2acadc1aa2d9 1c8648f9-e7a1-3056-a2c0-19c8827a6a50 c45888cf-30f5-3e27-abeb-4f55caecc1f0 d4c7aa45-dfd6-3d71-bb8a-40efd5110d3b 34c79495-dbdf-393d-bcc6-e6f92f797628 87e61f5a-083c-305e-9ff4-5f699e85900a e95e20d1-7f04-34b9-9105-4333f11bf6b9 b0116f1c-f88f-3c09-b4bf-fc3c8ebeda56 9da07440-1001-3b00-a29f-c8bdc2f2b7d4 2ee0eda7-151a-3957-bab5-1e5370192122 c91f95de-d041-32f6-8b18-628a220be100 c6b7a5fb-8cd8-3ee2-8e99-b788eb02e731 ================================================ FILE: tools/data_converter/av2_val_split.txt ================================================ 22dcf96c-ef5e-376b-9db5-dc9f91040f5e 5b1d8b11-4f90-3577-be0b-193e102fda82 3f9796e9-c892-3915-b719-3292df878ece b5a7ff7e-d74a-3be6-b95d-3fc0042215f6 4d73c4eb-5de9-300c-b34f-ff5d0af89653 e40d67c5-3749-397e-aa2a-7dfe576a31b0 b43d449e-daaf-33a1-bb7f-3f7a0b5f056c 69c0ec7c-e289-3c4d-ade3-d2287ec34026 f5a3ee79-a131-3f8a-91e9-a6475d778149 d3dc783e-663a-31b1-bd85-46e04ca693db 9239d493-31d7-3dd0-a05f-03d50a242392 9946b521-ea55-3c52-9fd1-71afc3abf3c6 1579b300-e7f5-3318-97c2-2c827b0c411e 14f5485e-7417-3a5b-9be3-ec88461d03d4 41e31361-569b-3ed8-bafd-2308b7a9377e 4207ef92-0b3b-4708-8868-4ffcaef308e0 5d40499f-c9be-38b9-a0cb-cd234850ba85 73d86f1c-5e5c-3842-b671-7f29c78ccc55 fa708289-f2b2-399e-989e-53f83fa379c5 d1695c5e-08a9-44fd-8f45-93c23f700c8b dfc6d65f-20f5-389d-a5cd-81c1c7ecb11f 2c652f9e-8db8-3572-aa49-fae1344a875b 4a78c5db-041b-347b-9821-ceb82f99e3f8 677c7bcc-f29b-34ae-a91d-74cb863117c8 2a930061-3d8c-3915-8aac-f81199db95d8 b6c86134-d7e6-3af6-9db5-8aba3df4f7a7 b1527e96-5a5d-3adc-a893-314ab3a6012e a4087bac-8194-4c9e-8b2d-4bda58773a3c 58d01358-5927-36fa-9e11-d18d1dc1f4f0 87ce1d90-ca77-363b-a885-ec0ef6783847 460324ea-c769-38db-bba9-044643c8780e 074d2237-ed1b-34d7-a2fc-68edbce50bb2 e94f58d9-177b-31be-aa05-e6dd10d04124 c453a8e7-d3da-317a-946b-f8e9678a8582 31f062b7-dd17-3e7e-945d-198e91597de9 ff6adc87-5f47-32f7-b36a-546453c0e332 3c58172c-7a07-3ad4-bdf6-7cae60928c56 11995cbe-e076-3a35-910d-1e56ecf2c3c8 8feb3dbe-4450-3aeb-b22b-e65128aa696b 3cd2847c-604e-32b4-af19-6cd0da0dcdc5 8de6abb6-6589-3da7-8e21-6ecc80004a36 04973bcf-fc64-367c-9642-6d6c5f363b61 a4f72852-c2ff-35d3-8375-e52055508240 b9b1564c-66d0-4597-a664-2735cf2ffd04 bffb0c9e-5e3a-3251-ab5e-299491b53cbf 0b1b993a-68b3-3232-9afa-fc9942b5b79b 3b68c074-1680-3a93-92e5-5b711406f2fe c049334b-5568-3ca0-9b28-0c09d00b7bb3 ad870270-f3d8-3790-866a-78d61b5b76ee 78cbd619-8ded-35b8-87a1-38c4f4aeb82d 7ce85124-312b-35f0-a1a2-32206f75a947 5426cd2f-f4b9-3660-99d2-6617bb0f1b26 349c4c1c-9561-360f-9ae7-59772335d54b 96284bbc-6b58-330f-a5a6-76cd518543f0 
ef625e46-d0d4-38b9-9403-5614e7b39ec8 cd83b7cd-e2e7-34f6-bee7-1ff5ca3ed665 adc1fad7-de31-371f-810b-140576d9accc a98c14bf-bf01-3ae5-992d-ea9f0a18e3c7 0b324587-6097-3f92-a07a-a44f48c85d9e 35f32393-e82f-3b20-b214-1f6a43d60f23 36b38cbf-f6c5-3a12-8e7a-eb281cc9c2fc b9fcb487-363e-30a7-a316-a42dd81d8fe5 c222c78d-b574-4b9d-82e1-96a4f3f8bb27 51bbdd4d-3065-34ae-b369-b6e0444f34db ba67827f-6b99-3d2a-96ab-7c829eb999bb d5fa4d54-74ba-369c-a758-636441ad7f07 5f278cdd-ca28-3c53-8f5c-04e62308811d c865c156-0f26-411c-a16c-be985333f675 7b7f86ca-b430-3872-a131-ff5b4a6b5dcf 6da5d01e-54a7-3d7a-b86b-e0d6f8d3971d 3fca5366-2b2c-387b-b63c-7ae8f9e0cec1 5b614cfd-21c2-3b03-94c8-2a6c6bee166c 416f2e1c-0ffd-3089-97d2-0514b818f8d1 0c6e62d7-bdfa-3061-8d3d-03b13aa21f68 7e48bba5-438c-3813-9ce2-97c98868afed 756f4ed0-5352-31e4-b3c6-2841b9e779d7 6d3bfbc9-45dc-316e-a94c-a441371d0571 fd5c6932-2ee2-3cfb-9bdc-0b30bfb33a91 4a60c567-f167-3890-aa7e-01e75ccc40e0 87918291-e9ba-3759-be1a-4c874ca40997 4487b659-692e-3b35-9d1e-a230279ed646 aa539866-29e4-353e-95a9-b6d321b53b33 df1935dc-1e5f-3f4d-bdcb-e6c2bcb07667 087695bd-c662-3e86-83b4-aedc3b8eec36 19711b73-c43b-3922-be61-8c44df707a7d 1886b0d1-9c5e-326f-99df-30b64044638f b6642e23-d100-3680-8882-9f3b753b2eef 89f79c55-6698-3037-bd2e-d40c81af169a ca4144fb-10e5-3895-836f-87001f59ac65 dc3d4b79-6cd8-324b-bc70-cbd0e2a066da 28617035-7557-3cb9-99c2-754f72fd34b4 924116d9-0a48-3d97-b8c9-0d16b087c16a 3e7c4d87-dba1-3e22-a303-4f402f89cd20 2451c219-3002-3b2e-8fa9-2b7fea168b3b 5d9c1080-e6e9-3222-96a2-37ca7286a874 aa105408-2974-35e7-ae76-35060cfde21a 9efe1171-6faf-3427-8451-8f6469f7678e 9441ffdd-f06e-36e0-839e-b836b0f19bc9 20f785b0-e11a-3757-be79-b0731286c998 9fd55542-e982-361f-814f-61ad4ad07adf 2583a8ee-867d-3db6-b039-35b913fb8f70 b275d09d-9da2-380b-a748-528ee28bc9af e10475f7-0d56-3a75-870d-d4206fa165d7 120d7ac7-cce3-359e-a19c-1b9c0abd6be2 226199ab-c791-32a7-8bab-ab92878eb199 adbb2a17-a503-32cd-a9ed-b523b3e4da0b b8ce75e5-c1d2-3447-9249-70ab3d42389f 3b60751b-7a71-3a47-a743-96b96f0d9b2b 285ac213-8caf-31a4-b0fa-c240580f7f69 988ab841-c422-3d08-bb52-a09f8fdb6ab2 2ec904db-41aa-397c-a1e3-2e2ca0c8e8fb 5fe10166-ab1e-36d5-aa2b-c0d6f680f2c7 ba737c78-2ef2-3643-a5b2-4804dfff9d93 0526e68e-2ff1-3e53-b0f8-45df02e45a93 8934694e-8085-3673-96dd-eacebe691ed1 070bbf42-31d3-3aa9-aca4-c262afc9077d 5f8f4a26-59b1-3f70-bcab-b5e3e615d3bc 7de2e535-81df-3d5f-a5ca-62e4b940eb54 cd22abca-9150-3279-87a4-cb00ba517372 d89f80be-76d0-3853-8daa-76605cf4ce5e a7636fca-4d9e-3052-bef2-af0ce5d1df74 fbd62533-2d32-3c95-8590-7fd81bd68c87 7a8ec82c-1149-308b-8a12-477460843f35 e35a6aae-3608-38a7-b6e9-b5d6108b921d 5ea3cd9c-15d0-3b80-9cc4-02c8b5ad523a ed5fc860-c172-39c5-91c0-d712957fb1cd f2325996-961e-3f63-bbc0-44b7e76aeac9 7905533a-694b-35db-b39f-aec9e33fb3de c83da752-b12f-3fbd-b728-4abb9551723b b2d9d8a5-847b-3c3b-aed1-c414319d20af 131bd3d9-4f85-3ba3-b569-eb88308d79d5 e596b305-c951-3081-ae02-85406a473840 eb142141-683a-3a6d-a207-0302b1ff260d fdc0f552-4976-36a6-8691-9a8c6a5ba389 e68d1f0d-eb44-3751-975d-f80609f695ae 6ee06433-4820-3211-999a-95b79b2c692e 937093d8-7966-3df3-b334-0835595412b6 8940f5f1-13e0-3094-99ba-da2d17639774 919f13de-857f-3b1c-9f8e-7cbe500a60ae 5f5a25ff-ea07-3133-b5c6-26fada93f90f c93a30c8-168c-386c-a25a-cbd8d8410fbe e2e921fe-e489-3656-a0a2-5e17bd399ddf 27be7d34-ecb4-377b-8477-ccfd7cf4d0bc 9282db22-c361-3456-a7b5-414959f5f25e d70660da-4250-3ad1-a2d0-6a2d97b5379f 840b2b3f-5f52-32ae-b833-ad030063533d 87621780-827a-3df5-8fa5-a94267d2d807 307e27f2-6442-39a2-b62c-1e3d000cebaf 84ed050c-635f-36ec-9c28-8a0c10f5cf11 
================================================ FILE: tools/data_converter/nusc_split.py ================================================ TRAIN_SCENES = [ "scene-0002", "scene-0003", "scene-0004", "scene-0005", "scene-0006", "scene-0007", "scene-0008", "scene-0009", "scene-0012", "scene-0013", "scene-0014", "scene-0015", "scene-0016", "scene-0017", "scene-0018", "scene-0019", "scene-0021", "scene-0022", "scene-0023", "scene-0024", "scene-0025", "scene-0026", "scene-0027", "scene-0028", "scene-0029", "scene-0030", "scene-0031", "scene-0032", "scene-0033", "scene-0034", "scene-0035", "scene-0036", "scene-0039", "scene-0042", "scene-0043", "scene-0044", "scene-0045", "scene-0046", "scene-0047", "scene-0048", "scene-0049", "scene-0050", "scene-0051", "scene-0052", "scene-0055", "scene-0056", "scene-0057", "scene-0058", "scene-0059", "scene-0060", "scene-0061", "scene-0062", "scene-0063", "scene-0064", "scene-0065", "scene-0066", "scene-0067", "scene-0068", "scene-0069", "scene-0070", "scene-0071", "scene-0072", "scene-0073", "scene-0074", "scene-0075", "scene-0076", "scene-0092", "scene-0093", "scene-0094", "scene-0095", "scene-0096", "scene-0097", "scene-0098", "scene-0099", "scene-0100", "scene-0101", "scene-0102", "scene-0103", "scene-0104", "scene-0105", "scene-0106", "scene-0107", "scene-0108", "scene-0109", "scene-0110", "scene-0120", "scene-0123", "scene-0124", "scene-0125", "scene-0126", "scene-0127", "scene-0128", "scene-0129", "scene-0130", "scene-0131", "scene-0132", "scene-0133", "scene-0134", "scene-0135", "scene-0138", "scene-0149", "scene-0150", "scene-0151", "scene-0154", "scene-0155", "scene-0157", "scene-0158", "scene-0159", "scene-0161", "scene-0162", "scene-0163", "scene-0164", "scene-0165", "scene-0166", "scene-0167", "scene-0168", "scene-0170", "scene-0171", "scene-0172", "scene-0173", "scene-0174", "scene-0175", "scene-0176", "scene-0177", "scene-0178", "scene-0179", "scene-0180", "scene-0181", "scene-0182", "scene-0183", "scene-0185", "scene-0187", "scene-0188", "scene-0190", "scene-0191", "scene-0192", "scene-0193", "scene-0194", "scene-0195", "scene-0196", "scene-0199", "scene-0200", "scene-0202", "scene-0203", "scene-0204", "scene-0206", "scene-0207", "scene-0208", "scene-0209", "scene-0210", "scene-0211", "scene-0212", "scene-0213", "scene-0214", "scene-0218", "scene-0219", "scene-0220", "scene-0221", "scene-0222", "scene-0224", "scene-0225", "scene-0226", "scene-0227", "scene-0228", "scene-0229", "scene-0230", "scene-0231", "scene-0232", "scene-0233", "scene-0234", "scene-0235", "scene-0236", "scene-0237", "scene-0238", "scene-0239", "scene-0240", "scene-0241", "scene-0242", "scene-0243", "scene-0244", "scene-0245", "scene-0246", "scene-0247", "scene-0248", "scene-0249", "scene-0250", "scene-0251", "scene-0252", "scene-0253", "scene-0254", "scene-0255", "scene-0256", "scene-0257", "scene-0258", "scene-0259", "scene-0260", "scene-0261", "scene-0262", "scene-0263", "scene-0264", "scene-0268", "scene-0270", "scene-0271", "scene-0272", "scene-0273", "scene-0274", "scene-0275", "scene-0276", "scene-0277", "scene-0278", "scene-0283", "scene-0284", "scene-0285", "scene-0286", "scene-0287", "scene-0288", "scene-0289", "scene-0290", "scene-0291", "scene-0292", "scene-0293", "scene-0294", "scene-0295", "scene-0296", "scene-0297", "scene-0298", "scene-0299", "scene-0300", "scene-0301", "scene-0302", "scene-0303", "scene-0304", "scene-0305", "scene-0306", "scene-0315", "scene-0316", "scene-0317", "scene-0318", "scene-0321", "scene-0323", "scene-0324", "scene-0328", 
"scene-0329", "scene-0330", "scene-0331", "scene-0332", "scene-0344", "scene-0345", "scene-0346", "scene-0349", "scene-0350", "scene-0351", "scene-0352", "scene-0353", "scene-0354", "scene-0355", "scene-0356", "scene-0357", "scene-0358", "scene-0359", "scene-0360", "scene-0361", "scene-0362", "scene-0363", "scene-0364", "scene-0365", "scene-0367", "scene-0370", "scene-0371", "scene-0372", "scene-0373", "scene-0374", "scene-0375", "scene-0376", "scene-0377", "scene-0379", "scene-0380", "scene-0381", "scene-0382", "scene-0383", "scene-0384", "scene-0385", "scene-0386", "scene-0388", "scene-0399", "scene-0400", "scene-0401", "scene-0402", "scene-0403", "scene-0405", "scene-0406", "scene-0407", "scene-0408", "scene-0420", "scene-0421", "scene-0422", "scene-0423", "scene-0424", "scene-0425", "scene-0426", "scene-0427", "scene-0428", "scene-0429", "scene-0430", "scene-0431", "scene-0432", "scene-0433", "scene-0434", "scene-0435", "scene-0436", "scene-0437", "scene-0438", "scene-0439", "scene-0440", "scene-0441", "scene-0442", "scene-0443", "scene-0444", "scene-0445", "scene-0446", "scene-0447", "scene-0448", "scene-0449", "scene-0450", "scene-0451", "scene-0452", "scene-0453", "scene-0454", "scene-0455", "scene-0456", "scene-0457", "scene-0458", "scene-0459", "scene-0461", "scene-0462", "scene-0463", "scene-0464", "scene-0465", "scene-0467", "scene-0468", "scene-0469", "scene-0471", "scene-0472", "scene-0474", "scene-0475", "scene-0476", "scene-0477", "scene-0478", "scene-0479", "scene-0480", "scene-0499", "scene-0500", "scene-0501", "scene-0502", "scene-0504", "scene-0505", "scene-0506", "scene-0507", "scene-0508", "scene-0509", "scene-0510", "scene-0511", "scene-0512", "scene-0513", "scene-0514", "scene-0515", "scene-0517", "scene-0518", "scene-0519", "scene-0520", "scene-0521", "scene-0522", "scene-0523", "scene-0524", "scene-0552", "scene-0553", "scene-0554", "scene-0555", "scene-0559", "scene-0560", "scene-0561", "scene-0562", "scene-0563", "scene-0564", "scene-0565", "scene-0584", "scene-0585", "scene-0586", "scene-0587", "scene-0588", "scene-0589", "scene-0590", "scene-0591", "scene-0592", "scene-0593", "scene-0594", "scene-0595", "scene-0596", "scene-0597", "scene-0598", "scene-0599", "scene-0600", "scene-0625", "scene-0626", "scene-0627", "scene-0629", "scene-0630", "scene-0632", "scene-0633", "scene-0634", "scene-0635", "scene-0636", "scene-0637", "scene-0638", "scene-0639", "scene-0640", "scene-0652", "scene-0653", "scene-0654", "scene-0655", "scene-0656", "scene-0657", "scene-0658", "scene-0659", "scene-0660", "scene-0661", "scene-0662", "scene-0663", "scene-0664", "scene-0665", "scene-0666", "scene-0667", "scene-0668", "scene-0669", "scene-0670", "scene-0671", "scene-0672", "scene-0673", "scene-0674", "scene-0675", "scene-0676", "scene-0677", "scene-0678", "scene-0679", "scene-0681", "scene-0683", "scene-0684", "scene-0685", "scene-0686", "scene-0687", "scene-0688", "scene-0689", "scene-0695", "scene-0696", "scene-0697", "scene-0698", "scene-0700", "scene-0701", "scene-0703", "scene-0704", "scene-0705", "scene-0706", "scene-0707", "scene-0708", "scene-0709", "scene-0710", "scene-0711", "scene-0712", "scene-0713", "scene-0714", "scene-0715", "scene-0716", "scene-0717", "scene-0718", "scene-0719", "scene-0726", "scene-0727", "scene-0728", "scene-0730", "scene-0731", "scene-0733", "scene-0734", "scene-0735", "scene-0736", "scene-0737", "scene-0738", "scene-0780", "scene-0781", "scene-0782", "scene-0783", "scene-0784", "scene-0786", "scene-0787", "scene-0789", "scene-0790", 
"scene-0791", "scene-0792", "scene-0802", "scene-0806", "scene-0808", "scene-0809", "scene-0810", "scene-0811", "scene-0812", "scene-0813", "scene-0815", "scene-0816", "scene-0817", "scene-0819", "scene-0820", "scene-0821", "scene-0822", "scene-0847", "scene-0848", "scene-0849", "scene-0850", "scene-0851", "scene-0852", "scene-0853", "scene-0854", "scene-0855", "scene-0856", "scene-0858", "scene-0860", "scene-0861", "scene-0862", "scene-0863", "scene-0864", "scene-0865", "scene-0866", "scene-0868", "scene-0869", "scene-0870", "scene-0871", "scene-0872", "scene-0873", "scene-0875", "scene-0876", "scene-0877", "scene-0878", "scene-0880", "scene-0882", "scene-0883", "scene-0884", "scene-0885", "scene-0886", "scene-0887", "scene-0888", "scene-0889", "scene-0890", "scene-0891", "scene-0892", "scene-0893", "scene-0894", "scene-0895", "scene-0896", "scene-0897", "scene-0898", "scene-0899", "scene-0900", "scene-0901", "scene-0902", "scene-0903", "scene-0904", "scene-0905", "scene-0906", "scene-0907", "scene-0908", "scene-0909", "scene-0916", "scene-0917", "scene-0921", "scene-0922", "scene-0923", "scene-0925", "scene-0926", "scene-0927", "scene-0928", "scene-0929", "scene-0930", "scene-0931", "scene-0945", "scene-0947", "scene-0949", "scene-0952", "scene-0953", "scene-0955", "scene-0956", "scene-0957", "scene-0958", "scene-0959", "scene-0960", "scene-0961", "scene-0966", "scene-0967", "scene-0968", "scene-0969", "scene-0971", "scene-0972", "scene-0975", "scene-0976", "scene-0977", "scene-0978", "scene-0979", "scene-0980", "scene-0981", "scene-0982", "scene-0983", "scene-0984", "scene-0988", "scene-0989", "scene-0990", "scene-0991", "scene-0992", "scene-0994", "scene-0995", "scene-0996", "scene-0997", "scene-0998", "scene-0999", "scene-1000", "scene-1001", "scene-1004", "scene-1005", "scene-1006", "scene-1007", "scene-1008", "scene-1009", "scene-1010", "scene-1011", "scene-1012", "scene-1013", "scene-1014", "scene-1015", "scene-1019", "scene-1020", "scene-1021", "scene-1022", "scene-1023", "scene-1024", "scene-1025", "scene-1044", "scene-1045", "scene-1046", "scene-1047", "scene-1048", "scene-1049", "scene-1050", "scene-1051", "scene-1052", "scene-1053", "scene-1054", "scene-1064", "scene-1065", "scene-1066", "scene-1067", "scene-1068", "scene-1069", "scene-1070", "scene-1071", "scene-1072", "scene-1073", "scene-1074", "scene-1075", "scene-1076", "scene-1077", "scene-1078", "scene-1079", "scene-1080", "scene-1081", "scene-1082", "scene-1083", "scene-1084", "scene-1085", "scene-1086", "scene-1087", "scene-1088", "scene-1089", "scene-1090", "scene-1091", "scene-1092", "scene-1093", "scene-1094", "scene-1095", "scene-1096", "scene-1097", "scene-1098", "scene-1099", "scene-1100", "scene-1101", "scene-1102", "scene-1104", "scene-1105", "scene-1106", "scene-1107", "scene-1108", "scene-1109", "scene-1110"] VAL_SCENES = [ "scene-0001", "scene-0010", "scene-0011", "scene-0020", "scene-0038", "scene-0041", "scene-0053", "scene-0054", "scene-0121", "scene-0122", "scene-0139", "scene-0152", "scene-0160", "scene-0184", "scene-0269", "scene-0347", "scene-0348", "scene-0366", "scene-0368", "scene-0369", "scene-0378", "scene-0389", "scene-0390", "scene-0391", "scene-0392", "scene-0393", "scene-0394", "scene-0395", "scene-0396", "scene-0397", "scene-0398", "scene-0411", "scene-0412", "scene-0413", "scene-0414", "scene-0415", "scene-0416", "scene-0417", "scene-0418", "scene-0419", "scene-0525", "scene-0526", "scene-0527", "scene-0528", "scene-0529", "scene-0530", "scene-0531", "scene-0532", "scene-0533", 
"scene-0534", "scene-0535", "scene-0536", "scene-0537", "scene-0538", "scene-0539", "scene-0541", "scene-0542", "scene-0543", "scene-0544", "scene-0545", "scene-0546", "scene-0556", "scene-0557", "scene-0558", "scene-0566", "scene-0568", "scene-0570", "scene-0571", "scene-0572", "scene-0573", "scene-0574", "scene-0575", "scene-0576", "scene-0577", "scene-0578", "scene-0580", "scene-0582", "scene-0583", "scene-0642", "scene-0643", "scene-0644", "scene-0645", "scene-0646", "scene-0647", "scene-0648", "scene-0649", "scene-0650", "scene-0651", "scene-0739", "scene-0740", "scene-0741", "scene-0744", "scene-0746", "scene-0747", "scene-0749", "scene-0750", "scene-0751", "scene-0752", "scene-0757", "scene-0758", "scene-0759", "scene-0760", "scene-0761", "scene-0762", "scene-0763", "scene-0764", "scene-0765", "scene-0767", "scene-0768", "scene-0769", "scene-0770", "scene-0771", "scene-0775", "scene-0777", "scene-0778", "scene-0794", "scene-0795", "scene-0796", "scene-0797", "scene-0798", "scene-0799", "scene-0800", "scene-0803", "scene-0804", "scene-0911", "scene-0912", "scene-0913", "scene-0914", "scene-0915", "scene-0919", "scene-0920", "scene-0924", "scene-0962", "scene-0963", "scene-1002", "scene-1003", "scene-1016", "scene-1017", "scene-1018", "scene-1055", "scene-1056", "scene-1057", "scene-1058", "scene-1059", "scene-1060", "scene-1061", "scene-1062", "scene-1063"] CALIBRATION_SCENES = [ "scene-0852", "scene-0429", "scene-0956", "scene-0194", "scene-0811", "scene-1110", "scene-1107", "scene-0294", "scene-0900", "scene-0596", "scene-0296", "scene-0885", "scene-0866", "scene-0105", "scene-0782", "scene-0191", "scene-0876", "scene-0133", "scene-0231", "scene-0847", "scene-0363", "scene-0026", "scene-0791", "scene-0909", "scene-0002", "scene-0283", "scene-0007", "scene-0251", "scene-1100", "scene-0668", "scene-0584", "scene-0287", "scene-0260", "scene-0171", "scene-0789", "scene-0108", "scene-0190", "scene-0206", "scene-0635", "scene-0815", "scene-0058", "scene-0710", "scene-0302", "scene-0639", "scene-0166", "scene-0094", "scene-0735", "scene-0321", "scene-1091", "scene-0344" ] ================================================ FILE: tools/data_converter/nuscenes_converter.py ================================================ import mmcv import numpy as np from os import path as osp from pyquaternion import Quaternion import argparse from nusc_split import TRAIN_SCENES, VAL_SCENES nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier') nus_attributes = ('cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', 'pedestrian.standing', 'pedestrian.sitting_lying_down', 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None') FAIL_SCENES = ['scene-0499', 'scene-0502', 'scene-0515', 'scene-0517'] def parse_args(): parser = argparse.ArgumentParser(description='Data converter arg parser') parser.add_argument( '--data-root', type=str, help='specify the root path of dataset') parser.add_argument( '--newsplit', action='store_true') parser.add_argument( '-v','--version', choices=['v1.0-mini', 'v1.0-trainval', 'v1.0-test'], default='v1.0-trainval') args = parser.parse_args() return args def create_nuscenes_infos_map(root_path, dest_path=None, info_prefix='nuscenes', version='v1.0-trainval', new_split=False): """Create info file for map learning task on nuscene dataset. Given the raw data, generate its related info file in pkl format. Args: root_path (str): Path of the data root. 
info_prefix (str): Prefix of the info file to be generated. version (str): Version of the data. Default: 'v1.0-trainval' """ from nuscenes.nuscenes import NuScenes nusc = NuScenes(version=version, dataroot=root_path, verbose=True) from nuscenes.utils import splits assert version in ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] if version == 'v1.0-trainval': train_scenes = splits.train val_scenes = splits.val elif version == 'v1.0-test': train_scenes = splits.test val_scenes = [] else: train_scenes = splits.mini_train val_scenes = splits.mini_val if new_split: train_scenes = TRAIN_SCENES val_scenes = VAL_SCENES test = 'test' in version if test: print('test scene: {}'.format(len(train_scenes))) else: print('train scene: {}, val scene: {}'.format( len(train_scenes), len(val_scenes))) train_samples, val_samples, test_samples = [], [], [] train_sample_idx = 0 val_sample_idx = 0 for sample in mmcv.track_iter_progress(nusc.sample): lidar_token = sample['data']['LIDAR_TOP'] sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) cs_record = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) #mmcv.check_file_exist(lidar_path) scene_record = nusc.get('scene', sample['scene_token']) log_record = nusc.get('log', scene_record['log_token']) location = log_record['location'] scene_name = scene_record['name'] if scene_name in FAIL_SCENES: continue info = { 'lidar_path': lidar_path, 'token': sample['token'], 'cams': {}, 'lidar2ego_translation': cs_record['translation'], 'lidar2ego_rotation': cs_record['rotation'], 'e2g_translation': pose_record['translation'], 'e2g_rotation': pose_record['rotation'], 'timestamp': sample['timestamp'], 'location': location, 'scene_name': scene_name } # obtain 6 image's information per frame camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] for cam in camera_types: cam_token = sample['data'][cam] sd_rec = nusc.get('sample_data', cam_token) cs_record = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) cam2ego_rotation = Quaternion(cs_record['rotation']).rotation_matrix cam2ego_translation = np.array(cs_record['translation']) ego2cam_rotation = cam2ego_rotation.T ego2cam_translation = ego2cam_rotation.dot(-cam2ego_translation) transform_matrix = np.eye(4) #ego2cam transform_matrix[:3, :3] = ego2cam_rotation transform_matrix[:3, 3] = ego2cam_translation cam_info = dict( extrinsics=transform_matrix, # ego2cam intrinsics=cs_record['camera_intrinsic'], img_fpath=str(nusc.get_sample_data_path(sd_rec['token'])) ) info['cams'][cam] = cam_info if scene_name in train_scenes: info.update({ 'sample_idx': train_sample_idx, 'prev': train_sample_idx - 1, 'next': train_sample_idx + 1, }) if sample['prev'] == '': info['prev'] = -1 if sample['next'] == '': info['next'] = -1 train_samples.append(info) train_sample_idx += 1 elif scene_name in val_scenes: info.update({ 'sample_idx': val_sample_idx, 'prev': val_sample_idx - 1, 'next': val_sample_idx + 1, }) if sample['prev'] == '': info['prev'] = -1 if sample['next'] == '': info['next'] = -1 val_sample_idx += 1 val_samples.append(info) else: test_samples.append(info) if dest_path is None: dest_path = root_path if test: info_path = osp.join(dest_path, f'{info_prefix}_map_infos_test.pkl') print(f'saving test set to {info_path}') mmcv.dump(test_samples, info_path) else: # for training set if new_split: info_path = osp.join(dest_path, 
f'{info_prefix}_map_infos_train_newsplit.pkl') else: info_path = osp.join(dest_path, f'{info_prefix}_map_infos_train.pkl') print(f'saving training set to {info_path}') mmcv.dump(train_samples, info_path) # for val set if new_split: info_path = osp.join(dest_path, f'{info_prefix}_map_infos_val_newsplit.pkl') else: info_path = osp.join(dest_path, f'{info_prefix}_map_infos_val.pkl') print(f'saving validation set to {info_path}') mmcv.dump(val_samples, info_path) if __name__ == '__main__': args = parse_args() create_nuscenes_infos_map(root_path=args.data_root, version=args.version, new_split=args.newsplit) ================================================ FILE: tools/dist_test.sh ================================================ #!/usr/bin/env bash CONFIG=$1 CHECKPOINT=$2 GPUS=$3 PORT=${PORT:-29500} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} ================================================ FILE: tools/dist_train.sh ================================================ #!/usr/bin/env bash CONFIG=$1 GPUS=$2 PORT=${PORT:-29500} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} ================================================ FILE: tools/mmdet_test.py ================================================ import os.path as osp import pickle import shutil import tempfile import time import mmcv import torch import torch.distributed as dist from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info from mmdet.core import encode_mask_results def single_gpu_test(model, data_loader, show=False, out_dir=None, show_score_thr=0.3): model.eval() results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) batch_size = len(result) if show or out_dir: if batch_size == 1 and isinstance(data['img'][0], torch.Tensor): img_tensor = data['img'][0] else: img_tensor = data['img'][0].data[0] img_metas = data['img_metas'][0].data[0] imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) assert len(imgs) == len(img_metas) for i, (img, img_meta) in enumerate(zip(imgs, img_metas)): h, w, _ = img_meta['img_shape'] img_show = img[:h, :w, :] ori_h, ori_w = img_meta['ori_shape'][:-1] img_show = mmcv.imresize(img_show, (ori_w, ori_h)) if out_dir: out_file = osp.join(out_dir, img_meta['ori_filename']) else: out_file = None model.module.show_result( img_show, result[i], show=show, out_file=out_file, score_thr=show_score_thr) # encode mask results if isinstance(result[0], tuple): result = [(bbox_results, encode_mask_results(mask_results)) for bbox_results, mask_results in result] results.extend(result) for _ in range(batch_size): prog_bar.update() return results def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. 
tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: list: The prediction results. """ model.eval() results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) time.sleep(2) # This line can prevent deadlock problem in some cases. for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) # encode mask results # if isinstance(result[0], tuple): # result = [(bbox_results, encode_mask_results(mask_results)) # for bbox_results, mask_results in result] results.extend(result) if rank == 0: batch_size = len(result) for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks if gpu_collect: results = collect_results_gpu(results, len(dataset)) else: results = collect_results_cpu(results, len(dataset), tmpdir) return results def collect_results_cpu(result_part, size, tmpdir=None): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: mmcv.mkdir_or_exist('.dist_test') tmpdir = tempfile.mkdtemp(dir='.dist_test') tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, f'part_{i}.pkl') part_list.append(mmcv.load(part_file)) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def collect_results_gpu(result_part, size): rank, world_size = get_dist_info() # dump result part to tensor with pickle part_tensor = torch.tensor( bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') # gather all result part tensor shape shape_tensor = torch.tensor(part_tensor.shape, device='cuda') shape_list = [shape_tensor.clone() for _ in range(world_size)] dist.all_gather(shape_list, shape_tensor) # padding result part tensor to max length shape_max = torch.tensor(shape_list).max() part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') part_send[:shape_tensor[0]] = part_tensor part_recv_list = [ part_tensor.new_zeros(shape_max) for _ in range(world_size) ] # gather all result part dist.all_gather(part_recv_list, part_send) if rank == 0: part_list = [] for recv, shape in zip(part_recv_list, shape_list): part_list.append( pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) # sort the results ordered_results = [] for res in zip(*part_list): ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results ================================================ FILE: tools/mmdet_train.py ================================================ import random import warnings import numpy as np import torch from mmcv.parallel import MMDataParallel, 
MMDistributedDataParallel from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, Fp16OptimizerHook, OptimizerHook, build_optimizer, build_runner) from mmcv.utils import build_from_cfg from mmdet.core import DistEvalHook, EvalHook from mmdet.datasets import (build_dataloader, build_dataset, replace_ImageToTensor) from mmdet.utils import get_root_logger def set_random_seed(seed, deterministic=False): """Set random seed. Args: seed (int): Seed to be used. deterministic (bool): Whether to set the deterministic option for CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` to True and `torch.backends.cudnn.benchmark` to False. Default: False. """ random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) if deterministic: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) if 'runner' not in cfg: cfg.runner = { 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs } warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) else: if 'total_epochs' in cfg: assert cfg.total_epochs == cfg.runner.max_epochs runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation 
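# (pops 'samples_per_gpu' from the val config; when it is greater than 1, the val pipeline's 'ImageToTensor' is replaced by 'DefaultFormatBundle' so batched samples can be collated, before building the val dataset and dataloader below)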
val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = DistEvalHook if distributed else EvalHook runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) ================================================ FILE: tools/slurm_test.sh ================================================ #!/usr/bin/env bash set -x PARTITION=$1 JOB_NAME=$2 CONFIG=$3 CHECKPOINT=$4 GPUS=${GPUS:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8} CPUS_PER_TASK=${CPUS_PER_TASK:-5} PY_ARGS=${@:5} SRUN_ARGS=${SRUN_ARGS:-""} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --cpus-per-task=${CPUS_PER_TASK} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} ================================================ FILE: tools/slurm_train.sh ================================================ #!/usr/bin/env bash set -x PARTITION=$1 JOB_NAME=$2 CONFIG=$3 WORK_DIR=$4 GPUS=${GPUS:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8} CPUS_PER_TASK=${CPUS_PER_TASK:-5} SRUN_ARGS=${SRUN_ARGS:-""} PY_ARGS=${@:5} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --cpus-per-task=${CPUS_PER_TASK} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} ================================================ FILE: tools/test.py ================================================ import argparse import mmcv import os import os.path as osp import torch import warnings from mmcv import Config, DictAction from mmcv.cnn import fuse_conv_bn from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, wrap_fp16_model) from mmdet3d.apis import single_gpu_test from mmdet3d.datasets import build_dataset from mmdet3d.models import build_model # from mmdet_test import multi_gpu_test from mmdet_train import set_random_seed from mmdet.datasets import replace_ImageToTensor def parse_args(): parser = argparse.ArgumentParser( description='MMDet test (and eval) a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', type=str, help='checkpoint file') parser.add_argument('--work-dir', 
help='the dir to save logs and models') parser.add_argument('--result-path', help='submission file in pickle format to be evaluated') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--format-only', action='store_true', help='Format the output results without perform evaluation. It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', action='store_true', help='whether to run evaluation.') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where results will be saved') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function (deprecate), ' 'change to --eval-options instead.') parser.add_argument( '--eval-options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.eval_options: raise ValueError( '--options and --eval-options cannot be both specified, ' '--options is deprecated in favor of --eval-options') if args.options: warnings.warn('--options is deprecated in favor of --eval-options') args.eval_options = args.options return args def main(): args = parse_args() assert args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. 
if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # import modules from plguin/xx, registry will be updated import sys sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib if hasattr(cfg, 'plugin_dir'): def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs,list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) else: # import dir is the dirpath for the config file _module_dir = os.path.dirname(args.config) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) cfg.model.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # set random seeds if args.seed is not None: set_random_seed(args.seed, deterministic=args.deterministic) # build the dataloader if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) cfg.data.test.work_dir = cfg.work_dir print('work_dir: ',cfg.work_dir) dataset = build_dataset(cfg.data.test) if args.result_path: outputs = args.result_path dataset._evaluate(args.result_path) return from plugin.datasets.builder import build_dataloader data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False, shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), ) from plugin.core.apis.test import custom_multi_gpu_test as multi_gpu_test # build the model and load checkpoint cfg.model.train_cfg = None model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') # embed() if args.fuse_conv_bn: model = fuse_conv_bn(model) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: kwargs = {} if args.eval_options is None else args.eval_options if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: eval_kwargs = cfg.get('evaluation', {}).copy() if args.eval_options is not None: eval_kwargs.update(args.eval_options) # hard-code way to remove EvalHook args for key in [ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule' ]: eval_kwargs.pop(key, None) print('start evaluation!') print(dataset.evaluate(outputs, **eval_kwargs)) if __name__ == '__main__': main() ================================================ FILE: tools/tracking/calculate_cmap.py ================================================ import argparse from mmcv import Config from mmdet3d.datasets import build_dataset import cv2 import torch import numpy as np import pickle import time from cmap_utils.utils import * from cmap_utils.match_utils import * from cmap_utils.data_utils import * font = cv2.FONT_HERSHEY_SIMPLEX location = (200,60) fontScale = 2 fontColor = (255,0,0) thickness = 2 lineType = 2 cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } id2cat = { 0:'ped_crossing', 1:'divider', 2:'boundary', } COLOR_MAPS_BGR = { # bgr colors 'divider': (0, 0, 255), 'boundary': (0, 255, 0), 'ped_crossing': (255, 0, 0), 'centerline': (51, 183, 255), 'drivable_area': (171, 255, 255) } COLOR_MAPS_PLT = { 'divider': 'r', 'boundary': 'g', 'ped_crossing': 'b', 'centerline': 'orange', 'drivable_area': 'y', } INTERP_NUM = 200 N_WORKERS = 0 def parse_args(): parser = argparse.ArgumentParser( description='Visualize groundtruth and results') parser.add_argument('config', help='config file path') 
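# remaining options: score threshold for filtering predictions, path to the prediction results pickle, and the consistency settings (--consist, --cons_frames) used by the consistent-mAP evaluation below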
parser.add_argument('--thr', type=float, default=0.4, help='score threshold to filter predictions') parser.add_argument( '--result_path', default=None, help='directory to submission file') parser.add_argument( '--consist', default=1, type=int, help='whether to use the consistent criterion' ) parser.add_argument( '--cons_frames', default=5, help='consective frames for cons metric' ) args = parser.parse_args() return args def instance_match(pred_lines, scores, gt_lines, threshold, metric='chamfer'): ### obtain tp,fp,score for a frame based on chamfer distance num_preds = pred_lines.shape[0] num_gts = gt_lines.shape[0] # tp and fp tp = np.zeros((num_preds), dtype=np.float32) fp = np.zeros((num_preds), dtype=np.float32) if num_gts == 0: fp[...] = 1 return (tp.copy(),fp.copy()) if num_preds == 0: return (tp.copy(),fp.copy()) assert pred_lines.shape[1] == gt_lines.shape[1], \ "sample points num should be the same" matrix = np.zeros((num_preds, num_gts)) matrix = chamfer_distance_batch(pred_lines, gt_lines) matrix_min = matrix.min(axis=1) matrix_argmin = matrix.argmin(axis=1) sort_inds = np.argsort(-scores) tp = np.zeros((num_preds), dtype=np.float32) fp = np.zeros((num_preds), dtype=np.float32) gt_covered = np.zeros(num_gts, dtype=bool) for i in sort_inds: if matrix_min[i] <= threshold: matched_gt = matrix_argmin[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True tp[i] = 1 else: fp[i] = 1 else: fp[i] = 1 return (tp.copy(),fp.copy()) def _evaluate_single(pred_vectors, scores, gt_vectors, threshold, metric='chamfer'): ### collect tp-fp-score information pred_lines = np.array(pred_vectors) gt_lines = np.array(gt_vectors) if len(pred_lines) == 0 or len(gt_lines)==0: tp_fp_score = np.zeros((0,3)) return tp_fp_score scores = np.array(scores) tp_fp_list = instance_match(pred_lines, scores, gt_lines, threshold, metric) # (M, 2) tp, fp = tp_fp_list tp_fp_score = np.hstack([tp[:, None], fp[:, None], scores[:, None]]) return tp_fp_score def match_gt_w_pred(curr_data,curr_data_gt,thresh): ### find local id matching between predicted vector and gt vectors curr_vectors_np = {label: [] for label in cat2id.values()} curr_scores_np = {label: [] for label in cat2id.values()} for i in range(len(curr_data['labels'])): score = curr_data['scores'][i] label = curr_data['labels'][i] v = curr_data['vectors'][i] curr_vectors_np[label].append(v) curr_scores_np[label].append(score) curr_vectors = {} for label, vecs in curr_vectors_np.items(): if len(vecs) > 0: vecs = np.stack(vecs, 0) vecs = torch.tensor(vecs) curr_vectors[label] = vecs else: curr_vectors[label] = vecs curr_vectors_gt_np = curr_data_gt curr_vectors_gt = {} for label, vecs in curr_vectors_gt_np.items(): if len(vecs) > 0: vecs_np = [] for vec in vecs: vecs_np.append(vec) vecs = np.stack(vecs_np, 0) vecs = torch.tensor(vecs) curr_vectors_gt[label] = vecs else: curr_vectors_gt[label] = vecs pred2gt_matchings = find_matchings_chamfer(curr_vectors,curr_vectors_gt,curr_scores_np,thresh=thresh) return pred2gt_matchings def get_scene_matching_result(gts,pred_results,scene_name2token,scene_name,thresh=1.5): ### obtain local id matching of a scene start_token = scene_name2token[scene_name][0] vectors_seq = [] scores_seq = [] pred_matching_seq = [] vectors_gt_seq = [] pred2gt_matchings_seq = [] choose_scene = pred_results[start_token]['scene_name'] for local_idx,token in enumerate(scene_name2token[scene_name]): prev_data = pred_results[token] gt_vectors = gts[token] assert prev_data['scene_name'] == choose_scene assert prev_data['local_idx'] == local_idx 
vectors_gt_seq.append(gt_vectors) vectors = {label: [] for label in cat2id.values()} scores = {label: [] for label in cat2id.values()} pred_matching = {label: [] for label in cat2id.values()} for i in range(len(prev_data['labels'])): score, label, v,pred_glb_id = \ prev_data['scores'][i], prev_data['labels'][i], prev_data['vectors'][i], prev_data['global_ids'][i] vectors[label].append(v) scores[label].append(score) pred_matching[label].append(pred_glb_id) pred_matching_seq.append(pred_matching) vectors_seq.append(vectors) scores_seq.append(scores) pred2gt_matchings = match_gt_w_pred(prev_data,gt_vectors, thresh) pred2gt_matchings_seq.append(pred2gt_matchings) return vectors_seq, pred_matching_seq, pred2gt_matchings_seq def pred2gt_global_matching(ids_info,ids_info_gt,pred2gt_seq): ### obtain global id matching between predicted vectors and gt vectors of a scene pred2gt_global_seq = [] for frame_idx in range(len(pred2gt_seq)): f_match = pred2gt_seq[frame_idx] f_ids_info = ids_info[frame_idx] f_ids_info_gt = ids_info_gt[frame_idx] pred2gt_match_dict = {} for label in f_ids_info.keys(): pred2gt_match_dict[label] = {} f_label_match = f_match[label][0] f_ids_label_info,f_ids_label_info_gt = f_ids_info[label],f_ids_info_gt[label] for pred_match_idx, gt_match_idx in enumerate(f_label_match): pred_glb_match_idx = f_ids_label_info[pred_match_idx] if gt_match_idx != -1: gt_glb_match_idx = f_ids_label_info_gt[gt_match_idx] else: gt_glb_match_idx = -1 pred2gt_match_dict[label][pred_glb_match_idx] = gt_glb_match_idx pred2gt_global_seq.append(pred2gt_match_dict) return pred2gt_global_seq def get_tpfp_from_scene_single(scene_name,args,scene_name2token,pred_results,gts, gt_matching,threshold): ### generate tp-fp list in a single scene tpfp_score_record = {0:[],1:[],2:[]} scene_gt_matching = gt_matching[scene_name]['instance_ids'] if args.consist: vectors_seq, scene_pred_matching,pred2gt_seq \ = get_scene_matching_result(gts,pred_results,scene_name2token,scene_name,threshold) pred2gt_global_seq = pred2gt_global_matching(scene_pred_matching,scene_gt_matching,pred2gt_seq) vectors_seq = [] scores_seq = [] gt_flag_dict = {label:{} for label in cat2id.values()} for frame_idx, token in enumerate(scene_name2token[scene_name]): prev_data = pred_results[token] vectors_gt = gts[token] vectors = {label: [] for label in cat2id.values()} scores = {label: [] for label in cat2id.values()} for i in range(len(prev_data['labels'])): score, label, v = prev_data['scores'][i], prev_data['labels'][i], prev_data['vectors'][i] vectors[label].append(v) scores[label].append(score) for label in cat2id.values(): tpfp_score = _evaluate_single(vectors[label], scores[label], vectors_gt[label] ,threshold) if args.consist: #### deal with the consistency part for vec_idx,single_tpfp_score in enumerate(tpfp_score): curr_pred2gt_match = pred2gt_global_seq[frame_idx][label] ### pred_global_id: gt_global_id pred_local2global_mapping = scene_pred_matching[frame_idx][label] match_glb_pred_idx = pred_local2global_mapping[vec_idx] ### match_glb_gt_idx = curr_pred2gt_match[match_glb_pred_idx] if match_glb_gt_idx not in gt_flag_dict[label].keys(): gt_flag_dict[label][match_glb_gt_idx] = match_glb_pred_idx else: if match_glb_pred_idx != gt_flag_dict[label][match_glb_gt_idx]: tpfp_score[vec_idx][:2] = np.array([0,1]) tpfp_score_record[label].append(tpfp_score) vectors_seq.append(vectors) scores_seq.append(scores) return tpfp_score_record def get_mAP(tpfp_score_record,num_gts,threshold): ### calculate mean AP given tp-fp-score record result_dict = {} 
for cat_name,label in cat2id.items(): sum_AP = 0 result_dict[cat_name] = {} tp_fp_score = [np.vstack(i[label]) for i in tpfp_score_record] tp_fp_score = np.vstack(tp_fp_score) sort_inds = np.argsort(-tp_fp_score[:, -1]) tp = tp_fp_score[sort_inds, 0] fp = tp_fp_score[sort_inds, 1] tp = np.cumsum(tp, axis=0) fp = np.cumsum(fp, axis=0) eps = np.finfo(np.float32).eps recalls = tp / np.maximum(num_gts[label], eps) precisions = tp/np.maximum(tp+fp, eps) AP = average_precision(recalls, precisions, 'area') sum_AP += AP result_dict[cat_name].update({f'AP@{threshold}': AP}) return result_dict def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) dataset = build_dataset(cfg.eval_config) dataset[0] scene_name2idx = {} scene_name2token = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] token = sample['token'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2token[scene] = [] scene_name2idx[scene].append(idx) scene_name2token[scene].append(token) all_scene_names = sorted(list(scene_name2idx.keys())) gt_matching_path = cfg.eval_config.ann_file.replace('.pkl','_gt_tracks.pkl',) with open(gt_matching_path,'rb') as pf: gt_matching = pickle.load(pf) pred_matching_path = args.result_path with open(pred_matching_path,'rb') as ppf: pred_matching_result_raw = pickle.load(ppf) roi_size = torch.tensor(cfg.roi_size).numpy() origin = torch.tensor(cfg.pc_range[:2]).numpy() if roi_size[0] == 60: thresholds_list = [0.5,1.0,1.5] elif roi_size[0] == 100: thresholds_list = [1.0, 1.5, 2.0] else: raise ValueError('roi size {} not supported, check again...'.format(roi_size)) if 'newsplit' in args.result_path: gts = get_gts(dataset,new_split=True) else: gts = get_gts(dataset) ### interpolate vector data start_time = time.time() denormed_gts,pred_matching_result,num_gts,num_preds = \ get_data(pred_matching_result_raw,gts,origin,roi_size,INTERP_NUM,result_path=args.result_path,denorm=False) print('Preparing Data Time {}'.format(time.time()-start_time)) ### obtain mAP for each threshold scene_name_list = [] for single_scene_name in all_scene_names: scene_name_list.append( (single_scene_name,args) ) result_dict = {thr:{} for thr in thresholds_list} for threshold in thresholds_list: tpfp_score_list =[] for (scene_name,args) in scene_name_list: tpfp_score = get_tpfp_from_scene_single(scene_name,args,scene_name2token,pred_matching_result, denormed_gts,gt_matching,threshold) tpfp_score_list.append(tpfp_score) result_dict[threshold] = get_mAP(tpfp_score_list,num_gts,threshold) print(result_dict[threshold]) cat_mean_AP = np.array([0.,0.,0.]) mean_AP = 0 for thr in thresholds_list: for cat_name in cat2id.keys(): mean_AP += result_dict[thr][cat_name]['AP@{}'.format(thr)] cat_mean_AP[cat2id[cat_name]] += result_dict[thr][cat_name]['AP@{}'.format(thr)] cat_map_dict = {cat:cat_mean_AP[idx]/len(thresholds_list) for cat,idx in cat2id.items() } print('Category mean AP',cat_map_dict) print('mean AP ',mean_AP/(len(cat2id)*len(thresholds_list))) print('Overall Time',time.time()-start_time) if __name__ == '__main__': main() ================================================ FILE: tools/tracking/cmap_utils/__init__.py ================================================ ================================================ FILE: tools/tracking/cmap_utils/data_utils.py ================================================ import mmcv import os from mmdet3d.datasets import build_dataloader import numpy as np from copy import deepcopy from functools import partial from multiprocessing 
import Pool from .utils import * from .match_utils import * cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } def get_gts(dataset,new_split=False,N_WORKERS=16): roi_size = dataset.roi_size if 'av2' in dataset.ann_file: dataset_name = 'av2' else: dataset_name = 'nusc' if new_split: tmp_file = f'./tmp_gts_{dataset_name}_{roi_size[0]}x{roi_size[1]}_newsplit.pkl' else: tmp_file = f'./tmp_gts_{dataset_name}_{roi_size[0]}x{roi_size[1]}.pkl' if os.path.exists(tmp_file): print(f'loading cached gts from {tmp_file}') gts = mmcv.load(tmp_file) else: print('collecting gts...') gts = {} # pdb.set_trace() dataloader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=N_WORKERS, shuffle=False, dist=False) pbar = mmcv.ProgressBar(len(dataloader)) for data in dataloader: token = deepcopy(data['img_metas'].data[0][0]['token']) gt = deepcopy(data['vectors'].data[0][0]) # pdb.set_trace() gts[token] = gt pbar.update() del data # avoid dataloader memory crash for token, gt in gts.items(): for label, vectors in gt.items(): label_vecs = [] for vec in vectors: label_vecs.append(interp_fixed_num(vec,20)) gt[label] = label_vecs gts[token] = gt return gts def prepare_data_multi(token,idx,pred,gts,origin,roi_size,interp_num,dataset,denorm=False): num_gts = np.array([0,0,0]) num_preds = np.array([0,0,0]) denorm_gt = {} gt = gts[token] denorm_gt = {label:[] for label in cat2id.values()} scores_by_cls = {label: [] for label in cat2id.values()} vector_list = [] for i in range(len(pred['labels'])): score = pred['scores'][i] vector = pred['vectors'][i].reshape(-1,2) label = pred['labels'][i] scores_by_cls[label].append(score) if not denorm: vector_list.append(interp_fixed_num(vector,interp_num)) else: vector_list.append(interp_fixed_num(vector*roi_size+origin,interp_num)) for label in cat2id.values(): for vec in gt[label]: denorm_gt[label].append(interp_fixed_num(vec,interp_num)) for label in cat2id.values(): num_gts[label] += len(gt[label]) num_preds[label] += len(scores_by_cls[label]) return token,idx,denorm_gt, vector_list, num_gts,num_preds def get_data(pred_matching_result_raw,gts,origin,roi_size,num_interp,result_path,denorm=False): ### collect data, interpolate with multi_processing token_list = [] for idx,pred_res in enumerate(pred_matching_result_raw): token = pred_res['meta']['token'] token_list.append( (token,idx,pred_matching_result_raw[idx]) ) dataset = 'av2' if 'av2' in result_path else 'nusc' fn = partial(prepare_data_multi,gts=gts,origin=origin,roi_size=roi_size,interp_num=num_interp,dataset=dataset,denorm=denorm) denormed_gts = {} pred_matching_result = {} num_gts = np.zeros(3) num_preds = np.zeros(3) with Pool(processes=16) as pool: data_infos = pool.starmap(fn,token_list) for data_info in data_infos: token,idx, denorm_gt,pred_vector, num_gts_single,num_preds_single = data_info denormed_gts[token] = denorm_gt pred_matching_result_raw[idx]['vectors'] = pred_vector pred_matching_result[token] = pred_matching_result_raw[idx] num_gts = num_gts + num_gts_single num_preds = num_preds + num_preds_single return denormed_gts,pred_matching_result,num_gts,num_preds ================================================ FILE: tools/tracking/cmap_utils/match_utils.py ================================================ import torch import numpy as np from scipy.optimize import linear_sum_assignment from .utils import * cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } def get_prev2curr_matrix(prev_meta,curr_meta): # get relative pose prev_e2g_trans = 
torch.tensor(prev_meta['ego2global_translation'], dtype=torch.float64) prev_e2g_rot = torch.tensor(prev_meta['ego2global_rotation'], dtype=torch.float64) curr_e2g_trans = torch.tensor(curr_meta['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = torch.tensor(curr_meta['ego2global_rotation'], dtype=torch.float64) prev_e2g_matrix = torch.eye(4, dtype=torch.float64) prev_e2g_matrix[:3, :3] = prev_e2g_rot prev_e2g_matrix[:3, 3] = prev_e2g_trans curr_g2e_matrix = torch.eye(4, dtype=torch.float64) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) prev2curr_matrix = curr_g2e_matrix @ prev_e2g_matrix return prev2curr_matrix def find_matchings_iou(src_masks, tgt_masks, thresh=0.1): """Find the matching of map elements between two temporally connected frame Args: src_masks (_type_): instance masks of prev frame tgt_masks (_type_): instance masks of current frame thresh (float, optional): IOU threshold for matching. Defaults to 0.1. """ def _mask_iou(mask1, mask2): intersection = (mask1 * mask2).sum() if intersection == 0: return 0.0 union = np.logical_or(mask1, mask2).sum() return intersection / union matchings = {} for label, src_instances in src_masks.items(): tgt_instances = tgt_masks[label] cost = np.zeros([len(src_instances), len(tgt_instances)]) for i, src_ins in enumerate(src_instances): for j, tgt_ins in enumerate(tgt_instances): iou = _mask_iou(src_ins, tgt_ins) cost[i, j] = -iou row_ind, col_ind = linear_sum_assignment(cost) label_matching = [-1 for _ in range(len(src_instances))] label_matching_reverse = [-1 for _ in range(len(tgt_instances))] for i, j in zip(row_ind, col_ind): if -cost[i, j] > thresh: label_matching[i] = j label_matching_reverse[j] = i matchings[label] = (label_matching, label_matching_reverse) return matchings def find_matchings_chamfer(pred_vectors, gt_vectors, score_dict,thresh=0.5): matchings = {} for label, src_instances in pred_vectors.items(): tgt_instances = gt_vectors[label] num_gts = len(tgt_instances) num_preds = len(src_instances) label_matching = [-1 for _ in range(len(src_instances))] label_matching_reverse = [-1 for _ in range(len(tgt_instances))] if len(src_instances) == 0 or len(tgt_instances)==0: matchings[label] = (label_matching, label_matching_reverse) continue cdist = chamfer_distance_batch(src_instances, tgt_instances) label_score = np.array(score_dict[label]) matrix_min = cdist.min(axis=1) # for each det, which gt is the closest to it matrix_argmin = cdist.argmin(axis=1) sort_inds = np.argsort(-label_score) gt_covered = np.zeros(num_gts, dtype=bool) tp = np.zeros((num_preds), dtype=np.float32) fp = np.zeros((num_preds), dtype=np.float32) for i in sort_inds: if matrix_min[i] <= thresh: matched_gt = matrix_argmin[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True label_matching[i] = matched_gt label_matching_reverse[matched_gt] = i matchings[label] = (label_matching, label_matching_reverse) return matchings def get_consecutive_vectors(prev_vectors,curr_vectors,prev2curr_matrix,origin,roi_size): # transform prev vectors prev2curr_vectors = dict() for label, vecs in prev_vectors.items(): if len(vecs) > 0: vecs = np.stack(vecs, 0) vecs = torch.tensor(vecs) N, num_points, _ = vecs.shape denormed_vecs = vecs * roi_size + origin # (num_prop, num_pts, 2) denormed_vecs = torch.cat([ denormed_vecs, denormed_vecs.new_zeros((N, num_points, 1)), # z-axis denormed_vecs.new_ones((N, num_points, 1)) # 4-th dim ], dim=-1) # (num_prop, num_pts, 4) transformed_vecs = 
torch.einsum('lk,ijk->ijl', prev2curr_matrix, denormed_vecs.double()).float() normed_vecs = (transformed_vecs[..., :2] - origin) / roi_size # (num_prop, num_pts, 2) normed_vecs = torch.clip(normed_vecs, min=0., max=1.) prev2curr_vectors[label] = normed_vecs else: prev2curr_vectors[label] = vecs # convert to ego space for visualization for label in prev2curr_vectors: if len(prev2curr_vectors[label]) > 0: prev2curr_vectors[label] = prev2curr_vectors[label] * roi_size + origin if len(curr_vectors[label]) > 0: curr_vecs = torch.tensor(np.stack(curr_vectors[label])) curr_vectors[label] = curr_vecs * roi_size + origin if len(prev_vectors[label]) > 0: prev_vecs = torch.tensor(np.stack(prev_vectors[label])) prev_vectors[label] = prev_vecs * roi_size + origin return prev_vectors, curr_vectors, prev2curr_vectors def filter_vectors(data_info, origin,roi_size,thr,num_interp=20): ### filter vectors over threshold filtered_vectors = {label: [] for label in cat2id.values()} for i in range(len(data_info['labels'])): score = data_info['scores'][i] label = data_info['labels'][i] v = data_info['vectors'][i] if score > thr: interp_v = interp_fixed_num(v,num_interp) filtered_vectors[label].append( (np.array(interp_v) - origin)/roi_size ) return filtered_vectors ================================================ FILE: tools/tracking/cmap_utils/utils.py ================================================ import cv2 from PIL import Image, ImageDraw import os import torch import numpy as np from shapely.geometry import LineString def import_plugin(cfg): ''' import modules from plguin/xx, registry will be update ''' import sys sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' 
+ m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs, list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) def draw_polylines(vecs, roi_size, origin, cfg): results = [] for line_coords in vecs: canvas = np.zeros((cfg.canvas_size[1], cfg.canvas_size[0]), dtype=np.uint8) coords = (line_coords - origin) / roi_size * torch.tensor(cfg.canvas_size) coords = coords.numpy() cv2.polylines(canvas, np.int32([coords]), False, color=1, thickness=cfg.thickness) result = np.flipud(canvas) if result.sum() < 20: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7,7)) result = cv2.dilate(result, kernel, iterations=1) results.append(result) return results def draw_polygons(vecs, roi_size, origin, cfg): results = [] for poly_coords in vecs: mask = Image.new("L", size=(cfg.canvas_size[0], cfg.canvas_size[1]), color=0) coords = (poly_coords - origin) / roi_size * torch.tensor(cfg.canvas_size) coords = coords.numpy() vert_list = [(x, y) for x, y in coords] if not (coords[0] == coords[-1]).all(): vert_list.append(vert_list[0]) ImageDraw.Draw(mask).polygon(vert_list, outline=1, fill=1) result = np.flipud(np.array(mask)) if result.sum() < 20: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7,7)) result = cv2.dilate(result, kernel, iterations=1) results.append(result) return results def draw_instance_masks(vectors, roi_size, origin, cfg): masks = {} canvas = np.zeros((cfg.canvas_size[1], cfg.canvas_size[0])) for label, vecs in vectors.items(): if label == 0: masks[label] = draw_polygons(vecs, roi_size, origin, cfg) else: masks[label] = draw_polylines(vecs, roi_size, origin, cfg) for mask in masks[label]: canvas += mask return masks, canvas def interp_fixed_num(vector, num_pts): line = LineString(vector) distances = np.linspace(0, line.length, num_pts) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def chamfer_distance_batch(pred_lines, gt_lines): _, num_pts, coord_dims = pred_lines.shape if not isinstance(pred_lines, torch.Tensor): pred_lines = torch.tensor(pred_lines) if not isinstance(gt_lines, torch.Tensor): gt_lines = torch.tensor(gt_lines) dist_mat = torch.cdist(pred_lines.view(-1, coord_dims), gt_lines.view(-1, coord_dims), p=2) # (num_query*num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts)) # (num_query, num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=-1)) # (num_gt, num_q, num_pts, num_pts) dist1 = dist_mat.min(-1)[0].sum(-1) dist2 = dist_mat.min(-2)[0].sum(-1) dist_matrix = (dist1 + dist2).transpose(0, 1) / (2 * num_pts) return dist_matrix.numpy() def average_precision(recalls, precisions, mode='area'): recalls = recalls[np.newaxis, :] precisions = precisions[np.newaxis, :] assert recalls.shape == precisions.shape and recalls.ndim == 2 num_scales = recalls.shape[0] ap = 0. 
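# 'area' mode integrates the monotonised precision-recall curve; '11points' mode averages the best precision at recall thresholds 0.0, 0.1, ..., 1.0 (inputs here are single-scale, hence index 0 below)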
if mode == 'area': zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) ones = np.ones((num_scales, 1), dtype=recalls.dtype) mrec = np.hstack((zeros, recalls, ones)) mpre = np.hstack((zeros, precisions, zeros)) for i in range(mpre.shape[1] - 1, 0, -1): mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) ind = np.where(mrec[0, 1:] != mrec[0, :-1])[0] ap = np.sum( (mrec[0, ind + 1] - mrec[0, ind]) * mpre[0, ind + 1]) elif mode == '11points': for thr in np.arange(0, 1 + 1e-3, 0.1): precs = precisions[0, recalls[0, :] >= thr] prec = precs.max() if precs.size > 0 else 0 ap += prec ap /= 11 else: raise ValueError( 'Unrecognized mode, only "area" and "11points" are supported') return ap ================================================ FILE: tools/tracking/prepare_gt_tracks.py ================================================ import argparse import mmcv from mmcv import Config import os from mmdet3d.datasets import build_dataset, build_dataloader import cv2 import torch import numpy as np from PIL import Image, ImageDraw import copy import imageio from scipy.optimize import linear_sum_assignment import pickle from functools import partial from multiprocessing import Pool font = cv2.FONT_HERSHEY_SIMPLEX location = (200,60) fontScale = 2 fontColor = (255,0,0) thickness = 2 lineType = 2 N_WORKERS = 16 def parse_args(): parser = argparse.ArgumentParser( description='Visualize groundtruth and results') parser.add_argument('config', help='config file path') parser.add_argument('--result', default=None, help='prediction result to visualize. ' 'If submission file is not provided, only gt will be visualized') parser.add_argument( '--out-dir', default='demo', help='directory where visualization results will be saved') parser.add_argument( '--visualize', action="store_true", default=False, help='whether to visualize the formed gt tracks') args = parser.parse_args() return args def import_plugin(cfg): ''' import modules from plugin/xx, registry will be updated ''' import sys sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.'
+ m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs, list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) def draw_polylines(vecs, roi_size, origin, cfg): results = [] for line_coords in vecs: canvas = np.zeros((cfg.canvas_size[1], cfg.canvas_size[0]), dtype=np.uint8) coords = (line_coords - origin) / roi_size * torch.tensor(cfg.canvas_size) coords = coords.numpy() cv2.polylines(canvas, np.int32([coords]), False, color=1, thickness=cfg.thickness) result = np.flipud(canvas) if result.sum() < 20: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7,7)) result = cv2.dilate(result, kernel, iterations=1) results.append(result) return results def draw_polygons(vecs, roi_size, origin, cfg): results = [] for poly_coords in vecs: mask = Image.new("L", size=(cfg.canvas_size[0], cfg.canvas_size[1]), color=0) coords = (poly_coords - origin) / roi_size * torch.tensor(cfg.canvas_size) coords = coords.numpy() vert_list = [(x, y) for x, y in coords] if not (coords[0] == coords[-1]).all(): vert_list.append(vert_list[0]) ImageDraw.Draw(mask).polygon(vert_list, outline=1, fill=1) result = np.flipud(np.array(mask)) if result.sum() < 20: kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7,7)) result = cv2.dilate(result, kernel, iterations=1) results.append(result) return results def draw_instance_masks(vectors, roi_size, origin, cfg): masks = {} for label, vecs in vectors.items(): if label == 0: masks[label] = draw_polygons(vecs, roi_size, origin, cfg) else: masks[label] = draw_polylines(vecs, roi_size, origin, cfg) return masks def _mask_iou(mask1, mask2): intersection = (mask1 * mask2).sum() if intersection == 0: return 0.0 union = np.logical_or(mask1, mask2).sum() return intersection / union def find_matchings(src_masks, tgt_masks, thresh=0.1): """Find the matching of map elements between two temporally connected frame Args: src_masks (_type_): instance masks of prev frame tgt_masks (_type_): instance masks of current frame thresh (float, optional): IOU threshold for matching. Defaults to 0.1. 
""" matchings = {} for label, src_instances in src_masks.items(): tgt_instances = tgt_masks[label] cost = np.zeros([len(src_instances), len(tgt_instances)]) for i, src_ins in enumerate(src_instances): for j, tgt_ins in enumerate(tgt_instances): iou = _mask_iou(src_ins, tgt_ins) cost[i, j] = -iou row_ind, col_ind = linear_sum_assignment(cost) label_matching = [-1 for _ in range(len(src_instances))] label_matching_reverse = [-1 for _ in range(len(tgt_instances))] for i, j in zip(row_ind, col_ind): if -cost[i, j] > thresh: label_matching[i] = j label_matching_reverse[j] = i matchings[label] = (label_matching, label_matching_reverse) return matchings def match_two_consecutive_frames(prev_data, curr_data, roi_size, origin, cfg): # get relative pose prev_e2g_trans = torch.tensor(prev_data['img_metas'].data['ego2global_translation'], dtype=torch.float64) prev_e2g_rot = torch.tensor(prev_data['img_metas'].data['ego2global_rotation'], dtype=torch.float64) curr_e2g_trans = torch.tensor(curr_data['img_metas'].data['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = torch.tensor(curr_data['img_metas'].data['ego2global_rotation'], dtype=torch.float64) prev_e2g_matrix = torch.eye(4, dtype=torch.float64) prev_e2g_matrix[:3, :3] = prev_e2g_rot prev_e2g_matrix[:3, 3] = prev_e2g_trans curr_g2e_matrix = torch.eye(4, dtype=torch.float64) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) prev2curr_matrix = curr_g2e_matrix @ prev_e2g_matrix # get vector data prev_vectors = copy.deepcopy(prev_data['vectors'].data) curr_vectors = copy.deepcopy(curr_data['vectors'].data) #meta_info = curr_data['img_metas'].data #imgs = [mmcv.imread(i) for i in meta_info['img_filenames']] #cam_extrinsics = meta_info['cam_extrinsics'] #cam_intrinsics = meta_info['cam_intrinsics'] #ego2cams = meta_info['ego2cam'] # transform prev vectors prev2curr_vectors = dict() for label, vecs in prev_vectors.items(): if len(vecs) > 0: vecs = np.stack(vecs, 0) vecs = torch.tensor(vecs) N, num_points, _ = vecs.shape denormed_vecs = vecs * roi_size + origin # (num_prop, num_pts, 2) denormed_vecs = torch.cat([ denormed_vecs, denormed_vecs.new_zeros((N, num_points, 1)), # z-axis denormed_vecs.new_ones((N, num_points, 1)) # 4-th dim ], dim=-1) # (num_prop, num_pts, 4) transformed_vecs = torch.einsum('lk,ijk->ijl', prev2curr_matrix, denormed_vecs.double()).float() normed_vecs = (transformed_vecs[..., :2] - origin) / roi_size # (num_prop, num_pts, 2) normed_vecs = torch.clip(normed_vecs, min=0., max=1.) 
prev2curr_vectors[label] = normed_vecs else: prev2curr_vectors[label] = vecs # convert to ego space for visualization for label in prev2curr_vectors: if len(prev2curr_vectors[label]) > 0: prev2curr_vectors[label] = prev2curr_vectors[label] * roi_size + origin if len(curr_vectors[label]) > 0: curr_vecs = torch.tensor(np.stack(curr_vectors[label])) curr_vectors[label] = curr_vecs * roi_size + origin prev2curr_masks = draw_instance_masks(prev2curr_vectors, roi_size, origin, cfg) curr_masks = draw_instance_masks(curr_vectors, roi_size, origin, cfg) prev2curr_matchings = find_matchings(prev2curr_masks, curr_masks, thresh=0.01) # For viz purpose, may display the maps in perspective images #viz_dir = os.path.join(scene_dir, '{}_viz_perspective'.format(local_idx)) #if not os.path.exists(viz_dir): # os.makedirs(viz_dir) #renderer.render_camera_views_from_vectors(curr_vectors, imgs, # cam_extrinsics, cam_intrinsics, ego2cams, 2, viz_dir) #renderer.render_bev_from_vectors(curr_vectors, out_dir=None, specified_path='cur.png') #renderer.render_bev_from_vectors(prev2curr_vectors, out_dir=None, specified_path='prev2cur.png') #from PIL import Image #background = Image.open("cur.png") #overlay = Image.open("prev2cur.png") #background = background.convert("RGBA") #overlay = overlay.convert("RGBA") #new_img = Image.blend(background, overlay, 0.5) #new_img.save("cur_overlapped.png","PNG") #import pdb; pdb.set_trace() return prev2curr_matchings def assign_global_ids(matchings_seq, vectors_seq): ids_seq = [] global_map_index = { 0: 0, 1: 0, 2: 0, } ids_0 = dict() for label, vectors in vectors_seq[0].items(): id_mapping = dict() for i, _ in enumerate(vectors): id_mapping[i] = global_map_index[label] global_map_index[label] += 1 ids_0[label] = id_mapping ids_seq.append(ids_0) # Trace all frames following the consecutive matching for t, vectors_t in enumerate(vectors_seq[1:]): ids_t = dict() for label, vectors in vectors_t.items(): reverse_matching = matchings_seq[t][label][1] id_mapping = dict() for i, _ in enumerate(vectors): if reverse_matching[i] != -1: prev_id = reverse_matching[i] global_id = ids_seq[-1][label][prev_id] else: global_id = global_map_index[label] global_map_index[label] += 1 id_mapping[i] = global_id ids_t[label] = id_mapping ids_seq.append(ids_t) return ids_seq def _denorm(vectors, roi_size, origin): for label in vectors: for i, vec in enumerate(vectors[label]): vectors[label][i] = vec * roi_size + origin return vectors def form_gt_track_single(scene_name, scene_name2idx, dataset, out_dir, cfg, args): print('Process scene {}'.format(scene_name)) renderer = dataset.renderer roi_size = torch.tensor(cfg.roi_size) origin = torch.tensor(cfg.pc_range[:2]) start_idx = scene_name2idx[scene_name][0] matchings_seq = [] vectors_seq = [] for idx in scene_name2idx[scene_name]: local_idx = idx - start_idx if idx == start_idx: prev_data = dataset[idx] if idx == scene_name2idx[scene_name][-1]: # prev_data is the last frame vectors_seq.append(prev_data['vectors'].data) break curr_data = dataset[idx+1] matchings = match_two_consecutive_frames(prev_data, curr_data, roi_size, origin, cfg) matchings_seq.append(matchings) vectors_seq.append(prev_data['vectors'].data) prev_data = curr_data # Derive global ids... 
# get global ids by traversing all consecutive matching results ids_info = assign_global_ids(matchings_seq, vectors_seq) matching_meta = { 'sample_ids':scene_name2idx[scene_name], 'instance_ids': ids_info, } if args.visualize: print('Visualize gt tracks for scene {}'.format(scene_name)) scene_dir = os.path.join(out_dir, scene_name) os.makedirs(scene_dir, exist_ok=True) # visualize with matched track ids imgs = [] for idx, (id_info, vectors) in enumerate(zip(ids_info, vectors_seq)): vectors = _denorm(vectors, roi_size.numpy(), origin.numpy()) save_path = os.path.join(scene_dir, f'{idx}_with_id.png') renderer.render_bev_from_vectors(vectors, out_dir=None, specified_path=save_path, id_info=id_info) viz_img = np.ascontiguousarray(cv2.imread(save_path)[:, :, ::-1], dtype=np.uint8) if idx == 0: img_shape = (viz_img.shape[1], viz_img.shape[0]) else: viz_img = cv2.resize(viz_img, img_shape) cv2.putText(viz_img, 't={}'.format(idx), location, font, fontScale, fontColor, thickness, lineType) imgs.append(viz_img) gif_path = os.path.join(scene_dir, 'matching.gif') imageio.mimsave(gif_path, imgs, duration=500) return scene_name, matching_meta def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) for split in ['train', 'val']: if split == 'train' and split not in cfg.match_config.ann_file: cfg.match_config.ann_file = cfg.match_config.ann_file.replace('val', 'train') if split == 'val' and split not in cfg.match_config.ann_file: cfg.match_config.ann_file = cfg.match_config.ann_file.replace('train', 'val') # build the dataset dataset = build_dataset(cfg.match_config) scene_name2idx = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2idx[scene].append(idx) all_scene_names = sorted(list(scene_name2idx.keys())) all_scene_matching_meta = {} out_dir = os.path.join(args.out_dir, split) if not os.path.exists(out_dir): os.makedirs(out_dir) all_scene_infos = [] for scene_idx, scene_name in enumerate(all_scene_names): all_scene_infos.append((scene_name,)) if N_WORKERS > 0: fn = partial(form_gt_track_single, scene_name2idx=scene_name2idx, dataset=dataset, cfg=cfg, out_dir=out_dir, args=args) pool = Pool(N_WORKERS) matching_results = pool.starmap(fn, all_scene_infos) pool.close() else: matching_results =[] for scene_info in all_scene_infos: scene_name = scene_info[0] single_matching_result = form_gt_track_single(scene_name=scene_name, scene_name2idx=scene_name2idx, dataset=dataset, cfg=cfg, out_dir=out_dir, args=args) matching_results.append(single_matching_result) for scene_name, matching_meta in matching_results: all_scene_matching_meta[scene_name] = matching_meta track_gt_path = cfg.match_config.ann_file[:-4] + '_gt_tracks.pkl' with open(track_gt_path, 'wb') as f: pickle.dump(all_scene_matching_meta, f, protocol=pickle.HIGHEST_PROTOCOL) if __name__ == '__main__': main() ================================================ FILE: tools/tracking/prepare_pred_tracks.py ================================================ import argparse import mmcv from mmcv import Config import os from mmdet3d.datasets import build_dataset import cv2 import torch import numpy as np import imageio import pickle from functools import partial from multiprocessing import Pool import time from cmap_utils.utils import * from cmap_utils.match_utils import get_prev2curr_matrix, find_matchings_iou, get_consecutive_vectors,filter_vectors font = cv2.FONT_HERSHEY_SIMPLEX location = (200,60) fontScale = 2 fontColor = (255,0,0) 
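# text-overlay constants for optional visualization (same values as in prepare_gt_tracks.py); cat2id below maps map-element categories to integer labels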
thickness = 2 lineType = 2 cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } N_WORKERS = 10 def parse_args(): parser = argparse.ArgumentParser( description='Visualize groundtruth and results') parser.add_argument('config', help='config file path') parser.add_argument('--thr', type=float, default=0.4, help='score threshold to filter predictions') parser.add_argument( '--result_path', default=None, help='directory to submission file') parser.add_argument( '--cons_frames', default=5, type=int, help='consective frames for matchings' ) parser.add_argument( '--visual', default=0, type=int, help='whether to visual' ) args = parser.parse_args() return args def match_two_consecutive_frames_pred(args,prev_data,prev_meta, curr_data, curr_meta,roi_size, origin, cfg): prev2curr_matrix = get_prev2curr_matrix(prev_meta,curr_meta) prev_vectors = filter_vectors(prev_data,origin,roi_size,args.thr) curr_vectors = filter_vectors(curr_data,origin,roi_size,args.thr) prev_vectors, curr_vectors, prev2curr_vectors = get_consecutive_vectors(prev_vectors,curr_vectors, prev2curr_matrix,origin,roi_size) prev2curr_masks, prev2curr_viz = draw_instance_masks(prev2curr_vectors, roi_size, origin, cfg) curr_masks, curr_viz = draw_instance_masks(curr_vectors, roi_size, origin, cfg) prev2curr_matchings = find_matchings_iou(prev2curr_masks, curr_masks, thresh=0.001) curr2prev_matchings = {label:[match_info[1],match_info[0]] for label,match_info in prev2curr_matchings.items()} return curr2prev_matchings def collect_pred(data,thr): vectors = {label: [] for label in cat2id.values()} scores = {label: [] for label in cat2id.values()} for i in range(len(data['labels'])): score, label, v = data['scores'][i], data['labels'][i], data['vectors'][i] if score > thr: vectors[label].append(np.array(v)) scores[label].append(score) return vectors, scores def get_scene_matching_result(args,cfg,pred_results,dataset,origin,roi_size, scene_name2idx): ### obtain local id sequence matching results of predictions vectors_seq = [] scores_seq = [] ids_seq = [] global_map_index = { 0: 0, 1: 0, 2: 0, } frame_token_list = [] pred_data_list = [] meta_list = [] for idx in scene_name2idx: token = dataset[idx]['img_metas'].data['token'] pred_data = pred_results[token] frame_token_list.append(token) meta_list.append(dataset[idx]['img_metas'].data) pred_data_list.append(pred_data) for local_idx in range(len(frame_token_list)): curr_pred_data = pred_data_list[local_idx] vectors_info, scores = collect_pred(curr_pred_data,args.thr) vectors_seq.append(vectors_info) scores_seq.append(scores) ### assign global id for the first frame if local_idx == 0: ids_0 = dict() for label, vectors in vectors_info.items(): id_mapping = dict() for i, _ in enumerate(vectors): id_mapping[i] = global_map_index[label] global_map_index[label] += 1 ids_0[label] = id_mapping ids_seq.append(ids_0) continue ### from the farthest to the nearest history_range = range(max(local_idx-args.cons_frames,0),local_idx) tmp_ids_list = [] for comeback_idx,prev_idx in enumerate(history_range): tmp_ids = {label:{} for label in cat2id.values()} curr_pred_data = pred_data_list[local_idx] comeback_pred_data = pred_data_list[prev_idx] curr_meta = meta_list[local_idx] comeback_meta = meta_list[prev_idx] curr2prev_matching = match_two_consecutive_frames_pred(args,comeback_pred_data,comeback_meta, curr_pred_data, curr_meta,roi_size, origin, cfg) for label,match_info in curr2prev_matching.items(): for curr_match_local_idx,prev_match_local_idx in enumerate(match_info[0]): if prev_match_local_idx == 
-1: tmp_ids[label][curr_match_local_idx] = -1 else: prev_match_global_idx = ids_seq[prev_idx][label][prev_match_local_idx] tmp_ids[label][curr_match_local_idx] = prev_match_global_idx tmp_ids_list.append(tmp_ids) ids_n = {label:{} for label in cat2id.values()} ### assign global id based on previous k frames' global id missing_matchings = {label:[] for label in cat2id.values()} for tmp_match in tmp_ids_list[::-1]: for label, matching in tmp_match.items(): for vec_local_idx, vec_glb_idx in matching.items(): if vec_local_idx not in ids_n[label].keys(): if vec_glb_idx != -1 and vec_glb_idx not in ids_n[label].values(): ids_n[label][vec_local_idx] = vec_glb_idx if vec_local_idx in missing_matchings[label]: missing_matchings[label].remove(vec_local_idx) else: missing_matchings[label].append(vec_local_idx) ### assign new id if one vector is not matched for label,miss_match in missing_matchings.items(): for miss_idx in miss_match: if miss_idx not in ids_n[label].keys(): ids_n[label][miss_idx] = global_map_index[label] global_map_index[label] += 1 ids_seq.append(ids_n) return ids_seq, vectors_seq, scores_seq, meta_list def generate_results(ids_info,vectors_seq,scores_seq,meta_list,scene_name): ### assign global id global_gt_idx = {} result_list = [] instance_count = 0 for f_idx in range(len(ids_info)): output_dict = {'vectors':[],'global_ids':[],'labels':[],'scores':[],'local_idx':[]} output_dict['scene_name'] = scene_name output_dict['meta'] = meta_list[f_idx] for label in cat2id.values(): for local_idx, global_label_idx in ids_info[f_idx][label].items(): overall_count_idx = label*100 + global_label_idx if overall_count_idx not in global_gt_idx.keys(): overall_global_idx = instance_count global_gt_idx[overall_count_idx] = overall_global_idx instance_count += 1 else: overall_global_idx = global_gt_idx[overall_count_idx] output_dict['global_ids'].append(overall_global_idx) output_dict['vectors'].append(vectors_seq[f_idx][label][local_idx]) output_dict['scores'].append(scores_seq[f_idx][label][local_idx]) output_dict['labels'].append(label) output_dict['local_idx'] = f_idx result_list.append(output_dict) return result_list def get_matching_single(scene_name,args,scene_name2idx,dataset,cfg,pred_results,origin,roi_size): name2idx = scene_name2idx[scene_name] ids_info, vectors_seq,scores_seq,meta_list = get_scene_matching_result(args,cfg,pred_results,dataset, origin,roi_size,name2idx) gen_result = generate_results(ids_info,vectors_seq,scores_seq,meta_list,scene_name) return (scene_name,ids_info,gen_result) def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) dataset = build_dataset(cfg.match_config) scene_name2idx = {} scene_name2token = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] token = sample['token'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2token[scene] = [] scene_name2idx[scene].append(idx) submission = mmcv.load(args.result_path) results = submission['results'] all_scene_names = sorted(list(scene_name2idx.keys())) all_scene_matching_meta = {} scene_info_list = [] for single_scene_name in all_scene_names: scene_info_list.append( (single_scene_name,args) ) roi_size = torch.tensor(cfg.roi_size).numpy() origin = torch.tensor(cfg.pc_range[:2]).numpy() start_time = time.time() if N_WORKERS > 0: fn = partial(get_matching_single, scene_name2idx=scene_name2idx,dataset=dataset,cfg=cfg, pred_results=results,origin=origin,roi_size=roi_size) pool = Pool(N_WORKERS) matching_results = pool.starmap(fn,scene_info_list) 
pool.close() else: matching_results =[] for scene_info in scene_info_list: scene_name = scene_info[0] single_matching_result = get_matching_single(scene_name=scene_name, scene_name2idx=scene_name2idx, args=args, dataset=dataset,cfg=cfg,pred_results=results,origin=origin,roi_size=roi_size) matching_results.append(single_matching_result) final_reuslt = [] for single_matching_info in matching_results: scene_name = single_matching_info[0] single_matching = single_matching_info[1] all_scene_matching_meta[scene_name] = single_matching final_reuslt.extend(single_matching_info[2]) meta_path = args.result_path.replace('submission_vector.json','pos_predictions_{}.pkl'.format(args.cons_frames)) with open(meta_path, 'wb') as f: pickle.dump(final_reuslt, f, protocol=pickle.HIGHEST_PROTOCOL) print('Matching Time',time.time()-start_time) if __name__ == '__main__': main() ================================================ FILE: tools/train.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- from __future__ import division import argparse import copy import mmcv import os import time import torch import warnings from mmcv import Config, DictAction from mmcv.runner import get_dist_info, init_dist, wrap_fp16_model from os import path as osp from mmdet import __version__ as mmdet_version from mmdet3d import __version__ as mmdet3d_version from mmdet3d.apis import train_model from mmdet3d.datasets import build_dataset from mmdet3d.models import build_model from mmdet3d.utils import collect_env, get_root_logger from mmdet.apis import set_random_seed from mmseg import __version__ as mmseg_version from mmcv.utils import TORCH_VERSION, digit_version def parse_args(): parser = argparse.ArgumentParser(description='Train a detector') parser.add_argument('config', help='train config file path') parser.add_argument('--work-dir', help='the dir to save logs and models') parser.add_argument( '--resume-from', help='the checkpoint file to resume from') parser.add_argument( '--no-validate', action='store_true', help='whether not to evaluate the checkpoint during training') group_gpus = parser.add_mutually_exclusive_group() group_gpus.add_argument( '--gpus', type=int, help='number of gpus to use ' '(only applicable to non-distributed training)') group_gpus.add_argument( '--gpu-ids', type=int, nargs='+', help='ids of gpus to use ' '(only applicable to non-distributed training)') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file (deprecate), ' 'change to --cfg-options instead.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) parser.add_argument( '--autoscale-lr', action='store_true', help='automatically scale lr with the number of gpus') args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.cfg_options: raise ValueError( '--options and --cfg-options cannot be both specified, ' '--options is deprecated in favor of --cfg-options') if args.options: warnings.warn('--options is deprecated in favor of --cfg-options') args.cfg_options = args.options return args def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # import modules from plguin/xx, registry will be updated import sys sys.path.append(os.path.abspath('.')) if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib if hasattr(cfg, 'plugin_dir'): def import_path(plugin_dir): _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) plugin_dirs = cfg.plugin_dir if not isinstance(plugin_dirs,list): plugin_dirs = [plugin_dirs,] for plugin_dir in plugin_dirs: import_path(plugin_dir) else: # import dir is the dirpath for the config file _module_dir = os.path.dirname(args.config) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m print(_module_path) plg_lib = importlib.import_module(_module_path) # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume_from is not None: cfg.resume_from = args.resume_from if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW': cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw if args.autoscale_lr: # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # re-set gpu_ids with distributed training mode _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') # specify logger name, if we still use 'mmdet', the output info will be # filtered and won't be saved in the log_file # TODO: ugly workaround to judge whether we are training det or seg model if cfg.model.type in ['EncoderDecoder3D']: logger_name = 'mmseg' else: logger_name = 'mmdet' logger = get_root_logger( log_file=log_file, log_level=cfg.log_level, name=logger_name) # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed meta['exp_name'] = osp.basename(args.config) model = build_model( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) model.init_weights() if cfg.get('SyncBN', False): import torch.nn as nn model = nn.SyncBatchNorm.convert_sync_batchnorm(model) logger.info("Using SyncBN") logger.info(f'Model:\n{model}') cfg.data.train.work_dir = cfg.work_dir cfg.data.val.work_dir = cfg.work_dir datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) # in case we use a dataset wrapper if 'dataset' in cfg.data.train: val_dataset.pipeline = cfg.data.train.dataset.pipeline else: val_dataset.pipeline = cfg.data.train.pipeline # set test_mode=False here in deep copied config # which do not affect AP/AR calculation later # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa val_dataset.test_mode = False datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmdet version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict( mmdet_version=mmdet_version, mmseg_version=mmseg_version, mmdet3d_version=mmdet3d_version, config=cfg.pretty_text, CLASSES=None, PALETTE=datasets[0].PALETTE # for segmentors if hasattr(datasets[0], 'PALETTE') else None) # add an attribute for visualization convenience # model.CLASSES = datasets[0].CLASSES from plugin.core.apis import custom_train_model custom_train_model( model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta) if __name__ == '__main__': main() ================================================ FILE: tools/visualization/vis_global.py ================================================ import sys import os SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 
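# make the parent tools/ directory importable so that `tracking.cmap_utils` (imported below) resolves when running this script directly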
sys.path.append(os.path.dirname(SCRIPT_DIR)) import argparse import mmcv from mmcv import Config import matplotlib.transforms as transforms from mmdet3d.datasets import build_dataset import cv2 import torch import numpy as np from PIL import Image import pickle from collections import defaultdict import matplotlib.pyplot as plt from shapely.geometry import LineString, Point from shapely.ops import nearest_points from scipy.spatial import ConvexHull from PIL import Image import cv2 import imageio import math from tracking.cmap_utils.match_utils import * def parse_args(): parser = argparse.ArgumentParser( description='Visualize groundtruth and results') parser.add_argument('config', help='config file path') parser.add_argument( '--out_dir', required=True, default="", help='') parser.add_argument( '--data_path', required=True, default="", help='Directory to submission file') parser.add_argument( '--scene_id', type=str, nargs='+', default=None, help='Specify the scene_id to visulize') parser.add_argument( '--option', required=True, default="vis-pred", help='vis-pred, vis-gt') parser.add_argument( '--simplify', default=0.5, type=float, help='Line simplification tolerance' ) parser.add_argument( '--line_opacity', default=0.75, type=float, help='Line opacity' ) parser.add_argument( '--overwrite', default=1, type=int, help='Whether to overwrite the existing visualization files' ) parser.add_argument( '--per_frame_result', default=1, type=int, help='Whether to visualize per frame result' ) parser.add_argument( '--dpi', default=20, type=int, help='DPI of the output image' ) parser.add_argument( '--transparent', default=False, action='store_true', help='Whether to use transparent background' ) args = parser.parse_args() return args def combine_images_with_labels(image_paths, labels, output_path, font_scale=0.5, font_color=(0, 0, 0)): # Load images images = [cv2.imread(path) for path in image_paths] # Determine the maximum dimensions max_height = max(image.shape[0] for image in images) max_width = max(image.shape[1] for image in images) # Create a blank white canvas to hold the 2x2 grid of images final_image = np.ones((max_height * 1, max_width * 2, 3), dtype=np.uint8) * 255 # Font settings font = cv2.FONT_HERSHEY_SIMPLEX for i, img in enumerate(images): # Resize image if necessary img = cv2.resize(img, (max_width, max_height)) # Calculate position for each image x_offset = (i % 2) * max_width y_offset = (i // 2) * max_height # Place image in the canvas final_image[y_offset:y_offset+max_height, x_offset:x_offset+max_width] = img # Add label cv2.putText(final_image, labels[i], (x_offset + 5, y_offset + 15), font, font_scale, font_color, 1, cv2.LINE_AA) # Save the final image cv2.imwrite(output_path, final_image) def merge_corssing(polylines): convex_hull_polygon = find_largest_convex_hull(polylines) return convex_hull_polygon def find_largest_convex_hull(polylines): # Merge all points from the polylines into a single collection all_points = [] for polyline in polylines: all_points.extend(list(polyline.coords)) # Convert the points to a NumPy array for processing with scipy points_array = np.array(all_points) # Compute the convex hull using scipy hull = ConvexHull(points_array) # Extract the vertices of the convex hull hull_points = points_array[hull.vertices] # Create a shapely Polygon object representing the convex hull convex_hull_polygon = LineString(hull_points).convex_hull return convex_hull_polygon def project_point_onto_line(point, line): """Project a point onto a line segment and return the 
projected point.""" line_start, line_end = np.array(line.coords[0]), np.array(line.coords[1]) line_vec = line_end - line_start point_vec = np.array(point.coords[0]) - line_start line_len = np.linalg.norm(line_vec) line_unitvec = line_vec / line_len point_vec_scaled = point_vec / line_len t = np.dot(line_unitvec, point_vec_scaled) t = np.clip(t, 0.0, 1.0) nearest = line_start + t * line_vec return Point(nearest) def find_nearest_projection_on_polyline(point, polyline): """Find the nearest projected point of a point onto a polyline.""" min_dist = float('inf') nearest_point = None for i in range(len(polyline.coords) - 1): segment = LineString(polyline.coords[i:i+2]) proj_point = project_point_onto_line(point, segment) dist = point.distance(proj_point) if dist < min_dist: min_dist = dist nearest_point = proj_point return np.array(nearest_point.coords) def find_and_sort_intersections(segmenet1, segment2): # Convert polylines to LineString objects # Find the intersection between the two LineStrings intersection = segmenet1.intersection(segment2) # Prepare a list to store intersection points intersections = [] # Check the type of intersection if "Point" in intersection.geom_type: # Single point or multiple points if intersection.geom_type == "MultiPoint": intersections.extend(list(intersection)) else: intersections.append(intersection) elif "LineString" in intersection.geom_type: # In case of lines or multiline, get boundary points (start and end points of line segments) if intersection.geom_type == "MultiLineString": for line in intersection: intersections.extend(list(line.boundary)) else: intersections.extend(list(intersection.boundary)) # Remove duplicates and ensure they are Point objects unique_intersections = [Point(coords) for coords in set(pt.coords[0] for pt in intersections)] # Sort the intersection points by their distance along the first polyline sorted_intersections = sorted(unique_intersections, key=lambda pt: segmenet1.project(pt)) return sorted_intersections def get_intersection_point_on_line(line, intersection): intersection_points = find_and_sort_intersections(LineString(line), intersection) if len(intersection_points) >= 2: line_intersect_start = intersection_points[0] line_intersect_end = intersection_points[-1] elif len(intersection_points) == 1: if intersection.contains(Point(line[0])): line_intersect_start = Point(line[0]) line_intersect_end = intersection_points[0] elif intersection.contains(Point(line[-1])): line_intersect_start = Point(line[-1]) line_intersect_end = intersection_points[0] else: return None, None else: return None, None return line_intersect_start, line_intersect_end def merge_l2_points_to_l1(line1, line2, line2_intersect_start, line2_intersect_end): # get nearest point on line2 to line2_intersect_start line2_point_to_merge = [] line2_intersect_start_dis = line2.project(line2_intersect_start) line2_intersect_end_dis = line2.project(line2_intersect_end) for point in np.array(line2.coords): point_geom = Point(point) dis = line2.project(point_geom) if dis > line2_intersect_start_dis and dis < line2_intersect_end_dis: line2_point_to_merge.append(point) # merged the points merged_line2_points = [] for point in line2_point_to_merge: # Use the `project` method to find the distance along the polyline to the closest point point_geom = Point(point) # Use the `interpolate` method to find the actual point on the polyline closest_point_on_line = find_nearest_projection_on_polyline(point_geom, line1) if len(closest_point_on_line) == 0: merged_line2_points.append(point) 
else: merged_line2_points.append(((closest_point_on_line + point) / 2)[0]) if len(merged_line2_points) == 0: merged_line2_points = np.array([]).reshape(0, 2) else: merged_line2_points = np.array(merged_line2_points) return merged_line2_points def segment_line_based_on_merged_area(line, merged_points): if len(merged_points) == 0: return np.array(line.coords), np.array([]).reshape(0, 2) first_merged_point = merged_points[0] last_merged_point = merged_points[-1] start_dis = line.project(Point(first_merged_point)) end_dis = line.project(Point(last_merged_point)) start_segmenet = [] for point in np.array(line.coords): point_geom = Point(point) if line.project(point_geom) < start_dis: start_segmenet.append(point) end_segmenet = [] for point in np.array(line.coords): point_geom = Point(point) if line.project(point_geom) > end_dis: end_segmenet.append(point) if len(start_segmenet) == 0: start_segmenet = np.array([]).reshape(0, 2) else: start_segmenet = np.array(start_segmenet) if len(end_segmenet) == 0: end_segmenet = np.array([]).reshape(0, 2) else: end_segmenet = np.array(end_segmenet) return start_segmenet, end_segmenet def get_bbox_size_for_points(points): if len(points) == 0: return 0, 0 # Initialize min and max coordinates with the first point min_x, min_y = points[0] max_x, max_y = points[0] # Iterate through each point to update min and max coordinates for x, y in points[1:]: min_x = min(min_x, x) min_y = min(min_y, y) max_x = max(max_x, x) max_y = max(max_y, y) return max_x - min_x, max_y - min_y def get_longer_segmenent_to_merged_points(l1_segment, l2_segment, merged_line2_points, segment_type="start"): # remove points from segments if it's too close to merged_line2_points l1_segment_temp = [] if len(merged_line2_points) > 1: merged_polyline = LineString(merged_line2_points) for point in l1_segment: if merged_polyline.distance(Point(point)) > 0.1: l1_segment_temp.append(point) elif len(merged_line2_points) == 1: for point in l1_segment: if Point(point).distance(Point(merged_line2_points[0])) > 0.1: l1_segment_temp.append(point) elif len(merged_line2_points) == 0: l1_segment_temp = l1_segment l1_segment = np.array(l1_segment_temp) l2_segmenet_temp = [] if len(merged_line2_points) > 1: merged_polyline = LineString(merged_line2_points) for point in l2_segment: if merged_polyline.distance(Point(point)) > 0.1: l2_segmenet_temp.append(point) elif len(merged_line2_points) == 1: for point in l2_segment: if Point(point).distance(Point(merged_line2_points[0])) > 0.1: l2_segmenet_temp.append(point) elif len(merged_line2_points) == 0: l2_segmenet_temp = l2_segment l2_segment = np.array(l2_segmenet_temp) if segment_type == "start": temp = l1_segment.tolist() if len(merged_line2_points) > 0: temp.append(merged_line2_points[0]) l1_start_box_size = get_bbox_size_for_points(temp) temp = l2_segment.tolist() if len(merged_line2_points) > 0: temp.append(merged_line2_points[0]) l2_start_box_size = get_bbox_size_for_points(temp) if l2_start_box_size[0]*l2_start_box_size[1] >= l1_start_box_size[0]*l1_start_box_size[1]: longer_segment = l2_segment else: longer_segment = l1_segment else: temp = l1_segment.tolist() if len(merged_line2_points) > 0: temp.append(merged_line2_points[-1]) l1_end_box_size = get_bbox_size_for_points(temp) temp = l2_segment.tolist() if len(merged_line2_points) > 0: temp.append(merged_line2_points[-1]) l2_end_box_size = get_bbox_size_for_points(temp) if l2_end_box_size[0]*l2_end_box_size[1] >= l1_end_box_size[0]*l1_end_box_size[1]: longer_segment = l2_segment else: longer_segment = 
l1_segment if len(longer_segment) == 0: longer_segment = np.array([]).reshape(0, 2) else: longer_segment = np.array(longer_segment) return longer_segment def get_line_lineList_max_intersection(merged_lines, line, thickness=4): pre_line = merged_lines[-1] max_iou = 0 merged_line_index = 0 for line_index, one_merged_line in enumerate(merged_lines): line1 = LineString(one_merged_line) line2 = LineString(line) thick_line1 = line1.buffer(thickness) thick_line2 = line2.buffer(thickness) intersection = thick_line1.intersection(thick_line2) if intersection.area / thick_line2.area > max_iou: max_iou = intersection.area / thick_line2.area pre_line = np.array(line1.coords) merged_line_index = line_index return intersection, pre_line, merged_line_index def algin_l2_with_l1(line1, line2): if len(line1) > len(line2): l2_len = len(line2) line1_geom = LineString(line1) interval_length = line1_geom.length / (l2_len - 1) line1 = [np.array(line1_geom.interpolate(interval_length * i)) for i in range(l2_len)] elif len(line1) < len(line2): l1_len = len(line1) line2_geom = LineString(line2) interval_length = line2_geom.length / (l1_len - 1) line2 = [np.array(line2_geom.interpolate(interval_length * i)) for i in range(l1_len)] # make line1 and line2 same direction, pre_line.coords[0] shold be closer to line2.coords[0] line1_geom = LineString(line1) line2_flip = np.flip(line2, axis=0) line2_traj_len = 0 for point_idx, point in enumerate(line2): line2_traj_len += np.linalg.norm(point - line1[point_idx]) flip_line2_traj_len = 0 for point_idx, point in enumerate(line2_flip): flip_line2_traj_len += np.linalg.norm(point - line1[point_idx]) if abs(flip_line2_traj_len - line2_traj_len) < 3: # get the trajectory length line2_walk_len = 0 for point in line2: point_geom = Point(point) proj_point = find_nearest_projection_on_polyline(point_geom, line1_geom) if len(proj_point) != 0: line2_walk_len += line1_geom.project(Point(proj_point[0])) flip_line2_walk_len = 0 for point in line2: point_geom = Point(point) proj_point = find_nearest_projection_on_polyline(point_geom, line1_geom) if len(proj_point) != 0: flip_line2_walk_len += line1_geom.project(Point(proj_point[0])) if flip_line2_walk_len < line2_walk_len: return line2_flip else: return line2 if flip_line2_traj_len < line2_traj_len: return line2_flip else: return line2 def _is_u_shape(line, direction): assert direction in ['left', 'right'], 'Wrong direction argument {}'.format(direction) line_geom = LineString(line) length = line_geom.length mid_point = np.array(line_geom.interpolate(length / 2).coords)[0] start = line[0] end = line[-1] if direction == 'left': cond1 = mid_point[0] < start[0] and mid_point[0] < end[0] else: cond1 = mid_point[0] > start[0] and mid_point[0] > end[0] dist_start_end = np.sqrt((start[0] - end[0])**2 + (start[1]-end[1])**2) cond2 = length >= math.pi / 2 * dist_start_end return cond1 and cond2 def check_circle(pre_line, vec): # if the last line in merged_lines is a circle if np.linalg.norm(pre_line[0] - pre_line[-1]) == 0: return True # if the last line in merged_lines is almost a circle and the new line is close to the circle if np.linalg.norm(pre_line[0] - pre_line[-1]) < 0.1: vec_2_circle_distance = 0 for point in vec: vec_2_circle_distance += LineString(pre_line).distance(Point(point)) if vec_2_circle_distance < 3: return True return False def connect_polygon(merged_polyline, merged_lines): start_end_connect = [merged_polyline[0], merged_polyline[-1]] iou = [] length_ratio = [] for one_merged_line in merged_lines: line1 = 
LineString(one_merged_line) line2 = LineString(start_end_connect) thickness = 1 thick_line1 = line1.buffer(thickness) thick_line2 = line2.buffer(thickness) intersection = thick_line1.intersection(thick_line2) iou.append(intersection.area / thick_line2.area) length_ratio.append(line1.length / line2.length) if max(iou) > 0.95 and max(length_ratio) > 3.0: merged_polyline = np.concatenate((merged_polyline, [merged_polyline[0]]), axis=0) return merged_polyline def iou_merge_boundry(merged_lines, vec, thickness=1): # intersection : the intersection area between the new line and the line in the merged_lines; is a polygon intersection, pre_line, merged_line_index = get_line_lineList_max_intersection(merged_lines, vec, thickness) # corner case: check if the last line in merged_lines is a circle if check_circle(pre_line, vec): return merged_lines # Handle U-shape, the main corner case if _is_u_shape(pre_line, 'left'): if _is_u_shape(vec, 'right'): # Two u shapes with opposite directions, directly generate a polygon exterior polygon = find_largest_convex_hull([LineString(pre_line), LineString(vec)]) merged_lines[-1] = np.array(polygon.exterior.coords) return merged_lines elif not _is_u_shape(vec, 'left'): line_geom1 = LineString(pre_line) line1_dists = np.array([line_geom1.project(Point(x)) for x in pre_line]) split_mask = line1_dists > line_geom1.length / 2 split_1 = LineString(pre_line[~split_mask]) split_2 = LineString(pre_line[split_mask]) # get the projected distance np1 = np.array(nearest_points(split_1, Point(Point(pre_line[-1])))[0].coords)[0] np2 = np.array(nearest_points(split_2, Point(Point(pre_line[0])))[0].coords)[0] dist1 = np.linalg.norm(np1-pre_line[-1]) dist2 = np.linalg.norm(np2-pre_line[0]) dist = min(dist1, dist2) if dist < thickness: line_geom2 = LineString(vec) dist1 = line_geom2.distance(Point(pre_line[0])) dist2 = line_geom2.distance(Point(pre_line[-1])) pt = pre_line[0] if dist1 <= dist2 else pre_line[-1] if vec[0][0] > vec[1][0]: vec = np.array(vec[::-1]) line_geom2 = LineString(vec) proj_length = line_geom2.project(Point(pt)) l2_select_mask = np.array([line_geom2.project(Point(x)) > proj_length for x in vec]) selected_l2 = vec[l2_select_mask] merged_result = np.concatenate([pre_line[:-1, :], pt[None, ...], selected_l2], axis=0) merged_lines[-1] = merged_result return merged_lines # align the new line with the line in the merged_lines so that points on two lines are traversed in the same direction vec = algin_l2_with_l1(pre_line, vec) line1 = LineString(pre_line) line2 = LineString(vec) # get the intersection points between IOU area and two lines line1_intersect_start, line1_intersect_end = get_intersection_point_on_line(pre_line, intersection) line2_intersect_start, line2_intersect_end = get_intersection_point_on_line(vec, intersection) # If no intersection points are found, use the last point of the line1 and the first point of the line2 as the intersection points --> this is a corner case that we will connect the two lines head to tail directly if line1_intersect_start is None or line1_intersect_end is None or line2_intersect_start is None or line2_intersect_end is None: line1_intersect_start = Point(pre_line[-1]) line1_intersect_end = Point(pre_line[-1]) line2_intersect_start = Point(vec[0]) line2_intersect_end = Point(vec[0]) # merge the points on line2's intersection area towards line1 merged_line2_points = merge_l2_points_to_l1(line1, line2, line2_intersect_start, line2_intersect_end) # merge the points on line1's intersection area towards line2 merged_line1_points = 
merge_l2_points_to_l1(line2, line1, line1_intersect_start, line1_intersect_end) # segment the lines based on the merged points (intersection area); split the line in to start segment and merged segment and end segment l2_start_segment, l2_end_segment = segment_line_based_on_merged_area(line2, merged_line2_points) l1_start_segment, l1_end_segment = segment_line_based_on_merged_area(line1, merged_line1_points) # choose the longer segment between line1 and line2 to be the final start segment and end segment start_segment = get_longer_segmenent_to_merged_points(l1_start_segment, l2_start_segment, merged_line2_points, segment_type="start") end_segment = get_longer_segmenent_to_merged_points(l1_end_segment, l2_end_segment, merged_line2_points, segment_type="end") merged_polyline = np.concatenate((start_segment, merged_line2_points, end_segment), axis=0) # corner case : check if need to connect the polyline to form a circle merged_polyline = connect_polygon(merged_polyline, merged_lines) merged_lines[merged_line_index] = merged_polyline return merged_lines def iou_merge_divider(merged_lines, vec, thickness=1): # intersection : the intersection area between the new line and the line in the merged_lines; is a polygon # pre_line : the line in merged_lines that has max IOU with the new line intersection, pre_line, merged_line_index = get_line_lineList_max_intersection(merged_lines, vec, thickness) # align the new line with the line in the merged_lines so that points on two lines are traversed in the same direction vec = algin_l2_with_l1(pre_line, vec) line1 = LineString(pre_line) line2 = LineString(vec) # get the intersection points between IOU area and two lines line1_intersect_start, line1_intersect_end = get_intersection_point_on_line(pre_line, intersection) line2_intersect_start, line2_intersect_end = get_intersection_point_on_line(vec, intersection) # If no intersection points are found, use the last point of the line1 and the first point of the line2 as the intersection points --> this is a corner case that we will connect the two lines head to tail directly if line1_intersect_start is None or line1_intersect_end is None or line2_intersect_start is None or line2_intersect_end is None: line1_intersect_start = Point(pre_line[-1]) line1_intersect_end = Point(pre_line[-1]) line2_intersect_start = Point(vec[0]) line2_intersect_end = Point(vec[0]) # merge the points on line2's intersection area towards line1 merged_line2_points = merge_l2_points_to_l1(line1, line2, line2_intersect_start, line2_intersect_end) # merge the points on line1's intersection area towards line2 merged_line1_points = merge_l2_points_to_l1(line2, line1, line1_intersect_start, line1_intersect_end) # segment the lines based on the merged points (intersection area); split the line in to start segment and merged segment and end segment l2_start_segment, l2_end_segment = segment_line_based_on_merged_area(line2, merged_line2_points) l1_start_segment, l1_end_segment = segment_line_based_on_merged_area(line1, merged_line1_points) # choose the longer segment between line1 and line2 to be the final start segment and end segment start_segment = get_longer_segmenent_to_merged_points(l1_start_segment, l2_start_segment, merged_line2_points, segment_type="start") end_segment = get_longer_segmenent_to_merged_points(l1_end_segment, l2_end_segment, merged_line2_points, segment_type="end") merged_polyline = np.concatenate((start_segment, merged_line2_points, end_segment), axis=0) # update the merged_lines merged_lines[merged_line_index] = 
merged_polyline
    return merged_lines


def merge_divider(vecs=None, thickness=1):
    merged_lines = []
    for vec in vecs:
        # if merged_lines is empty, add the first line
        if len(merged_lines) == 0:
            merged_lines.append(vec)
            continue
        # thicken the new line and every merged line, then compute the max IOU
        # between the new line and the existing merged lines
        iou = []
        for one_merged_line in merged_lines:
            line1 = LineString(one_merged_line)
            line2 = LineString(vec)
            thick_line1 = line1.buffer(thickness)
            thick_line2 = line2.buffer(thickness)
            intersection = thick_line1.intersection(thick_line2)
            iou.append(intersection.area / thick_line2.area)
        # If the max IOU is 0, add the new line to merged_lines as a new instance
        if max(iou) == 0:
            merged_lines.append(vec)
        # Otherwise, merge the new line into the best-matching merged line
        else:
            merged_lines = iou_merge_divider(merged_lines, vec, thickness=thickness)
    return merged_lines


def merge_boundary(vecs=None, thickness=1, iou_threshold=0.95):
    merged_lines = []
    for vec in vecs:
        # if merged_lines is empty, add the first line
        if len(merged_lines) == 0:
            merged_lines.append(vec)
            continue
        # thicken the new line and every merged line, then compute the max IOU
        # between the new line and the existing merged lines
        iou = []
        for one_merged_line in merged_lines:
            line1 = LineString(one_merged_line)
            line2 = LineString(vec)
            thick_line1 = line1.buffer(thickness)
            thick_line2 = line2.buffer(thickness)
            intersection = thick_line1.intersection(thick_line2)
            iou.append(intersection.area / thick_line2.area)
        # If the max IOU is larger than the threshold, the new line is already covered; skip it
        if max(iou) > iou_threshold:
            continue
        # If the IOU is not 0, merge the new line with the best-matching merged line,
        # otherwise keep it as a new instance
        if max(iou) > 0:
            merged_lines = iou_merge_boundry(merged_lines, vec, thickness=thickness)
        else:
            merged_lines.append(vec)
    return merged_lines


def get_consecutive_vectors_with_opt(prev_vectors=None, prev2curr_matrix=None, origin=None, roi_size=None, denormalize=False, clip=False):
    # transform prev vectors
    prev2curr_vectors = dict()
    for label, vecs in prev_vectors.items():
        if len(vecs) > 0:
            vecs = np.stack(vecs, 0)
            vecs = torch.tensor(vecs)
            N, num_points, _ = vecs.shape
            if denormalize:
                denormed_vecs = vecs * roi_size + origin  # (num_prop, num_pts, 2)
            else:
                denormed_vecs = vecs
            denormed_vecs = torch.cat([
                denormed_vecs,
                denormed_vecs.new_zeros((N, num_points, 1)),  # z-axis
                denormed_vecs.new_ones((N, num_points, 1))  # 4-th dim
            ], dim=-1)  # (num_prop, num_pts, 4)
            transformed_vecs = torch.einsum('lk,ijk->ijl', prev2curr_matrix, denormed_vecs.double()).float()
            normed_vecs = (transformed_vecs[..., :2] - origin) / roi_size  # (num_prop, num_pts, 2)
            if clip:
                normed_vecs = torch.clip(normed_vecs, min=0., max=1.)
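            # `normed_vecs` now holds the previous-frame vectors expressed in the current
            # ego frame (via the homogeneous prev2curr_matrix transform above) and
            # re-normalized to [0, 1] over the ROI; store them per label below.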
            prev2curr_vectors[label] = normed_vecs
        else:
            prev2curr_vectors[label] = vecs
    # convert to ego space for visualization
    for label in prev2curr_vectors:
        if len(prev2curr_vectors[label]) > 0:
            prev2curr_vectors[label] = prev2curr_vectors[label] * roi_size + origin
    return prev2curr_vectors


def get_prev2curr_vectors(vecs=None, prev2curr_matrix=None, origin=None, roi_size=None, denormalize=False, clip=False):
    # transform prev vectors
    if len(vecs) > 0:
        vecs = np.stack(vecs, 0)
        vecs = torch.tensor(vecs)
        N, num_points, _ = vecs.shape
        if denormalize:
            denormed_vecs = vecs * roi_size + origin  # (num_prop, num_pts, 2)
        else:
            denormed_vecs = vecs
        denormed_vecs = torch.cat([
            denormed_vecs,
            denormed_vecs.new_zeros((N, num_points, 1)),  # z-axis
            denormed_vecs.new_ones((N, num_points, 1))  # 4-th dim
        ], dim=-1)  # (num_prop, num_pts, 4)
        transformed_vecs = torch.einsum('lk,ijk->ijl', prev2curr_matrix, denormed_vecs.double()).float()
        vecs = (transformed_vecs[..., :2] - origin) / roi_size  # (num_prop, num_pts, 2)
        if clip:
            vecs = torch.clip(vecs, min=0., max=1.)
    # vecs = vecs * roi_size + origin
    return vecs


def plot_fig_merged_per_frame(num_frames, car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args):
    os.makedirs(pred_save_folder, exist_ok=True)
    # keep the current status of each instance; an instance is added to the dict when it first appears
    instance_bank = dict()
    # trace the path reversely, get the sub-sampled traj for visualizing the car
    pre_center = car_trajectory[-1][0]
    selected_traj_timesteps = []
    for timestep, (car_center, rotation_degrees) in enumerate(car_trajectory[::-1]):
        if np.linalg.norm(car_center - pre_center) < 5 and timestep > 0 and timestep < len(car_trajectory)-1:
            continue
        selected_traj_timesteps.append(len(car_trajectory)-1-timestep)
        pre_center = car_center
    selected_traj_timesteps = selected_traj_timesteps[::-1]
    image_list = [pred_save_folder + f'/{frame_timestep}.png' for frame_timestep in range(num_frames)]
    #save_t(len(image_list), pred_save_folder) # save the timestep text mp4 file
    # plot the figure at each frame
    for frame_timestep in range(num_frames):
        fig = plt.figure(figsize=(int(abs(x_min) + abs(x_max)) + 10, int(abs(y_min) + abs(y_max)) + 10))
        ax = fig.add_subplot(1, 1, 1)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        # setup the figure with car
        car_img = Image.open('resources/car-orange.png')
        faded_rate = np.linspace(0.2, 1, num=len(car_trajectory))
        pre_center = car_trajectory[0][0]
        for t in selected_traj_timesteps:  # only plot the car at the selected timesteps
            if t > frame_timestep:  # if the car has not appeared at this frame
                break
            car_center, rotation_degrees = car_trajectory[t]
            translation = transforms.Affine2D().translate(car_center[0], car_center[1])
            rotation = transforms.Affine2D().rotate_deg(rotation_degrees)
            rotation_translation = rotation + translation
            ax.imshow(car_img, extent=[-2.2, 2.2, -2, 2], transform=rotation_translation + ax.transData, alpha=faded_rate[t])
        for vec_tag, vec_all_frames in id_prev2curr_pred_vectors.items():
            vec_frame_info = id_prev2curr_pred_frame[vec_tag]
            first_appear_frame = sorted(list(vec_frame_info.keys()))[0]
            need_merge = False
            if frame_timestep < first_appear_frame:  # the instance has not appeared yet
                continue
            elif frame_timestep in vec_frame_info:
                need_merge = True
                vec_index_in_instance = vec_frame_info[frame_timestep]
            label, vec_glb_idx = vec_tag.split('_')
            label = int(label)
            vec_glb_idx = int(vec_glb_idx)
            if need_merge:
                curr_vec = 
vec_all_frames[vec_index_in_instance] curr_vec_polyline = LineString(curr_vec) if vec_tag not in instance_bank: # if the instance first appears polylines = [curr_vec_polyline,] else: # if the instance has appeared before, polylines = previous merged polyline + current polyline polylines = instance_bank[vec_tag] + [curr_vec_polyline,] else: # if the instance has not appeared in this frame polylines = instance_bank[vec_tag] if label == 0: # ped_crossing color = 'b' elif label == 1: # divider color = 'r' elif label == 2: # boundary color = 'g' if label == 0: # crossing, merged by convex hull if need_merge: polygon = merge_corssing(polylines) polygon = polygon.simplify(args.simplify) vector = np.array(polygon.exterior.coords) else: # if no new instance, use the previous merged polyline to plot vector = np.array(polylines[0].coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) # update instance bank for ped updated_polyline = LineString(vector) instance_bank[vec_tag] = [updated_polyline, ] elif label == 1: # divider, merged fitting a polyline if need_merge: polylines_vecs = [np.array(one_line.coords) for one_line in polylines] polylines_vecs = merge_divider(polylines_vecs) else: # if no new instance, use the previous merged polyline to plot polylines_vecs = [np.array(line.coords) for line in polylines] for one_line in polylines_vecs: one_line = np.array(LineString(one_line).simplify(args.simplify*2).coords) pts = one_line[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) # update instance bank for line updated_polylines = [LineString(vec) for vec in polylines_vecs] instance_bank[vec_tag] = updated_polylines elif label == 2: # boundary, do not merge if need_merge: polylines_vecs = [np.array(one_line.coords) for one_line in polylines] polylines_vecs = merge_boundary(polylines_vecs) else: # if no new instance, use the previous merged polyline to plot polylines_vecs = [np.array(line.coords) for line in polylines] for one_line in polylines_vecs: one_line = np.array(LineString(one_line).simplify(args.simplify).coords) pts = one_line[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) # update instance bank for line updated_polylines = [LineString(vec) for vec in polylines_vecs] instance_bank[vec_tag] = updated_polylines pred_save_path = pred_save_folder + f'/{frame_timestep}.png' plt.grid(False) plt.savefig(pred_save_path, bbox_inches='tight', transparent=args.transparent, dpi=args.dpi) plt.clf() plt.close(fig) print("image saved to : ", pred_save_path) image_list = [pred_save_folder + f'/{frame_timestep}.png' for frame_timestep in range(num_frames)] gif_output_path = pred_save_folder + '/vis.gif' save_as_video(image_list, gif_output_path) # merge the vectors across all frames and plot the merged vectors def plot_fig_merged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args): # setup the figure with car fig = plt.figure(figsize=(int(abs(x_min) + abs(x_max)) + 10 , int(abs(y_min) + abs(y_max)) + 10)) ax = fig.add_subplot(1, 1, 1) ax.set_xlim(x_min, x_max) 
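    # the axes span the union of the ROI and all transformed vectors (x/y limits are
    # computed by the caller), and the figure size above follows that extent plus a margin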
ax.set_ylim(y_min, y_max) car_img = Image.open('resources/car-orange.png') faded_rate = np.linspace(0.2, 1, num=len(car_trajectory)) # trace the path reversely, get the sub-sampled traj for visualizing the car pre_center = car_trajectory[-1][0] selected_traj = [] selected_timesteps = [] for timestep, (car_center, rotation_degrees) in enumerate(car_trajectory[::-1]): if np.linalg.norm(car_center - pre_center) < 5 and timestep > 0 and timestep < len(car_trajectory)-1: continue selected_traj.append([car_center, rotation_degrees]) selected_timesteps.append(len(car_trajectory)-1-timestep) pre_center = car_center selected_traj = selected_traj[::-1] selected_timesteps = selected_timesteps[::-1] for selected_t, (car_center, rotation_degrees) in zip(selected_timesteps, selected_traj): translation = transforms.Affine2D().translate(car_center[0], car_center[1]) rotation = transforms.Affine2D().rotate_deg(rotation_degrees) rotation_translation = rotation + translation ax.imshow(car_img, extent=[-2.2, 2.2, -2, 2], transform=rotation_translation+ ax.transData, alpha=faded_rate[selected_t]) # merge the vectors across all frames for tag, vecs in id_prev2curr_pred_vectors.items(): label, vec_glb_idx = tag.split('_') label = int(label) vec_glb_idx = int(vec_glb_idx) if label == 0: # ped_crossing color = 'b' elif label == 1: # divider color = 'r' elif label == 2: # boundary color = 'g' # get the vectors belongs to the same instance polylines = [] for vec in vecs: polylines.append(LineString(vec)) if len(polylines) <= 0: continue if label == 0: # crossing, merged by convex hull polygon = merge_corssing(polylines) if polygon.area < 2: continue polygon = polygon.simplify(args.simplify) vector = np.array(polygon.exterior.coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) elif label == 1: # divider, merged by interpolation polylines_vecs = [np.array(one_line.coords) for one_line in polylines] polylines_vecs = merge_divider(polylines_vecs) for one_line in polylines_vecs: one_line = np.array(LineString(one_line).simplify(args.simplify).coords) pts = one_line[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) elif label == 2: # boundary, merged by interpolation polylines_vecs = [np.array(one_line.coords) for one_line in polylines] polylines_vecs = merge_boundary(polylines_vecs) for one_line in polylines_vecs: one_line = np.array(LineString(one_line).simplify(args.simplify).coords) pts = one_line[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, '-', color=color, linewidth=20, markersize=50, alpha=args.line_opacity) ax.plot(x, y, "o", color=color, markersize=50) plt.grid(False) plt.savefig(pred_save_path, bbox_inches='tight', transparent=args.transparent, dpi=args.dpi) plt.clf() plt.close(fig) print("image saved to : ", pred_save_path) def plot_fig_unmerged_per_frame(num_frames, car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args): os.makedirs(pred_save_folder, exist_ok=True) # trace the path reversely, get the sub-sampled traj for visualizing the car pre_center = car_trajectory[-1][0] selected_traj_timesteps = [] for timestep, (car_center, rotation_degrees) in 
enumerate(car_trajectory[::-1]): if np.linalg.norm(car_center - pre_center) < 5 and timestep > 0 and timestep < len(car_trajectory)-1: continue selected_traj_timesteps.append(len(car_trajectory)-1-timestep) pre_center = car_center selected_traj_timesteps = selected_traj_timesteps[::-1] # setup the figure with car fig = plt.figure(figsize=(int(abs(x_min) + abs(x_max)) + 10 , int(abs(y_min) + abs(y_max)) + 10)) ax = fig.add_subplot(1, 1, 1) ax.set_xlim(x_min, x_max) ax.set_ylim(y_min, y_max) car_img = Image.open('resources/car-orange.png') for frame_timestep in range(num_frames): faded_rate = np.linspace(0.2, 1, num=len(car_trajectory)) if frame_timestep in selected_traj_timesteps: car_center, rotation_degrees = car_trajectory[frame_timestep] translation = transforms.Affine2D().translate(car_center[0], car_center[1]) rotation = transforms.Affine2D().rotate_deg(rotation_degrees) rotation_translation = rotation + translation ax.imshow(car_img, extent=[-2.2, 2.2, -2, 2], transform=rotation_translation+ ax.transData, alpha=faded_rate[frame_timestep]) # plot the vectors for vec_tag, vec_all_frames in id_prev2curr_pred_vectors.items(): vec_frame_info = id_prev2curr_pred_frame[vec_tag] if frame_timestep not in vec_frame_info: # the instance has not appeared continue else: vec_index_in_instance = vec_frame_info[frame_timestep] curr_vec = vec_all_frames[vec_index_in_instance] label, vec_glb_idx = vec_tag.split('_') label = int(label) vec_glb_idx = int(vec_glb_idx) if label == 0: # ped_crossing color = 'b' elif label == 1: # divider color = 'r' elif label == 2: # boundary color = 'g' polyline = LineString(curr_vec) vector = np.array(polyline.coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, 'o-', color=color, linewidth=20, markersize=50) pred_save_path = pred_save_folder + f'/{frame_timestep}.png' plt.savefig(pred_save_path, bbox_inches='tight', transparent=args.transparent, dpi=args.dpi) print("image saved to : ", pred_save_path) plt.grid(False) plt.clf() plt.close(fig) image_list = [pred_save_folder + f'/{frame_timestep}.png' for frame_timestep in range(num_frames)] gif_output_path = pred_save_folder + '/vis.gif' save_as_video(image_list, gif_output_path) def plot_fig_unmerged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args): # setup the figure with car fig = plt.figure(figsize=(int(abs(x_min) + abs(x_max)) + 10 , int(abs(y_min) + abs(y_max)) + 10)) ax = fig.add_subplot(1, 1, 1) ax.set_xlim(x_min, x_max) ax.set_ylim(y_min, y_max) car_img = Image.open('resources/car-orange.png') # trace the path reversely, get the sub-sampled traj for visualizing the car pre_center = car_trajectory[-1][0] selected_traj = [] selected_timesteps = [] for timestep, (car_center, rotation_degrees) in enumerate(car_trajectory[::-1]): if np.linalg.norm(car_center - pre_center) < 5 and timestep > 0 and timestep < len(car_trajectory)-1: continue selected_traj.append([car_center, rotation_degrees]) selected_timesteps.append(len(car_trajectory)-1-timestep) pre_center = car_center selected_traj = selected_traj[::-1] selected_timesteps = selected_timesteps[::-1] # plot the car trajectory with faded_rate faded_rate = np.linspace(0.2, 1, num=len(car_trajectory)) for selected_t, (car_center, rotation_degrees) in zip(selected_timesteps, selected_traj): translation = transforms.Affine2D().translate(car_center[0], car_center[1]) rotation = transforms.Affine2D().rotate_deg(rotation_degrees) rotation_translation = rotation + 
translation ax.imshow(car_img, extent=[-2.2, 2.2, -2, 2], transform=rotation_translation+ ax.transData, alpha=faded_rate[selected_t]) # plot the unmerged vectors (all the predicted/ gt vectors) for tag, vecs in id_prev2curr_pred_vectors.items(): label, vec_glb_idx = tag.split('_') label = int(label) vec_glb_idx = int(vec_glb_idx) if label == 0: # ped_crossing color = 'b' elif label == 1: # divider color = 'r' elif label == 2: # boundary color = 'g' polylines = [] for vec in vecs: polylines.append(LineString(vec)) if len(polylines) <= 0: continue for one_line in polylines: vector = np.array(one_line.coords) pts = vector[:, :2] x = np.array([pt[0] for pt in pts]) y = np.array([pt[1] for pt in pts]) ax.plot(x, y, 'o-', color=color, linewidth=20, markersize=50) plt.savefig(pred_save_path, bbox_inches='tight', transparent=args.transparent, dpi=args.dpi) plt.clf() plt.close(fig) print("image saved to : ", pred_save_path) # the timestep text visualization def save_t(t_max, main_save_folder): txt_save_folder = os.path.join(main_save_folder, 'txt') os.makedirs(txt_save_folder, exist_ok=True) t = range(t_max) for i in t: fig, ax = plt.subplots(figsize=(2, 1), dpi=300) # Increase DPI for higher resolution ax.text(0.1, 0.5, f't = {i}', fontsize=40,ha='left', va='center') ax.axis('off') ax.set_xlim(0, 1) ax.set_ylim(0, 1) fig.subplots_adjust(left=0, right=1, top=1, bottom=0) # Remove margins around the text plt.savefig(f'{txt_save_folder}/text_{i}.png',pad_inches=0) plt.close(fig) text_images = [f'{txt_save_folder}/text_{i}.png' for i in t] frames = [imageio.imread(img_path) for img_path in text_images] mp4_output_path = os.path.join(main_save_folder, 'text.mp4') imageio.mimsave(mp4_output_path, frames, fps=10) # fps controls the speed of the video print("mp4 saved to : ", mp4_output_path) def save_as_video(image_list, mp4_output_path, scale=None): mp4_output_path = mp4_output_path.replace('.gif','.mp4') images = [Image.fromarray(imageio.imread(img_path)).convert("RGBA") for img_path in image_list] if scale is not None: w, h = images[0].size images = [img.resize((int(w*scale), int(h*scale)), Image.Resampling.LANCZOS) for img in images] # images = [Image.new('RGBA', images[0].size, (255, 255, 255, 255))] + images try: imageio.mimsave(mp4_output_path, images, format='MP4',fps=10) except ValueError: # in case the shapes are not the same, have to manually adjust resized_images = [img.resize(images[0].size, Image.Resampling.LANCZOS) for img in images] print('Size not all the same, manually adjust...') imageio.mimsave(mp4_output_path, resized_images, format='MP4',fps=10) print("mp4 saved to : ", mp4_output_path) def vis_pred_data(scene_name="", pred_results=None, origin=None, roi_size=None, args=None): # get the item index of the scene index_list = [] for index in range(len(pred_results)): if pred_results[index]["scene_name"] == scene_name: index_list.append(index) car_trajectory = [] id_prev2curr_pred_vectors = defaultdict(list) id_prev2curr_pred_frame_info = defaultdict(list) id_prev2curr_pred_frame = defaultdict(list) # iterate through each frame last_index = index_list[-1] for index in index_list: vectors = np.array(pred_results[index]["vectors"]).reshape((len(np.array(pred_results[index]["vectors"])), 20, 2)) if abs(vectors.max()) <= 1: curr_vectors = vectors * roi_size + origin else: curr_vectors = vectors # get the transformation matrix of the last frame prev_e2g_trans = torch.tensor(pred_results[index]['meta']['ego2global_translation'], dtype=torch.float64) prev_e2g_rot = 
torch.tensor(pred_results[index]['meta']['ego2global_rotation'], dtype=torch.float64) curr_e2g_trans = torch.tensor(pred_results[last_index]['meta']['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = torch.tensor(pred_results[last_index]['meta']['ego2global_rotation'], dtype=torch.float64) prev_e2g_matrix = torch.eye(4, dtype=torch.float64) prev_e2g_matrix[:3, :3] = prev_e2g_rot prev_e2g_matrix[:3, 3] = prev_e2g_trans curr_g2e_matrix = torch.eye(4, dtype=torch.float64) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) prev2curr_matrix = curr_g2e_matrix @ prev_e2g_matrix prev2curr_pred_vectors = get_prev2curr_vectors(curr_vectors, prev2curr_matrix,origin,roi_size,False,False) prev2curr_pred_vectors = prev2curr_pred_vectors * roi_size + origin rotation_degrees = np.degrees(np.arctan2(prev2curr_matrix[:3, :3][1, 0], prev2curr_matrix[:3, :3][0, 0])) car_center = get_prev2curr_vectors(np.array((0,0)).reshape(1,1,2), prev2curr_matrix,origin,roi_size,False,False)* roi_size + origin car_trajectory.append([car_center.squeeze(), rotation_degrees]) for i, (label, vec_glb_idx) in enumerate(zip(pred_results[index]['labels'], pred_results[index]['global_ids'])): dict_key = "{}_{}".format(label, vec_glb_idx) id_prev2curr_pred_vectors[dict_key].append(prev2curr_pred_vectors[i]) id_prev2curr_pred_frame_info[dict_key].append([pred_results[index]["local_idx"], len(id_prev2curr_pred_frame[dict_key])]) for key, frame_info in id_prev2curr_pred_frame_info.items(): frame_localIdx = dict() for frame_time, local_index in frame_info: frame_localIdx[frame_time] = local_index id_prev2curr_pred_frame[key] = frame_localIdx # sort the id_prev2curr_pred_vectors id_prev2curr_pred_vectors = {key: id_prev2curr_pred_vectors[key] for key in sorted(id_prev2curr_pred_vectors)} # set the size of the image x_min = -roi_size[0] / 2 x_max = roi_size[0] / 2 y_min = -roi_size[1] / 2 y_max = roi_size[1] / 2 all_points = [] for vecs in id_prev2curr_pred_vectors.values(): points = np.concatenate(vecs, axis=0) all_points.append(points) all_points = np.concatenate(all_points, axis=0) x_min = min(x_min, all_points[:,0].min()) x_max = max(x_max, all_points[:,0].max()) y_min = min(y_min, all_points[:,1].min()) y_max = max(y_max, all_points[:,1].max()) scene_dir = os.path.join(args.out_dir, scene_name) os.makedirs(scene_dir,exist_ok=True) if args.per_frame_result: num_frames = len(index_list) pred_save_folder = os.path.join(scene_dir, f'pred_merged_per_frame') plot_fig_merged_per_frame(num_frames, car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args) pred_save_folder = os.path.join(scene_dir, f'pred_unmerged_per_frame') plot_fig_unmerged_per_frame(num_frames, car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args) pred_save_path = os.path.join(scene_dir, f'pred_unmerged.png') plot_fig_unmerged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args) pred_save_path = os.path.join(scene_dir, f'pred_merged.png') plot_fig_merged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args) comb_save_path = os.path.join(scene_dir, f'pred_comb.png') image_paths = [os.path.join(scene_dir, f'pred_merged.png'), os.path.join(scene_dir, f'pred_unmerged.png')] labels = ['Merged', 'Unmerged'] combine_images_with_labels(image_paths, labels, comb_save_path) print("image saved to : ", comb_save_path) def 
vis_gt_data(scene_name, args, dataset, gt_data, origin, roi_size): gt_info = gt_data[scene_name] gt_info_list = [] ids_info = [] # get the item index of the sample for index, one_idx in enumerate(gt_info["sample_ids"]): gt_info_list.append(dataset[one_idx]) ids_info.append(gt_info["instance_ids"][index]) car_trajectory = [] scene_dir = os.path.join(args.out_dir,scene_name) os.makedirs(scene_dir,exist_ok=True) # key : label, vec_glb_idx ; value : list of vectors in the last frame's coordinate id_prev2curr_pred_vectors = defaultdict(list) # dict to store some information of the vectors id_prev2curr_pred_frame_info = defaultdict(list) # key : label, vec_glb_idx ; value : {frame_time : idx of the vector; idx range from 0 to the number of vectors of the same instance } id_prev2curr_pred_frame = defaultdict(dict) scene_len = len(gt_info_list) for idx in range(scene_len): curr_vectors = dict() # denormalize the vectors for label, vecs in gt_info_list[idx]['vectors'].data.items(): if len(vecs) > 0: # if vecs != [] curr_vectors[label] = vecs * roi_size + origin else: curr_vectors[label] = vecs # get the transformation matrix of the last frame prev_e2g_trans = torch.tensor(gt_info_list[idx]['img_metas'].data['ego2global_translation'], dtype=torch.float64) prev_e2g_rot = torch.tensor(gt_info_list[idx]['img_metas'].data['ego2global_rotation'], dtype=torch.float64) curr_e2g_trans = torch.tensor(gt_info_list[scene_len-1]['img_metas'].data['ego2global_translation'], dtype=torch.float64) curr_e2g_rot = torch.tensor(gt_info_list[scene_len-1]['img_metas'].data['ego2global_rotation'], dtype=torch.float64) prev_e2g_matrix = torch.eye(4, dtype=torch.float64) prev_e2g_matrix[:3, :3] = prev_e2g_rot prev_e2g_matrix[:3, 3] = prev_e2g_trans curr_g2e_matrix = torch.eye(4, dtype=torch.float64) curr_g2e_matrix[:3, :3] = curr_e2g_rot.T curr_g2e_matrix[:3, 3] = -(curr_e2g_rot.T @ curr_e2g_trans) # get the transformed vectors from current frame to the last frame prev2curr_matrix = curr_g2e_matrix @ prev_e2g_matrix prev2curr_pred_vectors = get_consecutive_vectors_with_opt(curr_vectors,prev2curr_matrix,origin,roi_size,False,False) for label, id_info in ids_info[idx].items(): for vec_local_idx, vec_glb_idx in id_info.items(): dict_key = "{}_{}".format(label, vec_glb_idx) id_prev2curr_pred_vectors[dict_key].append(prev2curr_pred_vectors[label][vec_local_idx]) # gt_info_list[idx]["seq_info"].data[1] stores the frame time that the vector appears id_prev2curr_pred_frame_info[dict_key].append([gt_info_list[idx]["seq_info"].data[1], len(id_prev2curr_pred_frame[dict_key])]) # set len(id_prev2curr_pred_frame[dict_key]) to be the index of the vector belongs to the same instance for key, frame_info in id_prev2curr_pred_frame_info.items(): frame_localIdx = dict() for frame_time, local_index in frame_info: frame_localIdx[frame_time] = local_index id_prev2curr_pred_frame[key] = frame_localIdx rotation_degrees = np.degrees(np.arctan2(prev2curr_matrix[:3, :3][1, 0], prev2curr_matrix[:3, :3][0, 0])) # get the center of the car in the last frame's coordinate car_center = get_prev2curr_vectors(np.array((0,0)).reshape(1,1,2), prev2curr_matrix,origin,roi_size,False,False)* roi_size + origin car_trajectory.append([car_center.squeeze(), rotation_degrees]) # sort the id_prev2curr_pred_vectors by label and vec_glb_idx id_prev2curr_pred_vectors = {key: id_prev2curr_pred_vectors[key] for key in sorted(id_prev2curr_pred_vectors)} # get the x_min, x_max, y_min, y_max for the figure size x_min = -roi_size[0] / 2 x_max = roi_size[0] / 2 y_min = 
-roi_size[1] / 2 y_max = roi_size[1] / 2 all_points = [] for vecs in id_prev2curr_pred_vectors.values(): points = np.concatenate(vecs, axis=0) all_points.append(points) all_points = np.concatenate(all_points, axis=0) x_min = min(x_min, all_points[:,0].min()) x_max = max(x_max, all_points[:,0].max()) y_min = min(y_min, all_points[:,1].min()) y_max = max(y_max, all_points[:,1].max()) scene_dir = os.path.join(args.out_dir,scene_name) os.makedirs(scene_dir,exist_ok=True) # if visulize the per frame result if args.per_frame_result: pred_save_folder = os.path.join(scene_dir, f'gt_merged_per_frame') plot_fig_merged_per_frame(len(gt_info_list), car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args) pred_save_folder = os.path.join(scene_dir, f'gt_unmerged_per_frame') plot_fig_unmerged_per_frame(len(gt_info_list), car_trajectory, x_min, x_max, y_min, y_max, pred_save_folder, id_prev2curr_pred_vectors, id_prev2curr_pred_frame, args) # plot result for across all frames pred_save_path = os.path.join(scene_dir, f'gt_unmerged.png') plot_fig_unmerged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args) pred_save_path = os.path.join(scene_dir, f'gt_merged.png') plot_fig_merged(car_trajectory, x_min, x_max, y_min, y_max, pred_save_path, id_prev2curr_pred_vectors, args) # combine the merged and unmerged images into one plot for comparison comb_save_path = os.path.join(scene_dir, f'gt_comb.png') image_paths = [os.path.join(scene_dir, f'gt_merged.png'), os.path.join(scene_dir, f'gt_unmerged.png')] labels = ['Merged', 'Unmerged'] combine_images_with_labels(image_paths, labels, comb_save_path) print("image saved to : ", comb_save_path) def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) dataset = build_dataset(cfg.match_config) scene_name2idx = {} scene_name2token = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2token[scene] = [] scene_name2idx[scene].append(idx) # load the GT data if args.option == "vis-gt": data = mmcv.load(args.data_path) # load the prediction data elif args.option == "vis-pred": with open(args.data_path,'rb') as fp: data = pickle.load(fp) all_scene_names = sorted(list(scene_name2idx.keys())) roi_size = torch.tensor(cfg.roi_size).numpy() origin = torch.tensor(cfg.pc_range[:2]).numpy() for scene_name in all_scene_names: if args.scene_id is not None and scene_name not in args.scene_id: continue scene_dir = os.path.join(args.out_dir,scene_name) if os.path.exists(scene_dir) and len(os.listdir(scene_dir)) > 0 and not args.overwrite: print(f"Scene {scene_name} already generated, skipping...") continue os.makedirs(scene_dir,exist_ok=True) if args.option == "vis-gt": # visualize the GT data vis_gt_data(scene_name=scene_name, args=args, dataset=dataset, gt_data=data, origin=origin, roi_size=roi_size) elif args.option == "vis-pred": # visualize the prediction results vis_pred_data(scene_name=scene_name, pred_results=data, origin=origin, roi_size=roi_size, args=args) else: raise ValueError('Invalid visualization option {}'.format(args.option)) if __name__ == '__main__': main() ================================================ FILE: tools/visualization/vis_per_frame.py ================================================ import sys import os SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.dirname(SCRIPT_DIR)) import argparse import mmcv from 
mmcv import Config
from mmdet3d.datasets import build_dataset
import torch
import numpy as np
from PIL import Image
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
import imageio

from tracking.cmap_utils.match_utils import *


def parse_args():
    parser = argparse.ArgumentParser(
        description='Visualize groundtruth and results')
    parser.add_argument('config', help='config file path')
    parser.add_argument(
        '--out_dir',
        required=True,
        default='demo',
        help='directory where visualization results will be saved')
    parser.add_argument(
        '--data_path',
        required=True,
        default="",
        help='path to the submission file')
    parser.add_argument(
        '--scene_id',
        type=str,
        nargs='+',
        default=None,
        help='scene id(s) to visualize')
    parser.add_argument(
        '--option',
        default="vis-gt",
        help='vis-gt or vis-pred')
    parser.add_argument(
        '--line_opacity',
        default=0.75,
        type=float,
        help='opacity of the plotted lines')
    parser.add_argument(
        '--overwrite',
        default=1,
        type=int,
        help='whether to overwrite the existing images')
    parser.add_argument(
        '--dpi',
        default=20,
        type=int,
        help='DPI of the saved figures')
    args = parser.parse_args()
    return args


def save_as_video(image_list, mp4_output_path, scale=None):
    mp4_output_path = mp4_output_path.replace('.gif', '.mp4')
    images = [Image.fromarray(img).convert("RGBA") for img in image_list]
    if scale is not None:
        w, h = images[0].size
        images = [img.resize((int(w*scale), int(h*scale)), Image.Resampling.LANCZOS) for img in images]
    images = [Image.new('RGBA', images[0].size, (255, 255, 255, 255))] + images
    try:
        imageio.mimsave(mp4_output_path, images, format='MP4', fps=10)
    except ValueError:
        # in case the shapes are not all the same, have to manually adjust
        resized_images = [img.resize(images[0].size, Image.Resampling.LANCZOS) for img in images]
        print('Size not all the same, manually adjust...')
        imageio.mimsave(mp4_output_path, resized_images, format='MP4', fps=10)
    print("mp4 saved to : ", mp4_output_path)


def plot_one_frame_results(vectors, id_info, roi_size, scene_dir, args):
    # setup the figure with car
    plt.figure(figsize=(roi_size[0], roi_size[1]))
    plt.xlim(-roi_size[0] / 2, roi_size[0] / 2)
    plt.ylim(-roi_size[1] / 2, roi_size[1] / 2)
    plt.axis('off')
    plt.autoscale(False)
    car_img = Image.open('resources/car-orange.png')
    plt.imshow(car_img, extent=[-2.2, 2.2, -2, 2])
    for label, vecs in vectors.items():
        if label == 0:  # ped_crossing
            color = 'b'
            label_text = 'P'
        elif label == 1:  # divider
            color = 'r'
            label_text = 'D'
        elif label == 2:  # boundary
            color = 'g'
            label_text = 'B'
        if len(vecs) == 0:
            continue
        for vec_idx, vec in enumerate(vecs):
            pts = vec[:, :2]
            x = np.array([pt[0] for pt in pts])
            y = np.array([pt[1] for pt in pts])
            plt.plot(x, y, 'o-', color=color, linewidth=25, markersize=20, alpha=args.line_opacity)
            vec_id = id_info[label][vec_idx]
            mid_idx = len(x) // 2
            # Put instance id; prevent the text from changing the fig size...
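            # If the polyline midpoint lies within 2 units of the top/bottom ROI border
            # or 4 units of the left/right border, nudge the text position inward so the
            # instance label stays inside the fixed axes.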
if -roi_size[1]/2 <= y[mid_idx] < -roi_size[1]/2 + 2: text_y = y[mid_idx] + 2 elif roi_size[1]/2 - 2 < y[mid_idx] <= roi_size[1]/2: text_y = y[mid_idx] - 2 else: text_y = y[mid_idx] if -roi_size[0]/2 <= x[mid_idx] < -roi_size[0]/2 + 4: text_x = x[mid_idx] + 4 elif roi_size[0]/2 - 4 < x[mid_idx] <= roi_size[0]/2: text_x = x[mid_idx] - 4 else: text_x = x[mid_idx] plt.text(text_x, text_y, f'{label_text}{vec_id}', fontsize=80, color=color) save_path = os.path.join(scene_dir, 'temp.png') plt.savefig(save_path, bbox_inches='tight', transparent=False, dpi=args.dpi) plt.clf() plt.close() viz_image = imageio.imread(save_path) return viz_image def vis_pred_data(scene_name, args, pred_results, origin,roi_size): # get the item index of the scene scene_idx = defaultdict(list) for index in range(len(pred_results)): scene_idx[pred_results[index]["scene_name"]].append(index) index_list = scene_idx[scene_name] scene_dir = os.path.join(args.out_dir,scene_name) os.makedirs(scene_dir,exist_ok=True) g2l_id_mapping = dict() label_ins_counter = {0:0, 1:0, 2:0} all_viz_images = [] # iterate through each frame of the pred sequence for index in index_list: vectors = np.array(pred_results[index]["vectors"]).reshape((len(np.array(pred_results[index]["vectors"])), 20, 2)) # some results are normalized, some not... if np.abs(vectors).max() <= 1: vectors = vectors * roi_size + origin labels = np.array(pred_results[index]["labels"]) global_ids = np.array(pred_results[index]["global_ids"]) per_label_results = defaultdict(list) for ins_idx in range(len(vectors)): label = int(labels[ins_idx]) global_id = int(global_ids[ins_idx]) if global_id not in g2l_id_mapping: local_idx = label_ins_counter[label] g2l_id_mapping[global_id] = (label, local_idx) label_ins_counter[label] += 1 else: if label == g2l_id_mapping[global_id][0]: local_idx = g2l_id_mapping[global_id][1] else: # label changes for a tracked instance (can happen in our method) # need to update the global id info local_idx = label_ins_counter[label] g2l_id_mapping[global_id] = (label, local_idx) label_ins_counter[label] += 1 per_label_results[label].append([vectors[ins_idx], global_id, local_idx]) curr_vectors = defaultdict(list) id_info = dict() for label, results in per_label_results.items(): vec_results = [item[0] for item in results] global_ids = [item[1] for item in results] local_ids = [item[2] for item in results] curr_vectors[label] = np.stack(vec_results, axis=0) id_info[label] = {idx:ins_id for idx, ins_id in enumerate(local_ids)} viz_image = plot_one_frame_results(curr_vectors, id_info, roi_size, scene_dir, args) all_viz_images.append(viz_image) gif_path = os.path.join(scene_dir, 'per_frame_pred.gif') save_as_video(all_viz_images, gif_path) def vis_gt_data(scene_name, args, dataset, scene_name2idx, gt_data, origin, roi_size): gt_info = gt_data[scene_name] gt_info_list = [] ids_info = [] scene_dir = os.path.join(args.out_dir,scene_name) os.makedirs(scene_dir,exist_ok=True) for index, one_idx in enumerate(gt_info["sample_ids"]): gt_info_list.append(dataset[one_idx]) ids_info.append(gt_info["instance_ids"][index]) scene_len = len(gt_info_list) all_viz_images = [] all_cam_images = {cam_name: [] for cam_name in dataset.samples[0]['cams'].keys()} for frame_idx in range(scene_len): global_idx = scene_name2idx[scene_name][frame_idx] # collect images for each camera cams = dataset.samples[global_idx]['cams'] for cam, info in cams.items(): img = imageio.imread(info['img_fpath']) all_cam_images[cam].append(img) # collect vectors for each frame curr_vectors = dict() 
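        # denormalize the GT vectors of this frame from the normalized [0, 1] range back
        # to metric ego coordinates (roi_size and origin come from the config) before plotting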
for label, vecs in gt_info_list[frame_idx]['vectors'].data.items(): if len(vecs) > 0: curr_vectors[label] = vecs * roi_size + origin else: curr_vectors[label] = vecs id_info = ids_info[frame_idx] viz_image = plot_one_frame_results(curr_vectors, id_info, roi_size, scene_dir, args) all_viz_images.append(viz_image) gif_path = os.path.join(scene_dir, 'per_frame_gt.gif') save_as_video(all_viz_images, gif_path) for cam_name, image_list in all_cam_images.items(): gif_path = os.path.join(scene_dir, f'{cam_name}.gif') save_as_video(image_list, gif_path, scale=0.3) def main(): args = parse_args() cfg = Config.fromfile(args.config) import_plugin(cfg) dataset = build_dataset(cfg.match_config) scene_name2idx = {} scene_name2token = {} for idx, sample in enumerate(dataset.samples): scene = sample['scene_name'] if scene not in scene_name2idx: scene_name2idx[scene] = [] scene_name2token[scene] = [] scene_name2idx[scene].append(idx) if args.data_path == "": data = {} elif args.option == "vis-gt": # visulize GT option data = mmcv.load(args.data_path) elif args.option == "vis-pred": with open(args.data_path,'rb') as fp: data = pickle.load(fp) all_scene_names = sorted(list(scene_name2idx.keys())) scene_info_list = [] for single_scene_name in all_scene_names: scene_info_list.append((single_scene_name, args)) roi_size = torch.tensor(cfg.roi_size).numpy() origin = torch.tensor(cfg.pc_range[:2]).numpy() for scene_name in all_scene_names: if args.scene_id is not None and scene_name not in args.scene_id: continue scene_dir = os.path.join(args.out_dir,scene_name) if os.path.exists(scene_dir) and len(os.listdir(scene_dir)) > 0 and not args.overwrite: print(f"Scene {scene_name} already generated, skipping...") continue os.makedirs(scene_dir,exist_ok=True) if args.option == "vis-gt": vis_gt_data(scene_name=scene_name, args=args, dataset=dataset, scene_name2idx=scene_name2idx, gt_data=data,origin=origin,roi_size=roi_size) elif args.option == "vis-pred": vis_pred_data(scene_name=scene_name, args=args, pred_results=data, origin=origin, roi_size=roi_size) if __name__ == '__main__': main()
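
# Example invocation (a sketch for reference, not part of the original script; the config
# path, data path and output directory below are placeholders; substitute your own files):
#
#   python tools/visualization/vis_per_frame.py path/to/maptracker_config.py \
#       --out_dir demo/per_frame_vis \
#       --data_path path/to/pred_results.pkl \
#       --option vis-pred \
#       --scene_id <scene_name>
#
# With `--option vis-gt`, `--data_path` should instead point to the ground-truth pickle
# whose per-scene entries provide "sample_ids" and "instance_ids" (see vis_gt_data above).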